def check_diff(self):
    idx_list = [
        idx.id for idx in History.select(History.id)
        .order_by(History.id.desc())
        .limit(self.cfg.DiffCount())
    ]
    ip_diff_add_sql = IP.select(fn.Count(fn.Distinct(IP.ip)))\
        .join(Item).where(IP.add == idx_list[0]).scalar()
    ip_diff_purge_sql = IP.select(fn.Count(fn.Distinct(IP.ip)))\
        .join(Item).where(IP.purge == idx_list[0]).scalar()
    domain_diff_add_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
        .join(Item).where(Domain.add == idx_list[0]).scalar()
    domain_diff_purge_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
        .join(Item).where(Domain.purge == idx_list[0]).scalar()
    url_diff_add_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
        .join(Item).where(URL.add == idx_list[0]).scalar()
    url_diff_purge_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
        .join(Item).where(URL.purge == idx_list[0]).scalar()
    if ip_diff_add_sql or ip_diff_purge_sql or domain_diff_add_sql or \
            domain_diff_purge_sql or url_diff_add_sql or url_diff_purge_sql:
        History.update(dump=True).where(History.id == idx_list[0]).execute()
        return True
    else:
        # History.update(dump=False).where(History.id == idx_list[0]).execute()
        return False
def cleaner(self):
    logger.info('cleaner run')
    # history = History.select(History.id).order_by(History.id.desc()).limit(self.cfg.DiffCount())
    # Item.delete().where(~(Item.purge << history)).execute()
    history_del = History.select(History.id).order_by(History.id.desc()).offset(self.cfg.DiffCount())
    Item.delete().where(Item.purge << history_del).execute()
    IP.delete().where(IP.purge << history_del).execute()
    Domain.delete().where(Domain.purge << history_del).execute()
    URL.delete().where(URL.purge << history_del).execute()
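# The diff/cleanup helpers above assume a set of peewee models. A rough sketch of what those
# models might look like, with field names inferred from the queries above; the field types,
# defaults, and exact foreign-key layout are guesses, not the actual schema:

from peewee import (Model, BooleanField, CharField, ForeignKeyField, IntegerField)


class History(Model):
    dump = BooleanField(default=False)   # set by check_diff() when a dump produced changes


class Item(Model):
    content_id = IntegerField()
    blockType = CharField(default='default')
    add = ForeignKeyField(History)
    purge = ForeignKeyField(History, null=True)


class Domain(Model):
    item = ForeignKeyField(Item)
    content_id = IntegerField()
    domain = CharField()
    add = ForeignKeyField(History)
    purge = ForeignKeyField(History, null=True)


class IP(Model):
    item = ForeignKeyField(Item)
    content_id = IntegerField()
    ip = CharField()
    mask = IntegerField(default=32)
    add = ForeignKeyField(History)
    purge = ForeignKeyField(History, null=True)


class URL(Model):
    item = ForeignKeyField(Item)
    content_id = IntegerField()
    url = CharField()
    add = ForeignKeyField(History)
    purge = ForeignKeyField(History, null=True)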
def test_01_insert_init_data(self):
    """sample domain insertion test"""
    _domains = dict(domains)
    for name, url in _domains.items():
        d = Domain(domain=name, url=url)
        d.save()
    for d in Domain.get_by_filters():
        _domains.pop(d.domain, None)
    self.assertEqual(_domains, {})
def _domain_rollback_sql(self, rollback, bt):
    rb_list = self.idx_list[:rollback]
    if bt == 'ignore':
        domain_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(~(Domain.add << rb_list) &
                   ((Domain.purge >> None) | (Domain.purge << rb_list)))
        return domain_sql
    elif bt in ('ip', 'default', 'domain', 'domain-mask'):
        domain_sql = Domain.select(fn.Distinct(Domain.domain))\
            .join(Item).where((Item.blockType == bt) &
                              ~(Domain.add << rb_list) &
                              ((Domain.purge >> None) | (Domain.purge << rb_list)))
        return domain_sql
def _domain_diff_sql(self, diff, bt, stat):
    if stat and bt in ('ip', 'default', 'domain', 'domain-mask'):
        domain_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where(Item.blockType == bt, Domain.add == self.idx_list[diff])
        return domain_sql
    elif not stat and bt in ('ip', 'default', 'domain', 'domain-mask'):
        domain_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where(Item.blockType == bt, Domain.purge == self.idx_list[diff])
        return domain_sql
    elif stat and bt == 'ignore':
        domain_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(Domain.add == self.idx_list[diff])
        return domain_sql
    elif not stat and bt == 'ignore':
        domain_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(Domain.purge == self.idx_list[diff])
        return domain_sql
def cleaner(self):
    private_nets = [
        '0.%', '127.%', '192.168.%', '10.%',
        '172.16.%', '172.17.%', '172.18.%', '172.19.%', '172.20.%', '172.21.%',
        '172.22.%', '172.23.%', '172.24.%', '172.25.%', '172.26.%', '172.27.%',
        '172.28.%', '172.29.%', '172.30.%', '172.31.%'
    ]
    logger.info('Dump cleaner run')
    # history = History.select(History.id).order_by(History.id.desc()).limit(self.cfg.DiffCount())
    # Item.delete().where(~(Item.purge << history)).execute()
    history_clear = History.select(History.id).order_by(History.id.desc()).offset(self.cfg.DiffCount())
    item_del = Item.delete().where(Item.purge << history_clear).execute()
    logger.info('Item deleted: %d', item_del)
    ip_del = IP.delete().where(IP.purge << history_clear).execute()
    logger.info('IP deleted: %d', ip_del)
    domain_del = Domain.delete().where(Domain.purge << history_clear).execute()
    logger.info('Domain deleted: %d', domain_del)
    url_del = URL.delete().where(URL.purge << history_clear).execute()
    logger.info('URL deleted: %d', url_del)
    history_rm = History.select(History.id).order_by(History.id.desc()).offset(self.cfg.HistoryCount())
    hist_del = History.delete().where(History.id << history_rm).execute()
    logger.info('History deleted: %d', hist_del)
    for net in private_nets:
        ip_count = IP.delete().where(IP.ip % net).execute()
        if ip_count:
            logger.info('IP error LIKE %s, count %d', net, ip_count)
def test_03_regexp(self):
    """regexp process test"""
    d = Domain(domain='dummy', url='dummy')
    d.save()
    s = Snapshot(
        domain_id=d.id,
        pulled_at=datetime.utcnow(),
        html="find me with a regexp!!",
    )
    s.save()
    dc = DomainCheck(domain_id=d.id, name="dummy_check", regexp="find (me|you)")
    dc.save()
    run_regexp(snapshot_id=s.id)
    scd = SnapshotCheckData.get_by_filters(check_id=dc.id)[0]
    self.assertEqual(json.loads(scd.check_value), ["me"])
def test_02_collector(self):
    """collector process test"""
    done_snapshots = Snapshot.count_by_filters()
    for d in Domain.get_by_filters():
        ok = run_collector(domain_id=d.id)
        if ok:
            done_snapshots += 1
    total_snapshots = Snapshot.count_by_filters()
    self.assertEqual(done_snapshots, total_snapshots)
def run(self):
    """Main function for the per-file worker thread."""
    # It might be better to create a process rather than a thread per file, and to build a
    # multiprocessing setup that processes the line list instead of a single thread for all
    # of it (test under a big-file environment).
    # One JSON document per line.
    my_list = self.buf_file.splitlines()
    for line in my_list:
        # Skip empty lines.
        if line:
            json_line = json.loads(line)
            # Look for creative_size; if it does not exist, build it from ad_width and ad_height.
            creative_size = find_key("creative_size", json_line)
            if not creative_size:
                value_width = find_key("ad_width", json_line)
                value_height = find_key("ad_height", json_line)
                if value_width and value_height:
                    creative_size = [value_width[0] + "x" + value_height[0]]
            # Look for the Referer and page_url keys.
            referer = find_key("Referer", json_line)
            url = find_key("page_url", json_line)
            # If all three elements were found, insert them into the DB.
            if creative_size and referer and url:
                with transaction() as session:
                    added = False
                    # Check for an existing row before inserting a duplicate.
                    if not session.query(Domain).filter(Domain.url == url[0]).first():
                        session.add(Domain(url[0]))
                        added = True
                    if not session.query(Referer).filter(Referer.url == referer[0]).first():
                        session.add(Referer(referer[0]))
                        added = True
                    session.flush()
                    # If one of the previous tables got a new row, the Information row cannot
                    # exist yet, so no existence check is needed here.
                    if added:
                        session.add(Information(domain_url=url[0], referer_url=referer[0],
                                                creative_size=creative_size[0]))
                    elif not session.query(Information)\
                            .filter(Information.domain_url == url[0])\
                            .filter(Information.referer_url == referer[0])\
                            .filter(Information.creative_size == creative_size[0]).first():
                        session.add(Information(domain_url=url[0], referer_url=referer[0],
                                                creative_size=creative_size[0]))
    print("Database updated with information from file: %s" % self.f_name)
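# The comment above suggests moving from one thread per file to multiprocessing over the line
# list. A minimal sketch of that idea, assuming the per-line parsing is factored into a
# top-level process_line() function (hypothetical name) and DB writes stay in the parent
# process so sessions are never shared across processes:

import json
from multiprocessing import Pool


def process_line(line):
    """Parse one JSON line and return the decoded record, or None for empty lines."""
    if not line:
        return None
    return json.loads(line)
    # The key lookups (creative_size, Referer, page_url) would go here as above.


def process_buffer(buf_file, workers=4):
    """Fan per-line parsing out to a small process pool and collect the results."""
    with Pool(processes=workers) as pool:
        results = pool.map(process_line, buf_file.splitlines())
    return [r for r in results if r]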
def check_diff(self):
    idx_list = [
        idx.id for idx in History.select(History.id)
        .where(History.diff == True)
        .order_by(History.id.desc())
        .limit(self.cfg.DiffCount())
    ]
    ip_diff_add_sql = IP.select(fn.Count(fn.Distinct(IP.ip)))\
        .join(Item).where(IP.add == idx_list[0]).scalar()
    ip_diff_purge_sql = IP.select(fn.Count(fn.Distinct(IP.ip)))\
        .join(Item).where(IP.purge == idx_list[0]).scalar()
    domain_diff_add_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
        .join(Item).where(Domain.add == idx_list[0]).scalar()
    domain_diff_purge_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
        .join(Item).where(Domain.purge == idx_list[0]).scalar()
    url_diff_add_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
        .join(Item).where(URL.add == idx_list[0]).scalar()
    url_diff_purge_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
        .join(Item).where(URL.purge == idx_list[0]).scalar()
    if ip_diff_add_sql or ip_diff_purge_sql or domain_diff_add_sql or \
            domain_diff_purge_sql or url_diff_add_sql or url_diff_purge_sql:
        return True
    else:
        History.update(diff=False).where(History.id == idx_list[0]).execute()
        return False
def run(*args, domain_id='', **kwargs):
    """Collector execution: fetch a domain's HTML with an HTTP GET request.

    Keyword arguments:
    domain_id -- id of the domain that should be retrieved
    """
    logging.info("start collector run")
    domain = Domain.get_by_filters(id=domain_id)
    if domain:
        domain = domain[0]
    headers = {
        # custom headers
    }
    proxies = {
        # custom proxies
    }
    params = {
        # custom query string
    }
    res = None
    try:
        res = requests.get(
            domain.url,
            headers=headers,
            proxies=proxies,
            params=params,
            timeout=10,  # avoid hangs! -> https://requests.readthedocs.io/en/master/user/advanced/#timeouts
        )
    except Exception:
        # Bad things can happen with users' input and I/O operations;
        # a good error callback function would be useful here.
        logging.exception(f"Error fetching {domain.url}")
    snapshot_id = None
    if res:
        s = Snapshot(domain_id=domain_id, pulled_at=datetime.utcnow(), html=res.text)
        s.save()
        domain.last_snapshot_at = datetime.utcnow()
        domain.save()
        snapshot_id = s.id
        SnapshotMetadata(
            snapshot_id=s.id,
            domain_id=domain_id,
            request_time=res.elapsed.total_seconds(),
            request_status=res.status_code,
        ).save()
    return snapshot_id
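# A minimal usage sketch for the collector above, mirroring how the tests and the __main__
# harness drive it (assumes the Domain rows already exist; the loop and log messages here are
# illustrative only):

if __name__ == "__main__":
    for d in Domain.get_by_filters():
        snapshot_id = run(domain_id=d.id)
        if snapshot_id is None:
            logging.warning("no snapshot stored for %s", d.url)
        else:
            logging.info("stored snapshot %s for %s", snapshot_id, d.url)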
def message_handler(self, channel, method, properties, body):
    if self.verbose:
        logger.info("received %r" % (body,))
    message = json.loads(body)
    domain = Domain.normalize(message.get('domain'))
    response = message.get('response')
    query_time = message.get('time', time.time())
    # parse dns message if there was one
    if response:
        try:
            # the response arrives base64-encoded; decode it with the stdlib base64 module
            response = wdns.parse_message(base64.b64decode(response))
        except wdns.MessageParseException:
            response = None
def statistics_show(self, diff=0, stdout=False):
    date_time = datetime.fromtimestamp(int(Dump.get(Dump.param == 'lastDumpDate').value))\
        .strftime('%Y-%m-%d %H:%M:%S')
    message = 'vigruzki.rkn.gov.ru update: ' + date_time + '\n'

    url_add_sql = self._url_diff_sql(diff, 'ignore', 1)
    message += '\nURLs added: \n\n'
    for url_add in url_add_sql:
        message += url_add.url + '\n'

    ip_add_sql = self._ip_diff_sql(diff, 'ignore', 1)
    message += '\nIPs added: \n\n'
    for ip_add in ip_add_sql:
        if ip_add.mask < 32:
            message += ip_add.ip + '/' + str(ip_add.mask) + '\n'
        else:
            message += ip_add.ip + '\n'

    domain_add_sql = self._domain_diff_sql(diff, 'ignore', 1)
    message += '\nDOMAINs added: \n\n'
    for domain_add in domain_add_sql:
        message += domain_add.domain + '\n'

    url_del_sql = self._url_diff_sql(diff, 'ignore', 0)
    message += '\nURLs deleted: \n\n'
    for url_del in url_del_sql:
        message += url_del.url + '\n'

    ip_del_sql = self._ip_diff_sql(diff, 'ignore', 0)
    message += '\nIPs deleted: \n\n'
    for ip_del in ip_del_sql:
        if ip_del.mask < 32:
            message += ip_del.ip + '/' + str(ip_del.mask) + '\n'
        else:
            message += ip_del.ip + '\n'

    domain_del_sql = self._domain_diff_sql(diff, 'ignore', 0)
    message += '\nDOMAINs deleted: \n\n'
    for domain_del in domain_del_sql:
        message += domain_del.domain + '\n'

    rb_list = self.idx_list[:diff]
    domain_count = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
        .where(~(Domain.add << rb_list) &
               ((Domain.purge >> None) | (Domain.purge << rb_list))).scalar()
    url_count = URL.select(fn.Count(fn.Distinct(URL.url)))\
        .where(~(URL.add << rb_list) &
               ((URL.purge >> None) | (URL.purge << rb_list))).scalar()
    ip_count = IP.select(fn.Count(fn.Distinct(IP.ip)))\
        .where(~(IP.add << rb_list) &
               ((IP.purge >> None) | (IP.purge << rb_list))).scalar()
    id_count = Item.select(fn.Count(fn.Distinct(Item.content_id)))\
        .where(~(Item.add << rb_list) &
               ((Item.purge >> None) | (Item.purge << rb_list))).scalar()

    message += '\nURLs count: ' + str(url_count) + '\n'
    message += 'IPs count: ' + str(ip_count) + '\n'
    message += 'DOMAINs count: ' + str(domain_count) + '\n'
    message += 'Item count: ' + str(id_count) + '\n'

    if stdout:
        print(message)
        return False
    else:
        return message
def parse_dump(self):
    if not os.path.exists(self.path_py + '/dump.xml'):
        logger.info('dump.xml not found: %s', self.path_py + '/dump.xml')
        return 0
    logger.info('dump.xml already exists.')
    tree_xml = ElementTree().parse(self.path_py + '/dump.xml')

    dt = datetime.strptime(tree_xml.attrib['updateTime'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time).where(Dump.param == 'lastDumpDate').execute()
    logger.info('Got updateTime: %s.', update_time)

    dt = datetime.strptime(tree_xml.attrib['updateTimeUrgently'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time_urgently = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time_urgently).where(Dump.param == 'lastDumpDateUrgently').execute()
    logger.info('Got updateTimeUrgently: %s.', update_time_urgently)

    list_xml = tree_xml.findall(".//*[@id]")
    id_set_dump = set()
    id_set_db = set()
    for content_xml in list_xml:
        # print(content_xml.tag, content_xml.attrib, content_xml.text)
        id_set_dump.add(int(content_xml.attrib['id']))
    select_content_id_db = Item.select(Item.content_id).where(Item.purge >> None)
    for content_db in select_content_id_db:
        id_set_db.add(content_db.content_id)

    common_id_set = id_set_dump.intersection(id_set_db)
    delete_id_set = id_set_db.difference(common_id_set)
    add_id_set = id_set_dump.difference(common_id_set)
    # print(delete_id_set)
    # print(add_id_set)

    if len(delete_id_set) > 0:
        with self.transact.atomic():
            for del_item in delete_id_set:
                logger.info('Full delete Item, IP, Domain, URL id: %s.', del_item)
                Item.update(purge=self.code_id).where(Item.content_id == del_item,
                                                      Item.purge >> None).execute()
                Domain.update(purge=self.code_id).where(Domain.content_id == del_item,
                                                        Domain.purge >> None).execute()
                URL.update(purge=self.code_id).where(URL.content_id == del_item,
                                                     URL.purge >> None).execute()
                IP.update(purge=self.code_id).where(IP.content_id == del_item,
                                                    IP.purge >> None).execute()

    if len(add_id_set) > 0:
        include_time = str()
        urgency_type = int()
        entry_type = int()
        block_type = str()
        hash_value = str()
        with self.transact.atomic():
            for new_item in add_id_set:
                logger.info('New Item, IP, Domain, URL id: %s.', new_item)
                new_item_xml = tree_xml.find(".//content[@id='" + str(new_item) + "']")
                for data_xml in new_item_xml.iter():
                    if data_xml.tag == 'content':
                        content_id = int(data_xml.attrib['id'])
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        hash_value = data_xml.attrib['hash']
                    if data_xml.tag == 'decision':
                        decision_date = data_xml.attrib['date']
                        decision_number = data_xml.attrib['number']
                        decision_org = data_xml.attrib['org']
                        item_new = Item(content_id=content_id,
                                        includeTime=include_time,
                                        urgencyType=urgency_type,
                                        entryType=entry_type,
                                        blockType=block_type,
                                        hashRecord=hash_value,
                                        decision_date=decision_date,
                                        decision_num=decision_number,
                                        decision_org=decision_org,
                                        add=self.code_id)
                        item_new.save()
                    if data_xml.tag == 'url':
                        if not self.only_ascii(data_xml.text):
                            url_split = str(data_xml.text).split(':')
                            url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                        else:
                            url = data_xml.text
                        URL.create(item=item_new.id, content_id=content_id, url=url,
                                   add=self.code_id)
                    if data_xml.tag == 'domain':
                        if not self.only_ascii(data_xml.text):
                            domain = (str(data_xml.text).encode('idna')).decode()
                        else:
                            domain = data_xml.text
                        Domain.create(item=item_new.id, content_id=content_id, domain=domain,
                                      add=self.code_id)
                    if data_xml.tag == 'ip':
                        ip = data_xml.text
                        IP.create(item=item_new.id, content_id=content_id, ip=ip,
                                  add=self.code_id)
                    if data_xml.tag == 'ipSubnet':
                        net = data_xml.text.split('/')
                        ip = net[0]
                        mask = net[1]
                        IP.create(item=item_new.id, content_id=content_id, ip=ip, mask=mask,
                                  add=self.code_id)

    url_db_set = set()
    url_xml_set = set()
    ip_db_set = set()
    ip_xml_set = set()
    sub_ip_xml_set = set()
    sub_ip_db_set = set()
    domain_db_set = set()
    domain_xml_set = set()
    data_update = False
    with self.transact.atomic():
        for item_xml in list_xml:
            for data_xml in item_xml.iter():
                # print(data_xml.tag, data_xml.attrib, data_xml.text)
                if data_xml.tag == 'content':
                    content_id = int(data_xml.attrib['id'])
                    hash_value = data_xml.attrib['hash']
                    item_db = Item.get(Item.content_id == content_id, Item.purge >> None)
                    if hash_value != item_db.hashRecord:
                        logger.info('Hashes not equal, update hash id: %s', content_id)
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        item_db.hashRecord = hash_value
                        # Item.update(purge=None).where(Item.content_id == content_id).execute()
                        data_update = True
                    else:
                        data_update = False
                        break
                if data_xml.tag == 'decision':
                    decision_date = data_xml.attrib['date']
                    decision_number = data_xml.attrib['number']
                    decision_org = data_xml.attrib['org']
                    # print(item_db)
                    if str(item_db.includeTime) != include_time:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML includeTime: %s.', include_time)
                        logger.info('DB includeTime: %s.', item_db.includeTime)
                        item_db.includeTime = include_time
                        # Item.update(includeTime=include_time).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.urgencyType != urgency_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML urgencyType: %s.', urgency_type)
                        logger.info('DB urgencyType: %s.', item_db.urgencyType)
                        item_db.urgencyType = urgency_type
                        # Item.update(urgencyType=urgency_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.blockType != block_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML blockType: %s.', block_type)
                        logger.info('DB blockType: %s.', item_db.blockType)
                        item_db.blockType = block_type
                        # Item.update(blockType=block_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.entryType != entry_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML entryType: %s.', entry_type)
                        logger.info('DB entryType: %s.', item_db.entryType)
                        item_db.entryType = entry_type
                        # Item.update(entryType=entry_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if str(item_db.decision_date) != decision_date:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML date: %s.', decision_date)
                        logger.info('DB date: %s.', str(item_db.decision_date))
                        item_db.decision_date = decision_date
                        # Item.update(decision_date=decision_date).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.decision_num != decision_number:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML number: %s.', decision_number)
                        logger.info('DB number: %s.', item_db.decision_num)
                        item_db.decision_num = decision_number
                        # Item.update(decision_num=decision_number).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.decision_org != decision_org:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML org: %s.', decision_org)
                        logger.info('DB org: %s.', item_db.decision_org)
                        item_db.decision_org = decision_org
                        # Item.update(decision_org=decision_org).where(Item.content_id == content_id, Item.purge >> None).execute()
                if data_xml.tag == 'url':
                    if not self.only_ascii(data_xml.text):
                        url_split = str(data_xml.text).split(':')
                        url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                    else:
                        url = data_xml.text
                    url_xml_set.add(url)
                if data_xml.tag == 'domain':
                    if not self.only_ascii(data_xml.text):
                        domain = (str(data_xml.text).encode('idna')).decode()
                    else:
                        domain = data_xml.text
                    domain_xml_set.add(domain)
                if data_xml.tag == 'ip':
                    ip_xml_set.add(data_xml.text)
                if data_xml.tag == 'ipSubnet':
                    sub_ip_xml_set.add(data_xml.text)
            if data_update:
                url_db = URL.select().where(URL.item == item_db.id, URL.purge >> None)
                for url_item in url_db:
                    url_db_set.add(url_item.url)
                if url_db_set != url_xml_set:
                    common_url_set = url_xml_set.intersection(url_db_set)
                    delete_url_set = url_db_set.difference(common_url_set)
                    add_url_set = url_xml_set.difference(common_url_set)
                    if len(delete_url_set) > 0:
                        logger.info('Delete id %s URL: %s', content_id, delete_url_set)
                        for delete_url in delete_url_set:
                            URL.update(purge=self.code_id).where(URL.item == item_db.id,
                                                                 URL.url == delete_url,
                                                                 URL.purge >> None).execute()
                    if len(add_url_set) > 0:
                        logger.info('Add id %s URL: %s', content_id, add_url_set)
                        for add_url in add_url_set:
                            URL.create(item=item_db.id, content_id=item_db.content_id,
                                       url=add_url, add=self.code_id)
                url_db_set.clear()
                url_xml_set.clear()

                domain_db = Domain.select().where(Domain.item == item_db.id,
                                                  Domain.purge >> None)
                for domain_item in domain_db:
                    domain_db_set.add(domain_item.domain)
                if domain_db_set != domain_xml_set:
                    common_domain_set = domain_xml_set.intersection(domain_db_set)
                    delete_domain_set = domain_db_set.difference(common_domain_set)
                    add_domain_set = domain_xml_set.difference(common_domain_set)
                    if len(delete_domain_set) > 0:
                        logger.info('Delete id %s Domain: %s', content_id, delete_domain_set)
                        for delete_domain in delete_domain_set:
                            Domain.update(purge=self.code_id).where(Domain.item == item_db.id,
                                                                    Domain.domain == delete_domain,
                                                                    Domain.purge >> None).execute()
                    if len(add_domain_set) > 0:
                        logger.info('Add id %s Domain: %s', content_id, add_domain_set)
                        for add_domain in add_domain_set:
                            Domain.create(item=item_db.id, content_id=item_db.content_id,
                                          domain=add_domain, add=self.code_id)
                domain_db_set.clear()
                domain_xml_set.clear()

                ip_db = IP.select().where(IP.item == item_db.id, IP.mask == 32,
                                          IP.purge >> None)
                for ip_item in ip_db:
                    ip_db_set.add(ip_item.ip)
                if ip_db_set != ip_xml_set:
                    common_ip_set = ip_xml_set.intersection(ip_db_set)
                    delete_ip_set = ip_db_set.difference(common_ip_set)
                    add_ip_set = ip_xml_set.difference(common_ip_set)
                    if len(delete_ip_set) > 0:
                        logger.info('Delete id %s ip: %s', content_id, delete_ip_set)
                        for delete_ip in delete_ip_set:
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == delete_ip,
                                                                IP.mask == 32,
                                                                IP.purge >> None).execute()
                    if len(add_ip_set) > 0:
                        logger.info('Add id %s ip: %s', content_id, add_ip_set)
                        for add_ip in add_ip_set:
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, add=self.code_id)
                ip_db_set.clear()
                ip_xml_set.clear()

                sub_ip_db = IP.select().where(IP.item == item_db.id, IP.mask < 32,
                                              IP.purge >> None)
                for sub_ip_item in sub_ip_db:
                    sub_ip_db_set.add(str(sub_ip_item.ip) + '/' + str(sub_ip_item.mask))
                if sub_ip_db_set != sub_ip_xml_set:
                    common_sub_ip_set = sub_ip_xml_set.intersection(sub_ip_db_set)
                    delete_sub_ip_set = sub_ip_db_set.difference(common_sub_ip_set)
                    add_sub_ip_set = sub_ip_xml_set.difference(common_sub_ip_set)
                    if len(delete_sub_ip_set) > 0:
                        logger.info('Delete id %s subnet: %s', content_id, delete_sub_ip_set)
                        for delete_sub_ip in delete_sub_ip_set:
                            del_subnet = str(delete_sub_ip).split('/')
                            del_ip = del_subnet[0]
                            del_mask = del_subnet[1]
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == del_ip,
                                                                IP.mask == del_mask,
                                                                IP.purge >> None).execute()
                    if len(add_sub_ip_set) > 0:
                        logger.info('Add id %s subnet: %s', content_id, add_sub_ip_set)
                        for add_sub_ip in add_sub_ip_set:
                            add_subnet = str(add_sub_ip).split('/')
                            add_ip = add_subnet[0]
                            add_mask = add_subnet[1]
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, mask=add_mask, add=self.code_id)
                item_db.save()
                sub_ip_db_set.clear()
                sub_ip_xml_set.clear()

    if self.check_diff():
        self.cleaner()
        return 1
    else:
        logger.info('no updates')
        # print('no updates')
        return 2
def parse_dump(self):
    if not os.path.exists(self.path_py + '/dump.xml'):
        logger.info('dump.xml not found: %s', self.path_py + '/dump.xml')
        return 0
    logger.info('dump.xml already exists.')
    tree_xml = ElementTree().parse(self.path_py + '/dump.xml')

    dt = datetime.strptime(tree_xml.attrib['updateTime'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time).where(Dump.param == 'lastDumpDate').execute()
    logger.info('Got updateTime: %s.', update_time)

    dt = datetime.strptime(tree_xml.attrib['updateTimeUrgently'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time_urgently = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time_urgently).where(Dump.param == 'lastDumpDateUrgently').execute()
    logger.info('Got updateTimeUrgently: %s.', update_time_urgently)

    list_xml = tree_xml.findall(".//*[@id]")
    id_set_dump = set()
    id_set_db = set()
    for content_xml in list_xml:
        # print(content_xml.tag, content_xml.attrib, content_xml.text)
        id_set_dump.add(int(content_xml.attrib['id']))
    select_content_id_db = Item.select(Item.content_id).where(Item.purge >> None)
    for content_db in select_content_id_db:
        id_set_db.add(content_db.content_id)

    common_id_set = id_set_dump.intersection(id_set_db)
    delete_id_set = id_set_db.difference(common_id_set)
    add_id_set = id_set_dump.difference(common_id_set)
    # print(delete_id_set)
    # print(add_id_set)

    if len(delete_id_set) > 0:
        with self.transact.atomic():
            for del_item in delete_id_set:
                logger.info('Full delete Item, IP, Domain, URL id: %s.', del_item)
                Item.update(purge=self.code_id).where(Item.content_id == del_item,
                                                      Item.purge >> None).execute()
                Domain.update(purge=self.code_id).where(Domain.content_id == del_item,
                                                        Domain.purge >> None).execute()
                URL.update(purge=self.code_id).where(URL.content_id == del_item,
                                                     URL.purge >> None).execute()
                IP.update(purge=self.code_id).where(IP.content_id == del_item,
                                                    IP.purge >> None).execute()

    if len(add_id_set) > 0:
        include_time = str()
        urgency_type = int()
        entry_type = int()
        block_type = str()
        hash_value = str()
        with self.transact.atomic():
            for new_item in add_id_set:
                logger.info('New Item, IP, Domain, URL id: %s.', new_item)
                new_item_xml = tree_xml.find(".//content[@id='" + str(new_item) + "']")
                for data_xml in new_item_xml.iter():
                    if data_xml.tag == 'content':
                        content_id = int(data_xml.attrib['id'])
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        hash_value = data_xml.attrib['hash']
                    if data_xml.tag == 'decision':
                        decision_date = data_xml.attrib['date']
                        decision_number = data_xml.attrib['number']
                        decision_org = data_xml.attrib['org']
                        item_new = Item(content_id=content_id,
                                        includeTime=include_time,
                                        urgencyType=urgency_type,
                                        entryType=entry_type,
                                        blockType=block_type,
                                        hashRecord=hash_value,
                                        decision_date=decision_date,
                                        decision_num=decision_number,
                                        decision_org=decision_org,
                                        add=self.code_id)
                        item_new.save()
                    if data_xml.tag == 'url':
                        if not self.only_ascii(data_xml.text):
                            url_split = str(data_xml.text).split(':')
                            url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                        else:
                            url = data_xml.text
                        URL.create(item=item_new.id, content_id=content_id, url=url,
                                   add=self.code_id)
                    if data_xml.tag == 'domain':
                        if not self.only_ascii(data_xml.text):
                            domain = (str(data_xml.text).encode('idna')).decode()
                        else:
                            domain = data_xml.text
                        Domain.create(item=item_new.id, content_id=content_id, domain=domain,
                                      add=self.code_id)
                    if data_xml.tag == 'ip':
                        ip = data_xml.text
                        IP.create(item=item_new.id, content_id=content_id, ip=ip,
                                  add=self.code_id)
                    if data_xml.tag == 'ipSubnet':
                        net = data_xml.text.split('/')
                        ip = net[0]
                        mask = net[1]
                        IP.create(item=item_new.id, content_id=content_id, ip=ip, mask=mask,
                                  add=self.code_id)

    url_db_set = set()
    url_xml_set = set()
    ip_db_set = set()
    ip_xml_set = set()
    sub_ip_xml_set = set()
    sub_ip_db_set = set()
    domain_db_set = set()
    domain_xml_set = set()
    data_update = False
    with self.transact.atomic():
        for item_xml in list_xml:
            for data_xml in item_xml.iter():
                # print(data_xml.tag, data_xml.attrib, data_xml.text)
                if data_xml.tag == 'content':
                    content_id = int(data_xml.attrib['id'])
                    hash_value = data_xml.attrib['hash']
                    item_db = Item.get(Item.content_id == content_id, Item.purge >> None)
                    if hash_value != item_db.hashRecord:
                        logger.info('Hashes not equal, update hash id: %s', content_id)
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        item_db.hashRecord = hash_value
                        # Item.update(purge=None).where(Item.content_id == content_id).execute()
                        data_update = True
                    else:
                        data_update = False
                        break
                if data_xml.tag == 'decision':
                    decision_date = data_xml.attrib['date']
                    decision_number = data_xml.attrib['number']
                    decision_org = data_xml.attrib['org']
                    # print(item_db)
                    if str(item_db.includeTime) != include_time:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML includeTime: %s.', include_time)
                        logger.info('DB includeTime: %s.', item_db.includeTime)
                        item_db.includeTime = include_time
                        # Item.update(includeTime=include_time).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.urgencyType != urgency_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML urgencyType: %s.', urgency_type)
                        logger.info('DB urgencyType: %s.', item_db.urgencyType)
                        item_db.urgencyType = urgency_type
                        # Item.update(urgencyType=urgency_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.blockType != block_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML blockType: %s.', block_type)
                        logger.info('DB blockType: %s.', item_db.blockType)
                        item_db.blockType = block_type
                        # Item.update(blockType=block_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.entryType != entry_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML entryType: %s.', entry_type)
                        logger.info('DB entryType: %s.', item_db.entryType)
                        item_db.entryType = entry_type
                        # Item.update(entryType=entry_type).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if str(item_db.decision_date) != decision_date:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML date: %s.', decision_date)
                        logger.info('DB date: %s.', str(item_db.decision_date))
                        item_db.decision_date = decision_date
                        # Item.update(decision_date=decision_date).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.decision_num != decision_number:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML number: %s.', decision_number)
                        logger.info('DB number: %s.', item_db.decision_num)
                        item_db.decision_num = decision_number
                        # Item.update(decision_num=decision_number).where(Item.content_id == content_id, Item.purge >> None).execute()
                    if item_db.decision_org != decision_org:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML org: %s.', decision_org)
                        logger.info('DB org: %s.', item_db.decision_org)
                        item_db.decision_org = decision_org
                        # Item.update(decision_org=decision_org).where(Item.content_id == content_id, Item.purge >> None).execute()
                if data_xml.tag == 'url':
                    if not self.only_ascii(data_xml.text):
                        url_split = str(data_xml.text).split(':')
                        url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                    else:
                        url = data_xml.text
                    url_xml_set.add(url)
                if data_xml.tag == 'domain':
                    if not self.only_ascii(data_xml.text):
                        domain = (str(data_xml.text).encode('idna')).decode()
                    else:
                        domain = data_xml.text
                    domain_xml_set.add(domain)
                if data_xml.tag == 'ip':
                    ip_xml_set.add(data_xml.text)
                if data_xml.tag == 'ipSubnet':
                    sub_ip_xml_set.add(data_xml.text)
            if data_update:
                url_db = URL.select().where(URL.item == item_db.id, URL.purge >> None)
                for url_item in url_db:
                    url_db_set.add(url_item.url)
                if url_db_set != url_xml_set:
                    common_url_set = url_xml_set.intersection(url_db_set)
                    delete_url_set = url_db_set.difference(common_url_set)
                    add_url_set = url_xml_set.difference(common_url_set)
                    if len(delete_url_set) > 0:
                        logger.info('Delete id %s URL: %s', content_id, delete_url_set)
                        for delete_url in delete_url_set:
                            URL.update(purge=self.code_id).where(URL.item == item_db.id,
                                                                 URL.url == delete_url,
                                                                 URL.purge >> None).execute()
                    if len(add_url_set) > 0:
                        logger.info('Add id %s URL: %s', content_id, add_url_set)
                        for add_url in add_url_set:
                            URL.create(item=item_db.id, content_id=item_db.content_id,
                                       url=add_url, add=self.code_id)
                url_db_set.clear()
                url_xml_set.clear()

                domain_db = Domain.select().where(Domain.item == item_db.id,
                                                  Domain.purge >> None)
                for domain_item in domain_db:
                    domain_db_set.add(domain_item.domain)
                if domain_db_set != domain_xml_set:
                    common_domain_set = domain_xml_set.intersection(domain_db_set)
                    delete_domain_set = domain_db_set.difference(common_domain_set)
                    add_domain_set = domain_xml_set.difference(common_domain_set)
                    if len(delete_domain_set) > 0:
                        logger.info('Delete id %s Domain: %s', content_id, delete_domain_set)
                        for delete_domain in delete_domain_set:
                            Domain.update(purge=self.code_id).where(Domain.item == item_db.id,
                                                                    Domain.domain == delete_domain,
                                                                    Domain.purge >> None).execute()
                    if len(add_domain_set) > 0:
                        logger.info('Add id %s Domain: %s', content_id, add_domain_set)
                        for add_domain in add_domain_set:
                            Domain.create(item=item_db.id, content_id=item_db.content_id,
                                          domain=add_domain, add=self.code_id)
                domain_db_set.clear()
                domain_xml_set.clear()

                ip_db = IP.select().where(IP.item == item_db.id, IP.mask == 32,
                                          IP.purge >> None)
                for ip_item in ip_db:
                    ip_db_set.add(ip_item.ip)
                if ip_db_set != ip_xml_set:
                    common_ip_set = ip_xml_set.intersection(ip_db_set)
                    delete_ip_set = ip_db_set.difference(common_ip_set)
                    add_ip_set = ip_xml_set.difference(common_ip_set)
                    if len(delete_ip_set) > 0:
                        logger.info('Delete id %s ip: %s', content_id, delete_ip_set)
                        for delete_ip in delete_ip_set:
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == delete_ip,
                                                                IP.mask == 32,
                                                                IP.purge >> None).execute()
                    if len(add_ip_set) > 0:
                        logger.info('Add id %s ip: %s', content_id, add_ip_set)
                        for add_ip in add_ip_set:
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, add=self.code_id)
                ip_db_set.clear()
                ip_xml_set.clear()

                sub_ip_db = IP.select().where(IP.item == item_db.id, IP.mask < 32,
                                              IP.purge >> None)
                for sub_ip_item in sub_ip_db:
                    sub_ip_db_set.add(str(sub_ip_item.ip) + '/' + str(sub_ip_item.mask))
                if sub_ip_db_set != sub_ip_xml_set:
                    common_sub_ip_set = sub_ip_xml_set.intersection(sub_ip_db_set)
                    delete_sub_ip_set = sub_ip_db_set.difference(common_sub_ip_set)
                    add_sub_ip_set = sub_ip_xml_set.difference(common_sub_ip_set)
                    if len(delete_sub_ip_set) > 0:
                        logger.info('Delete id %s subnet: %s', content_id, delete_sub_ip_set)
                        for delete_sub_ip in delete_sub_ip_set:
                            del_subnet = str(delete_sub_ip).split('/')
                            del_ip = del_subnet[0]
                            del_mask = del_subnet[1]
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == del_ip,
                                                                IP.mask == del_mask,
                                                                IP.purge >> None).execute()
                    if len(add_sub_ip_set) > 0:
                        logger.info('Add id %s subnet: %s', content_id, add_sub_ip_set)
                        for add_sub_ip in add_sub_ip_set:
                            add_subnet = str(add_sub_ip).split('/')
                            add_ip = add_subnet[0]
                            add_mask = add_subnet[1]
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, mask=add_mask, add=self.code_id)
                item_db.save()
                sub_ip_db_set.clear()
                sub_ip_xml_set.clear()

    if self.check_diff():
        self.cleaner()
        return 1
    else:
        logger.info('no updates')
        self.cleaner()
        return 2
def _domain_dedup_sql(self, diff, bt, stat):
    rb_list_add = self.idx_list[:diff + 1]
    rb_list_purge = self.idx_list[:diff]
    if stat and bt == 'ignore':
        domain_diff_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(Domain.add == self.idx_list[diff])
        domain_dup_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(~(Domain.add << rb_list_add) &
                   ((Domain.purge >> None) | (Domain.purge << rb_list_add)) &
                   (Domain.domain << domain_diff_sql))
        domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where((Domain.add == self.idx_list[diff]) &
                   ~(Domain.domain << domain_dup_sql))
        return domain_dedup_sql
    elif not stat and bt == 'ignore':
        domain_diff_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(Domain.purge == self.idx_list[diff])
        domain_dup_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where(~(Domain.add << rb_list_purge) &
                   (Domain.purge >> None) &
                   (Domain.domain << domain_diff_sql))
        domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain))\
            .where((Domain.purge == self.idx_list[diff]) &
                   ~(Domain.domain << domain_dup_sql))
        return domain_dedup_sql
    elif stat and bt in ('ip', 'default', 'domain', 'domain-mask'):
        domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where(Item.blockType == bt, Domain.add == self.idx_list[diff])
        domain_dup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where((Item.blockType == bt) &
                   ~(Domain.add << rb_list_add) &
                   (Domain.purge >> None) &
                   (Domain.domain << domain_diff_sql))
        domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where((Item.blockType == bt) &
                   (Domain.add == self.idx_list[diff]) &
                   ~(Domain.domain << domain_dup_sql))
        return domain_dedup_sql
    elif not stat and bt in ('ip', 'default', 'domain', 'domain-mask'):
        domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where(Item.blockType == bt, Domain.purge == self.idx_list[diff])
        domain_dup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where((Item.blockType == bt) &
                   ~(Domain.add << rb_list_purge) &
                   (Domain.purge >> None) &
                   (Domain.domain << domain_diff_sql))
        domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
            .where((Item.blockType == bt) &
                   (Domain.purge == self.idx_list[diff]) &
                   ~(Domain.domain << domain_dup_sql))
        return domain_dedup_sql
if __name__ == "__main__": domains = { "helsinkitimes": "https://www.helsinkitimes.fi/", "berlin": "https://www.berlin.de/en/news/", "9news": "https://www.9news.com.au/sydney", "fail!": "fail://fail.com", } domain_checks = { "helsinkitimes": ["covid(\\d+) ", "govern(\\w+)"], } delete_tables() create_tables() domain_ids = [] for name, url in domains.items(): d = Domain(domain=name, url=url) d.save() domain_ids.append(d.id) for idx, regexp in enumerate(domain_checks.get(name, [])): DomainCheck( domain_id=d.id, name=f"{name}-{idx}", regexp=regexp, ).save() for x in range(n_pulls): for d_id in domain_ids: print(f"sending collector task for {d_id}") send_task(topic=collector_topic, domain_id=d_id) time.sleep(sleep_between_pulls)