def send_request(self):
    logger.info('Sending request.')
    request = self.session.sendRequest(self.cfg.XMLPathFName(),
                                       self.cfg.P7SPathFName())
    logger.info('Checking request status.')
    if request['result']:
        self.code = request['code']
        logger.info('Got code %s', self.code)
        Dump.update(value=self.code).where(Dump.param == 'lastCode').execute()
        Dump.update(value='sendRequest').where(Dump.param == 'lastAction').execute()
        Dump.update(value='Code').where(Dump.param == 'lastResult').execute()
        logger.info('Save code in History')
        History.create(requestCode=self.code,
                       date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        self.code_id = History.get(History.requestCode == self.code).id
        return self.code
    else:
        Dump.update(value='sendRequest').where(Dump.param == 'lastAction').execute()
        Dump.update(value='Error').where(Dump.param == 'lastResult').execute()
        logger.error(request['resultComment'])
        return False
def get_request(self):
    path_py = str(os.path.dirname(os.path.abspath(__file__)))
    logger.info('Waiting for 90 sec.')
    time.sleep(90)
    logger.info('Trying to get result...')
    request = self.session.getResult(self.code)
    Dump.update(value='getRequest').where(Dump.param == 'lastAction').execute()
    max_count = self.cfg.GetResultMaxCount()
    for count in range(1, max_count + 1):
        if request['result']:
            logger.info('Got a dump ver. %s for the %s (INN %s)',
                        request['dumpFormatVersion'],
                        request['operatorName'],
                        request['inn'])
            with open(path_py + '/result.zip', 'wb') as f:
                f.write(b64decode(request['registerZipArchive']))
            with open(path_py + '/result.zip', 'rb') as f:
                md5_hash = hashlib.md5(f.read()).hexdigest()
            logger.info('Downloaded dump %d bytes, MD5 hashsum: %s',
                        os.path.getsize(path_py + '/result.zip'), md5_hash)
            try:
                logger.info('Unpacking.')
                zip_file = zipfile.ZipFile(path_py + '/result.zip', 'r')
                zip_file.extract('dump.xml', path_py + '/')
                if self.cfg.DumpFileSave():
                    zip_file.extractall(path_py + '/dumps/%s' %
                                        datetime.now().strftime('%Y-%m-%d %H-%M-%S'))
                zip_file.close()
            except zipfile.BadZipfile:
                logger.error('Wrong file format.')
                Dump.update(value='Error').where(Dump.param == 'lastResult').execute()
                return False
            Dump.update(value='Ok').where(Dump.param == 'lastResult').execute()
            return True
        else:
            if not request['resultCode']:
                logger.info('Not ready yet. Waiting for a minute. Attempt number %s', count)
                time.sleep(60)
                # Fetch a fresh result before the next check of the loop.
                request = self.session.getResult(self.code)
            else:
                logger.error('Got an error, code %d: %s',
                             request['resultCode'], request['resultComment'])
                Dump.update(value='Error').where(Dump.param == 'lastResult').execute()
                return False
    Dump.update(value='Error').where(Dump.param == 'lastResult').execute()
    logger.info("Can't get result.")
    return False
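# Hedged usage sketch (not from the original module): how send_request() and
# get_request() chain together. `Core` is a hypothetical stand-in for whatever
# class actually hosts these methods along with the session/cfg wiring.
#
#   core = Core()
#   code = core.send_request()   # registers the request, returns code or False
#   if code and core.get_request():
#       core.parse_dump()        # dump.xml is now unpacked next to the script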
def run():
    log.info('Start load genres.')
    genre_translate = load()
    NEED_SMS = True
    new_genres = []
    is_first_run = not genre_translate
    log.info(f'Current genres: {len(genre_translate)}')
    for genre in Dump.get_all_genres():
        if genre not in genre_translate:
            log.info(f'Added new genre: {genre!r}')
            genre_translate[genre] = None
            new_genres.append(genre)
    if new_genres:
        text = f"Added genres ({len(new_genres)}): {', '.join(new_genres)}"
        log.info(text)
        # Don't send an SMS on the first run
        if not is_first_run:
            if NEED_SMS:
                send_sms(text, log=log)
        log.info('Save genres')
        with open(FILE_NAME_GENRE_TRANSLATE, 'w', encoding='utf-8') as f:
            json.dump(genre_translate, f, ensure_ascii=False, indent=4)
    else:
        log.info('No new genres')
    log.info('Finish!')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

from collections import defaultdict

from db import Dump

print('Total:', Dump.select().count())

genres = Dump.get_all_genres()
print(f'Genres ({len(genres)}): {genres}')

games = Dump.get_all_games()
print(f'Games ({len(games)}): {games}')

sites = Dump.get_all_sites()
print(f'Sites ({len(sites)}): {sites}')

print()

max_width = max(len(x.site) for x in Dump.select(Dump.site).distinct())
fmt_str = ' {:<%d} : {}' % max_width

game_by_dump = defaultdict(list)
# Dump.get() returns a single row in peewee; iterate over select() instead
for x in Dump.select():
    game_by_dump[x.name].append(x)

for game, dumps in game_by_dump.items():
    print(game)
    for dump in dumps:
        # .genres is assumed from the Dump model; adjust if the field differs
        print(fmt_str.format(dump.site, dump.genres))
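# Illustration (added): the defaultdict(list) grouping above in miniature.
# Rows sharing a name collapse into one list per game:
#
#   from collections import defaultdict
#   rows = [('Doom', 'steam'), ('Doom', 'gog'), ('Quake', 'steam')]
#   by_game = defaultdict(list)
#   for name, site in rows:
#       by_game[name].append(site)
#   # by_game == {'Doom': ['steam', 'gog'], 'Quake': ['steam']}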
def check_service_upd(self):
    msg = ''
    logger.info('Current versions: webservice: %s, dump: %s, doc: %s',
                Dump.get(Dump.param == 'webServiceVersion').value,
                Dump.get(Dump.param == 'dumpFormatVersion').value,
                Dump.get(Dump.param == 'docVersion').value)
    if self.update_dump.webServiceVersion != Dump.get(Dump.param == 'webServiceVersion').value:
        logger.warning('New webservice: %s', self.update_dump.webServiceVersion)
        msg = msg + 'Current webservice: ' + Dump.get(Dump.param == 'webServiceVersion').value + \
              '\nNew webservice: ' + self.update_dump.webServiceVersion + '\n\n'
        Dump.update(value=self.update_dump.webServiceVersion).where(
            Dump.param == 'webServiceVersion').execute()
    if self.update_dump.dumpFormatVersion != Dump.get(Dump.param == 'dumpFormatVersion').value:
        logger.warning('New dumpFormatVersion: %s', self.update_dump.dumpFormatVersion)
        msg = msg + 'Current dumpFormatVersion: ' + Dump.get(Dump.param == 'dumpFormatVersion').value + \
              '\nNew dumpFormatVersion: ' + self.update_dump.dumpFormatVersion + '\n\n'
        Dump.update(value=self.update_dump.dumpFormatVersion).where(
            Dump.param == 'dumpFormatVersion').execute()
    if self.update_dump.docVersion != Dump.get(Dump.param == 'docVersion').value:
        logger.warning('New docVersion: %s', self.update_dump.docVersion)
        msg = msg + 'Current docVersion: ' + Dump.get(Dump.param == 'docVersion').value + \
              '\nNew docVersion: ' + self.update_dump.docVersion + '\n\n'
        Dump.update(value=self.update_dump.docVersion).where(
            Dump.param == 'docVersion').execute()
    return msg
def parse_dump(self):
    """Sync the DB with dump.xml.

    Returns 0 if dump.xml is missing, 1 if changes were applied,
    2 if there were no updates.
    """
    if not os.path.exists(self.path_py + '/dump.xml'):
        logger.info('dump.xml not found: %s', self.path_py + '/dump.xml')
        return 0
    logger.info('dump.xml already exists.')
    tree_xml = ElementTree().parse(self.path_py + '/dump.xml')

    dt = datetime.strptime(tree_xml.attrib['updateTime'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time).where(Dump.param == 'lastDumpDate').execute()
    logger.info('Got updateTime: %s.', update_time)

    dt = datetime.strptime(tree_xml.attrib['updateTimeUrgently'][:19], '%Y-%m-%dT%H:%M:%S')
    update_time_urgently = int(time.mktime(dt.timetuple()))
    Dump.update(value=update_time_urgently).where(Dump.param == 'lastDumpDateUrgently').execute()
    logger.info('Got updateTimeUrgently: %s.', update_time_urgently)

    # Phase 1: diff the sets of content ids between dump.xml and the DB.
    list_xml = tree_xml.findall(".//*[@id]")
    id_set_dump = set()
    id_set_db = set()
    for content_xml in list_xml:
        id_set_dump.add(int(content_xml.attrib['id']))
    select_content_id_db = Item.select(Item.content_id).where(Item.purge >> None)
    for content_db in select_content_id_db:
        id_set_db.add(content_db.content_id)
    common_id_set = id_set_dump.intersection(id_set_db)
    delete_id_set = id_set_db.difference(common_id_set)
    add_id_set = id_set_dump.difference(common_id_set)

    # Phase 2: purge ids that vanished from the dump.
    if len(delete_id_set) > 0:
        with self.transact.atomic():
            for del_item in delete_id_set:
                logger.info('Full delete Item, IP, Domain, URL id: %s.', del_item)
                Item.update(purge=self.code_id).where(Item.content_id == del_item,
                                                      Item.purge >> None).execute()
                Domain.update(purge=self.code_id).where(Domain.content_id == del_item,
                                                        Domain.purge >> None).execute()
                URL.update(purge=self.code_id).where(URL.content_id == del_item,
                                                     URL.purge >> None).execute()
                IP.update(purge=self.code_id).where(IP.content_id == del_item,
                                                    IP.purge >> None).execute()

    # Phase 3: create ids that are new in the dump.
    if len(add_id_set) > 0:
        include_time = str()
        urgency_type = int()
        entry_type = int()
        block_type = str()
        hash_value = str()
        with self.transact.atomic():
            for new_item in add_id_set:
                logger.info('New Item, IP, Domain, URL id: %s.', new_item)
                new_item_xml = tree_xml.find(".//content[@id='" + str(new_item) + "']")
                for data_xml in new_item_xml.iter():
                    if data_xml.tag == 'content':
                        content_id = int(data_xml.attrib['id'])
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        hash_value = data_xml.attrib['hash']
                    if data_xml.tag == 'decision':
                        decision_date = data_xml.attrib['date']
                        decision_number = data_xml.attrib['number']
                        decision_org = data_xml.attrib['org']
                        item_new = Item(content_id=content_id,
                                        includeTime=include_time,
                                        urgencyType=urgency_type,
                                        entryType=entry_type,
                                        blockType=block_type,
                                        hashRecord=hash_value,
                                        decision_date=decision_date,
                                        decision_num=decision_number,
                                        decision_org=decision_org,
                                        add=self.code_id)
                        item_new.save()
                    if data_xml.tag == 'url':
                        if not self.only_ascii(data_xml.text):
                            url_split = str(data_xml.text).split(':')
                            url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                        else:
                            url = data_xml.text
                        URL.create(item=item_new.id, content_id=content_id,
                                   url=url, add=self.code_id)
                    if data_xml.tag == 'domain':
                        if not self.only_ascii(data_xml.text):
                            domain = (str(data_xml.text).encode('idna')).decode()
                        else:
                            domain = data_xml.text
                        Domain.create(item=item_new.id, content_id=content_id,
                                      domain=domain, add=self.code_id)
                    if data_xml.tag == 'ip':
                        ip = data_xml.text
                        IP.create(item=item_new.id, content_id=content_id,
                                  ip=ip, add=self.code_id)
                    if data_xml.tag == 'ipSubnet':
                        net = data_xml.text.split('/')
                        ip = net[0]
                        mask = net[1]
                        IP.create(item=item_new.id, content_id=content_id,
                                  ip=ip, mask=mask, add=self.code_id)

    # Phase 4: for ids present on both sides, compare record hashes and
    # reconcile URLs, domains, IPs and subnets per item.
    url_db_set = set()
    url_xml_set = set()
    ip_db_set = set()
    ip_xml_set = set()
    sub_ip_xml_set = set()
    sub_ip_db_set = set()
    domain_db_set = set()
    domain_xml_set = set()
    data_update = False
    with self.transact.atomic():
        for item_xml in list_xml:
            for data_xml in item_xml.iter():
                if data_xml.tag == 'content':
                    content_id = int(data_xml.attrib['id'])
                    hash_value = data_xml.attrib['hash']
                    item_db = Item.get(Item.content_id == content_id, Item.purge >> None)
                    if hash_value != item_db.hashRecord:
                        logger.info('Hashes not equal, update hash id: %s', content_id)
                        try:
                            urgency_type = int(data_xml.attrib['urgencyType'])
                        except KeyError:
                            urgency_type = 0
                        include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                        try:
                            block_type = data_xml.attrib['blockType']
                        except KeyError:
                            block_type = 'default'
                        entry_type = int(data_xml.attrib['entryType'])
                        item_db.hashRecord = hash_value
                        data_update = True
                    else:
                        data_update = False
                        break
                if data_xml.tag == 'decision':
                    decision_date = data_xml.attrib['date']
                    decision_number = data_xml.attrib['number']
                    decision_org = data_xml.attrib['org']
                    if str(item_db.includeTime) != include_time:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML includeTime: %s.', include_time)
                        logger.info('DB includeTime: %s.', item_db.includeTime)
                        item_db.includeTime = include_time
                    if item_db.urgencyType != urgency_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML urgencyType: %s.', urgency_type)
                        logger.info('DB urgencyType: %s.', item_db.urgencyType)
                        item_db.urgencyType = urgency_type
                    if item_db.blockType != block_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML blockType: %s.', block_type)
                        logger.info('DB blockType: %s.', item_db.blockType)
                        item_db.blockType = block_type
                    if item_db.entryType != entry_type:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML entryType: %s.', entry_type)
                        logger.info('DB entryType: %s.', item_db.entryType)
                        item_db.entryType = entry_type
                    if str(item_db.decision_date) != decision_date:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML date: %s.', decision_date)
                        logger.info('DB date: %s.', str(item_db.decision_date))
                        item_db.decision_date = decision_date
                    if item_db.decision_num != decision_number:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML number: %s.', decision_number)
                        logger.info('DB number: %s.', item_db.decision_num)
                        item_db.decision_num = decision_number
                    if item_db.decision_org != decision_org:
                        logger.info('content_id: %s.', content_id)
                        logger.info('XML org: %s.', decision_org)
                        logger.info('DB org: %s.', item_db.decision_org)
                        item_db.decision_org = decision_org
                if data_xml.tag == 'url':
                    if not self.only_ascii(data_xml.text):
                        url_split = str(data_xml.text).split(':')
                        url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                    else:
                        url = data_xml.text
                    url_xml_set.add(url)
                if data_xml.tag == 'domain':
                    if not self.only_ascii(data_xml.text):
                        domain = (str(data_xml.text).encode('idna')).decode()
                    else:
                        domain = data_xml.text
                    domain_xml_set.add(domain)
                if data_xml.tag == 'ip':
                    ip_xml_set.add(data_xml.text)
                if data_xml.tag == 'ipSubnet':
                    sub_ip_xml_set.add(data_xml.text)
            if data_update:
                url_db = URL.select().where(URL.item == item_db.id, URL.purge >> None)
                for url_item in url_db:
                    url_db_set.add(url_item.url)
                if url_db_set != url_xml_set:
                    common_url_set = url_xml_set.intersection(url_db_set)
                    delete_url_set = url_db_set.difference(common_url_set)
                    add_url_set = url_xml_set.difference(common_url_set)
                    if len(delete_url_set) > 0:
                        logger.info('Delete id %s URL: %s', content_id, delete_url_set)
                        for delete_url in delete_url_set:
                            URL.update(purge=self.code_id).where(URL.item == item_db.id,
                                                                 URL.url == delete_url,
                                                                 URL.purge >> None).execute()
                    if len(add_url_set) > 0:
                        logger.info('Add id %s URL: %s', content_id, add_url_set)
                        for add_url in add_url_set:
                            URL.create(item=item_db.id, content_id=item_db.content_id,
                                       url=add_url, add=self.code_id)
                url_db_set.clear()
                url_xml_set.clear()
                domain_db = Domain.select().where(Domain.item == item_db.id,
                                                  Domain.purge >> None)
                for domain_item in domain_db:
                    domain_db_set.add(domain_item.domain)
                if domain_db_set != domain_xml_set:
                    common_domain_set = domain_xml_set.intersection(domain_db_set)
                    delete_domain_set = domain_db_set.difference(common_domain_set)
                    add_domain_set = domain_xml_set.difference(common_domain_set)
                    if len(delete_domain_set) > 0:
                        logger.info('Delete id %s Domain: %s', content_id, delete_domain_set)
                        for delete_domain in delete_domain_set:
                            Domain.update(purge=self.code_id).where(Domain.item == item_db.id,
                                                                    Domain.domain == delete_domain,
                                                                    Domain.purge >> None).execute()
                    if len(add_domain_set) > 0:
                        logger.info('Add id %s Domain: %s', content_id, add_domain_set)
                        for add_domain in add_domain_set:
                            Domain.create(item=item_db.id, content_id=item_db.content_id,
                                          domain=add_domain, add=self.code_id)
                domain_db_set.clear()
                domain_xml_set.clear()
                ip_db = IP.select().where(IP.item == item_db.id, IP.mask == 32,
                                          IP.purge >> None)
                for ip_item in ip_db:
                    ip_db_set.add(ip_item.ip)
                if ip_db_set != ip_xml_set:
                    common_ip_set = ip_xml_set.intersection(ip_db_set)
                    delete_ip_set = ip_db_set.difference(common_ip_set)
                    add_ip_set = ip_xml_set.difference(common_ip_set)
                    if len(delete_ip_set) > 0:
                        logger.info('Delete id %s ip: %s', content_id, delete_ip_set)
                        for delete_ip in delete_ip_set:
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == delete_ip,
                                                                IP.mask == 32,
                                                                IP.purge >> None).execute()
                    if len(add_ip_set) > 0:
                        logger.info('Add id %s ip: %s', content_id, add_ip_set)
                        for add_ip in add_ip_set:
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, add=self.code_id)
                ip_db_set.clear()
                ip_xml_set.clear()
                sub_ip_db = IP.select().where(IP.item == item_db.id, IP.mask < 32,
                                              IP.purge >> None)
                for sub_ip_item in sub_ip_db:
                    sub_ip_db_set.add(str(sub_ip_item.ip) + '/' + str(sub_ip_item.mask))
                if sub_ip_db_set != sub_ip_xml_set:
                    common_sub_ip_set = sub_ip_xml_set.intersection(sub_ip_db_set)
                    delete_sub_ip_set = sub_ip_db_set.difference(common_sub_ip_set)
                    add_sub_ip_set = sub_ip_xml_set.difference(common_sub_ip_set)
                    if len(delete_sub_ip_set) > 0:
                        logger.info('Delete id %s subnet: %s', content_id, delete_sub_ip_set)
                        for delete_sub_ip in delete_sub_ip_set:
                            del_subnet = str(delete_sub_ip).split('/')
                            del_ip = del_subnet[0]
                            del_mask = del_subnet[1]
                            IP.update(purge=self.code_id).where(IP.item == item_db.id,
                                                                IP.ip == del_ip,
                                                                IP.mask == del_mask,
                                                                IP.purge >> None).execute()
                    if len(add_sub_ip_set) > 0:
                        logger.info('Add id %s subnet: %s', content_id, add_sub_ip_set)
                        for add_sub_ip in add_sub_ip_set:
                            add_subnet = str(add_sub_ip).split('/')
                            add_ip = add_subnet[0]
                            add_mask = add_subnet[1]
                            IP.create(item=item_db.id, content_id=item_db.content_id,
                                      ip=add_ip, mask=add_mask, add=self.code_id)
                item_db.save()
                sub_ip_db_set.clear()
                sub_ip_xml_set.clear()

    if self.check_diff():
        self.cleaner()
        return 1
    else:
        logger.info('no updates')
        self.cleaner()
        return 2
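# Illustration (added): parse_dump() reconciles the DB with dump.xml using
# plain set algebra; ids present only in the DB get purged, ids present only
# in the dump get created. A minimal standalone sketch of that step:
def _diff_id_sets(id_set_dump, id_set_db):
    common = id_set_dump & id_set_db
    delete_ids = id_set_db - common    # in the DB, but gone from the dump
    add_ids = id_set_dump - common     # in the dump, but not in the DB yet
    return add_ids, delete_ids

# _diff_id_sets({1, 2, 3}, {2, 3, 4}) == ({1}, {4})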
def check_new_dump(self):
    logger.info('Check if dump.xml has updates since last sync.')
    if self.cfg.lastDumpDateUrgently() and not self.cfg.lastDumpDate():
        last_date_dump = self.update_dump.lastDumpDateUrgently // 1000
        current_date_dump = int(Dump.get(Dump.param == 'lastDumpDateUrgently').value)
    elif self.cfg.lastDumpDate() and not self.cfg.lastDumpDateUrgently():
        last_date_dump = self.update_dump.lastDumpDate // 1000
        current_date_dump = int(Dump.get(Dump.param == 'lastDumpDate').value)
    else:
        last_date_dump = max(self.update_dump.lastDumpDate // 1000,
                             self.update_dump.lastDumpDateUrgently // 1000)
        current_date_dump = max(int(Dump.get(Dump.param == 'lastDumpDate').value),
                                int(Dump.get(Dump.param == 'lastDumpDateUrgently').value))
    logger.info('Current date: lastDumpDate: %s, lastDumpDateUrgently: %s',
                datetime.fromtimestamp(int(Dump.get(Dump.param == 'lastDumpDate').value))
                .strftime('%Y-%m-%d %H:%M:%S'),
                datetime.fromtimestamp(int(Dump.get(Dump.param == 'lastDumpDateUrgently').value))
                .strftime('%Y-%m-%d %H:%M:%S'))
    logger.info('Last date: lastDumpDate: %s, lastDumpDateUrgently: %s',
                datetime.fromtimestamp(int(self.update_dump.lastDumpDate // 1000))
                .strftime('%Y-%m-%d %H:%M:%S'),
                datetime.fromtimestamp(int(self.update_dump.lastDumpDateUrgently // 1000))
                .strftime('%Y-%m-%d %H:%M:%S'))
    if (last_date_dump != current_date_dump
            or Dump.get(Dump.param == 'lastResult').value == 'Error'):
        logger.info('New dump is available.')
        Dump.update(value='getLastDumpDate').where(Dump.param == 'lastAction').execute()
        Dump.update(value='NewDump').where(Dump.param == 'lastResult').execute()
        return True
    else:
        logger.info('Dump date without changes.')
        Dump.update(value='getLastDumpDate').where(Dump.param == 'lastAction').execute()
        Dump.update(value='lastDump').where(Dump.param == 'lastResult').execute()
        return False
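# Note (added): the web service reports lastDumpDate/lastDumpDateUrgently as
# Unix timestamps in milliseconds, while the Dump table caches epoch seconds;
# hence the `// 1000` before every comparison in check_new_dump().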
__author__ = 'ipetrash'

import json
import sys

sys.path.append('..')
from db import Dump

import peewee
from playhouse.shortcuts import dict_to_model

from export import FILE_NAME_EXPORT_JSON

with open(FILE_NAME_EXPORT_JSON, encoding='utf-8') as f:
    items = json.load(f)
print('items:', len(items))

print('Dump count before import:', Dump.select().count())

for x in items:
    try:
        dump = dict_to_model(Dump, x)
        dump.save(force_insert=True)
        print(f'Import {x}')
    except peewee.IntegrityError:
        # Ignore error "UNIQUE constraint failed: dump.id"
        pass

print('Current dump count:', Dump.select().count())
def statistics_show(self, diff=0, stdout=False):
    date_time = datetime.fromtimestamp(int(Dump.get(Dump.param == 'lastDumpDate').value)) \
        .strftime('%Y-%m-%d %H:%M:%S')
    message = 'vigruzki.rkn.gov.ru update: ' + date_time + '\n'

    url_add_sql = self._url_diff_sql(diff, 'ignore', 1)
    message += '\nURLs added: \n\n'
    for url_add in url_add_sql:
        message += url_add.url + '\n'

    ip_add_sql = self._ip_diff_sql(diff, 'ignore', 1)
    message += '\nIPs added: \n\n'
    for ip_add in ip_add_sql:
        if ip_add.mask < 32:
            message += ip_add.ip + '/' + str(ip_add.mask) + '\n'
        else:
            message += ip_add.ip + '\n'

    domain_add_sql = self._domain_diff_sql(diff, 'ignore', 1)
    message += '\nDOMAINs added: \n\n'
    for domain_add in domain_add_sql:
        message += domain_add.domain + '\n'

    url_del_sql = self._url_diff_sql(diff, 'ignore', 0)
    message += '\nURLs deleted: \n\n'
    for url_del in url_del_sql:
        message += url_del.url + '\n'

    ip_del_sql = self._ip_diff_sql(diff, 'ignore', 0)
    message += '\nIPs deleted: \n\n'
    for ip_del in ip_del_sql:
        if ip_del.mask < 32:
            message += ip_del.ip + '/' + str(ip_del.mask) + '\n'
        else:
            message += ip_del.ip + '\n'

    domain_del_sql = self._domain_diff_sql(diff, 'ignore', 0)
    message += '\nDOMAINs deleted: \n\n'
    for domain_del in domain_del_sql:
        message += domain_del.domain + '\n'

    # Rows added within the last `diff` syncs are excluded; rows purged
    # within them still count as present.
    rb_list = self.idx_list[:diff]
    domain_count = Domain.select(fn.Count(fn.Distinct(Domain.domain))) \
        .where(~(Domain.add << rb_list) &
               ((Domain.purge >> None) | (Domain.purge << rb_list))).scalar()
    url_count = URL.select(fn.Count(fn.Distinct(URL.url))) \
        .where(~(URL.add << rb_list) &
               ((URL.purge >> None) | (URL.purge << rb_list))).scalar()
    ip_count = IP.select(fn.Count(fn.Distinct(IP.ip))) \
        .where(~(IP.add << rb_list) &
               ((IP.purge >> None) | (IP.purge << rb_list))).scalar()
    id_count = Item.select(fn.Count(fn.Distinct(Item.content_id))) \
        .where(~(Item.add << rb_list) &
               ((Item.purge >> None) | (Item.purge << rb_list))).scalar()

    message += '\nURLs count: ' + str(url_count) + '\n'
    message += 'IPs count: ' + str(ip_count) + '\n'
    message += 'DOMAINs count: ' + str(domain_count) + '\n'
    message += 'Item count: ' + str(id_count) + '\n'

    if stdout:
        print(message)
        return False
    else:
        return message
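# Note (added): statistics_show() relies on peewee operator overloads:
# `field >> None` compiles to "field IS NULL" and `field << rb_list` to
# "field IN (...)". The URL count above is therefore roughly equivalent to
# (table/column names assumed from the model):
#
#   SELECT COUNT(DISTINCT url) FROM url
#   WHERE "add" NOT IN (<rb_list>)
#     AND (purge IS NULL OR purge IN (<rb_list>));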
log.info('Start.')

if Path(FILE_NAME_GAMES).exists():
    backup_file_name = str(FILE_NAME_BACKUP /
                           (DT.datetime.today().strftime('%d%m%y-%H%M%S_') +
                            Path(FILE_NAME_GAMES).name))
    shutil.copy(FILE_NAME_GAMES, backup_file_name)
    log.info(f'Save backup to: {backup_file_name}')
log.info('')

log.info('Loading cache...')
game_by_genres = load(FILE_NAME_GAMES)
log.info(f'game_by_genres ({len(game_by_genres)}): {game_by_genres}')

new_game_by_genres = Dump.dump()
log.info(f'new_game_by_genres ({len(new_game_by_genres)}): {new_game_by_genres}')

genre_translate = load()
log.info(f'genre_translate ({len(genre_translate)}): {genre_translate}')
log.info('Finish loading cache.')
log.info('')

log.info('Search games...')
number = 0
for game, genres in new_game_by_genres.items():
    if game in game_by_genres:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

import json
from pathlib import Path
import sys

sys.path.append('..')
from db import Dump

from playhouse.shortcuts import model_to_dict

DIR = Path(__file__).parent.resolve() / 'data'
DIR.mkdir(parents=True, exist_ok=True)

FILE_NAME_EXPORT_JSON = DIR / 'games.json'

if __name__ == '__main__':
    items = [model_to_dict(dump) for dump in Dump.select()]
    print(len(items))

    with open(FILE_NAME_EXPORT_JSON, 'w', encoding='utf-8') as f:
        json.dump(items, f, ensure_ascii=False, indent=4)
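# Note (added): this export pairs with the import script above: the JSON file
# written here (FILE_NAME_EXPORT_JSON) is what dict_to_model() reads back,
# with duplicate primary keys skipped via the IntegrityError handler.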
def run_parser(parser, games: list, max_num_request=5):
    try:
        # Escalating pauses between failed attempts
        pauses = [
            ('15 minutes', 15 * 60),
            ('30 minutes', 30 * 60),
            ('45 minutes', 45 * 60),
            ('1 hour', 60 * 60),
        ]

        SITE_NAME = parser.get_site_name()

        timeout = 3  # 3 seconds
        MAX_TIMEOUT = 10  # 10 seconds
        TIMEOUT_EVERY_N_GAMES = 50  # Every 50 games
        TIMEOUT_BETWEEN_N_GAMES = 3 * 60  # 3 minutes

        number = 0
        for game_name in games:
            try:
                if Dump.exists(SITE_NAME, game_name):
                    continue

                number += 1
                num_request = 0
                while True:
                    num_request += 1
                    try:
                        if num_request == 1:
                            log.info(f'#{number}. Search genres for {game_name!r} ({SITE_NAME})')
                        else:
                            log.info(f'#{number}. Search genres for {game_name!r} ({SITE_NAME}). '
                                     f'Attempts {num_request}/{max_num_request}')

                        genres = parser.get_game_genres(game_name)
                        log.info(f'#{number}. Found genres {game_name!r} ({SITE_NAME}): {genres}')

                        Dump.add(SITE_NAME, game_name, genres)
                        counter.inc()

                        time.sleep(timeout)
                        break

                    except Exception:
                        log.exception(f'#{number}. Error on request '
                                      f'{num_request}/{max_num_request} ({SITE_NAME})')

                        if num_request >= max_num_request:
                            log.info(f'#{number}. Attempts ended for {game_name!r} ({SITE_NAME})')
                            break

                        # Clamp to the last pause if attempts outnumber the table
                        pause_text, pause_secs = pauses[min(num_request, len(pauses)) - 1]
                        log.info(f'#{number}. Pause: {pause_text}')
                        time.sleep(pause_secs)

                        timeout += 1
                        if timeout > MAX_TIMEOUT:
                            timeout = MAX_TIMEOUT

                if number % TIMEOUT_EVERY_N_GAMES == 0:
                    log.info(f'#{number}. Pause for every {TIMEOUT_EVERY_N_GAMES} games: '
                             f'{TIMEOUT_BETWEEN_N_GAMES} secs')
                    time.sleep(TIMEOUT_BETWEEN_N_GAMES)

            except Exception:
                log.exception(f'#{number}. Error by game {game_name!r} ({SITE_NAME})')

    except Exception:
        log.exception('Error:')
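# Hedged usage sketch (not from the original file): `SomeSiteParser` and
# `load_game_names()` are hypothetical placeholders; any object exposing
# get_site_name() and get_game_genres(name) fits the parser protocol that
# run_parser() expects.
#
#   parser = SomeSiteParser()
#   games = load_game_names()            # e.g. a list of game titles
#   run_parser(parser, games, max_num_request=5)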