def export_files(self):
    """Export declarator file records into the dlrobot-human DBM store.

    Opens (or recreates, per --start-from-empty) the target DBM, skips file ids
    already referenced there, downloads each remaining declarator file, and adds
    one TSourceDocument with a TDeclaratorReference per downloaded local file.
    Finishes by closing the DBM and pushing the new PDFs to smart_parser.
    """
    human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
    if self.args.start_from_empty:
        human_files_db.create_db()
    else:
        human_files_db.open_write_mode()
    # Collect declarator document_file_ids already present in the store,
    # so already-exported files are skipped below.
    document_file_ids = set()
    for sha256, doc in human_files_db.get_all_documents():
        for ref in doc.decl_references:
            if ref.document_file_id is not None:
                document_file_ids.add(ref.document_file_id)
    files_count = 0
    for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
        if document_file_id in document_file_ids:
            continue
        # Back off while the pdf conversion server reports overload; retry every 5 minutes.
        while self.pdf_conversion_client.server_is_too_busy():
            self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                self.pdf_conversion_client.last_pdf_conversion_queue_length
            ))
            time.sleep(5*60)
        # Normalize the site host: strip a leading "www." from the netloc.
        web_site = urlsplit_pro(link).netloc
        if web_site.startswith('www.'):
            web_site = web_site[len('www.'):]
        # Optional cap on the number of exported files (None means unlimited).
        if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
            break
        self.logger.debug("export document_file_id={}".format(document_file_id))
        # One declarator file may unpack into several local files (e.g. archives);
        # each gets its own source document keyed by its content sha256.
        for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
            sha256 = build_dislosures_sha256(local_file_path)
            self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
            source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
            ref = TDeclaratorReference()
            ref.document_id = document_id
            ref.document_file_id = document_file_id
            ref._site_url = web_site
            ref.office_id = self.fix_list(sha256, office_id)
            ref.income_year = income_year
            ref.document_file_url = declarator_url
            source_document.add_decl_reference(ref)
            human_files_db.update_source_document(sha256, source_document)
            files_count += 1
    self.logger.debug('added files count: {}'.format(files_count))
    human_files_db.close_db()
    self.send_new_pdfs_to_smart_parser()
def get_all_documents(self):
    """Iterate the whole DBM store, yielding (sha256, TSourceDocument) pairs."""
    key = self.db.firstkey()
    while key is not None:
        record = json.loads(self.db[key])
        # Keys are stored as bytes; decode with latin-1 to recover the hex sha256.
        yield key.decode('latin'), TSourceDocument().from_json(record)
        key = self.db.nextkey(key)
def register_document_in_database(self, sha256, src_doc: TSourceDocument):
    """Create the Django Source_Document row for src_doc plus its reference rows.

    The row id comes from the permalinks DB (stable across rebuilds); the
    assertion guards against registering the same id twice in this run.
    Returns the saved Source_Document model instance.
    """
    source_document_in_db = models.Source_Document(
        sha256=sha256,
        intersection_status=src_doc.build_intersection_status(),
    )
    # get_source_doc_id_by_sha256 returns (persistent id, whether the sha256 is new).
    source_document_in_db.id, new_file = self.permalinks_db_source_document.get_source_doc_id_by_sha256(sha256)
    assert not models.Source_Document.objects.filter(id=source_document_in_db.id).exists()
    self.logger.debug("register doc sha256={} id={}, new_file={}".format(
        sha256, source_document_in_db.id, new_file))
    source_document_in_db.file_extension = src_doc.file_extension
    source_document_in_db.save()
    # One Declarator_File_Reference row per declarator reference.
    ref: TDeclaratorReference
    for ref in src_doc.decl_references:
        models.Declarator_File_Reference(
            source_document=source_document_in_db,
            declarator_documentfile_id=ref.document_file_id,
            declarator_document_id=ref.document_id,
            web_domain=ref._site_url,
            declarator_document_file_url=ref.document_file_url).save()
    # One Web_Reference row per dlrobot crawl reference.
    ref: TWebReference
    for ref in src_doc.web_references:
        models.Web_Reference(source_document=source_document_in_db,
                             dlrobot_url=ref.url,
                             web_domain=ref._site_url,
                             crawl_epoch=ref.crawl_epoch).save()
    return source_document_in_db
def set_office_id(self, sha256, src_doc: TSourceDocument, office_id, method_name: str):
    """Assign office_id to src_doc and persist it, logging whether this is a
    fresh assignment (debug) or an override of a different value (info)."""
    previous = src_doc.calculated_office_id
    if previous is not None and office_id != previous:
        self.logger.info("change office_id from {} to {} for file {} , ({})".format(
            previous, office_id, sha256, method_name))
    else:
        self.logger.debug("set file {} office_id={} ({} )".format(
            sha256, office_id, method_name))
    src_doc.calculated_office_id = office_id
    self.dlrobot_human.update_source_document(sha256, src_doc)
def calc_income_year(self, input_json, src_doc: TSourceDocument, section_json, section_index):
    """Determine the income year for one section.

    Prefers the per-declarant year from section_json (one file may contain
    declarants with different years); smart_parser may emit "year": null
    explicitly, which is why the value is tested instead of passing a default
    to get(). Falls back to the document-level year; raises SerializerException
    when neither is available, since such a file is useless.
    """
    year = section_json.get('year')
    if year is None:
        year = src_doc.calc_document_income_year(input_json)
    if year is None:
        raise TSmartParserSectionJson.SerializerException(
            "year is not defined: section No {}".format(section_index))
    return int(year)
def add_dlrobot_file(self, sha256, file_extension, web_refs=None, decl_refs=None):
    """Register (or extend) the source document stored under sha256.

    Creates a new TSourceDocument with file_extension when sha256 is not yet in
    the output store, attaches the given web and declarator references, and
    persists the result.

    web_refs/decl_refs default to None instead of mutable list literals
    (the shared-mutable-default pitfall); None means "no references".
    """
    src_doc = self.output_dlrobot_human.get_document_maybe(sha256)
    if src_doc is None:
        src_doc = TSourceDocument(file_extension)
        # Persist immediately so the sha256 key exists even if no refs follow.
        self.output_dlrobot_human.update_source_document(sha256, src_doc)
    for web_ref in (web_refs if web_refs is not None else []):
        src_doc.add_web_reference(web_ref)
    for decl_ref in (decl_refs if decl_refs is not None else []):
        src_doc.add_decl_reference(decl_ref)
    self.output_dlrobot_human.update_source_document(sha256, src_doc)
def predict_tax_office(self, sha256, src_doc: TSourceDocument):
    """Try to assign a regional tax office to a document hosted on service.nalog.ru.

    The region id is parsed from the numeric prefix of the sheet url in the
    smart_parser json (cached on src_doc.region_id once known). Returns True
    when an office id was assigned, False otherwise.
    """
    web_ref: TWebReference
    for web_ref in src_doc.web_references:
        if not web_ref._site_url.endswith("service.nalog.ru"):
            continue
        if src_doc.region_id is None:
            smart_parser_json = self.smart_parser_server_client.retrieve_json_by_sha256(sha256)
            if smart_parser_json is None:
                return False
            sheet_props = smart_parser_json.get('document_sheet_props')
            if sheet_props is None or len(sheet_props) == 0 or 'url' not in sheet_props[0]:
                return False
            url = sheet_props[0]['url']
            region_prefix = url[:url.find('.')]
            if not region_prefix.isdigit():
                return False
            src_doc.region_id = int(region_prefix)
        office_id = self.regional_tax_offices.get(src_doc.region_id)
        if office_id is not None:
            self.set_office_id(sha256, src_doc, office_id, "regional tax office")
            return True
    return False
def convert_from_json_fle(self, json_path: str):
    """Import every document record from a dlrobot-human JSON dump into this store.

    The companion writer serializes with ensure_ascii=False, so the file can
    contain raw non-ASCII text; open it explicitly as UTF-8 instead of relying
    on the platform default encoding (which breaks e.g. on cp1252 locales).
    """
    with open(json_path, encoding="utf-8") as inp:
        js = json.load(inp)
    for sha256, doc_json in js['documents'].items():
        self.update_source_document(sha256, TSourceDocument().from_json(doc_json))
def get_document_maybe(self, sha256):
    """Return the TSourceDocument stored under sha256, or None when absent."""
    raw = self.db.get(sha256)
    if raw is None:
        return None
    return TSourceDocument().from_json(json.loads(raw))
def get_document(self, sha256) -> TSourceDocument:
    """Return the TSourceDocument stored under sha256; a missing key propagates
    the underlying store's lookup error."""
    raw = self.db[sha256]
    parsed = json.loads(raw)
    return TSourceDocument().from_json(parsed)
def update_source_document(self, sha256, src_doc: TSourceDocument):
    """Serialize src_doc to JSON and store it under sha256.

    Must not be called when the store was opened read-only.
    """
    assert self.access_mode != 'r'
    serialized = json.dumps(src_doc.write_to_json(), ensure_ascii=False)
    self.db[sha256] = serialized