def add_dlrobot_file(self, sha256, file_extension, web_refs=None, decl_refs=None):
    """Attach references to the source document stored under ``sha256``.

    The document is looked up in ``self.output_dlrobot_human``; when it is
    missing, a new ``TSourceDocument(file_extension)`` is created and stored
    first. Every item of ``web_refs`` is passed to ``add_web_reference`` and
    every item of ``decl_refs`` to ``add_decl_reference``, after which the
    document is written back.

    Args:
        sha256: key of the document in ``self.output_dlrobot_human``.
        file_extension: used only when a new document has to be created.
        web_refs: optional iterable of web references; ``None`` means none.
        decl_refs: optional iterable of declarator references; ``None``
            means none.
    """
    # ``None`` sentinels replace the former ``[]`` defaults, which were
    # single list objects shared by every call of this method.
    if web_refs is None:
        web_refs = ()
    if decl_refs is None:
        decl_refs = ()

    src_doc = self.output_dlrobot_human.get_document_maybe(sha256)
    if src_doc is None:
        src_doc = TSourceDocument(file_extension)
        # Store the freshly created document before references are added.
        self.output_dlrobot_human.update_source_document(sha256, src_doc)

    for web_ref in web_refs:
        src_doc.add_web_reference(web_ref)
    for decl_ref in decl_refs:
        src_doc.add_decl_reference(decl_ref)

    # Write the (possibly modified) document back under the same key.
    self.output_dlrobot_human.update_source_document(sha256, src_doc)
def export_files(self):
    """Export not-yet-known declarator files into the dlrobot_human DB.

    Steps:
      1. Open the DB at ``self.args.dlrobot_human_json`` (created from
         scratch when ``self.args.start_from_empty`` is set, otherwise
         opened in write mode).
      2. Collect every ``document_file_id`` already referenced by the
         declarator references of the stored documents.
      3. For each record from ``self.get_all_file_sql_records()`` whose
         ``document_file_id`` is not yet known, process the file via
         ``self.download_unzip_and_send_file_source_doc_server`` and store
         one source document with one declarator reference per yielded
         local file.
      4. Close the DB and call ``self.send_new_pdfs_to_smart_parser()``.

    Export stops once ``self.args.max_files_count`` stored files is reached
    (no limit when it is ``None``). The DB is closed even if an exception
    interrupts the export; the exception is then propagated and step 4's
    parser call is skipped.
    """
    busy_server_pause_seconds = 5 * 60
    www_prefix = 'www.'
    max_files_count = self.args.max_files_count

    human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
    if self.args.start_from_empty:
        human_files_db.create_db()
    else:
        human_files_db.open_write_mode()

    try:
        # File ids that already have a declarator reference in the DB.
        document_file_ids = set()
        for _, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)

        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            # Check the limit before anything else, so that a finished run
            # never waits on the conversion server for nothing.
            if max_files_count is not None and files_count >= max_files_count:
                break
            if document_file_id in document_file_ids:
                continue

            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(busy_server_pause_seconds)

            # Host name of the link without a leading "www.".
            web_site = urlsplit_pro(link).netloc
            if web_site.startswith(www_prefix):
                web_site = web_site[len(www_prefix):]

            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))

                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url

                # NOTE(review): a brand-new document is stored under sha256
                # without looking up an existing one first; presumably
                # update_source_document replaces the stored entry - confirm
                # that losing earlier references is intended.
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                # Counted per stored local file, not per document_file_id.
                files_count += 1

        self.logger.debug('added files count: {}'.format(files_count))
    finally:
        # Release the DB handle on failures as well as on success.
        human_files_db.close_db()

    self.send_new_pdfs_to_smart_parser()