コード例 #1
0
 def add_dlrobot_file(self,
                      sha256,
                      file_extension,
                      web_refs=None,
                      decl_refs=None):
     """Attach references to the source document stored under ``sha256``.

     The document is looked up in ``self.output_dlrobot_human``; when it is
     absent, a new ``TSourceDocument`` is created from ``file_extension``
     and stored first. Every item of ``web_refs`` is passed to
     ``add_web_reference`` and every item of ``decl_refs`` to
     ``add_decl_reference``, after which the document is written back.

     Args:
         sha256: Key of the document in ``self.output_dlrobot_human``.
         file_extension: Passed to ``TSourceDocument`` when a new document
             has to be created; unused if the document already exists.
         web_refs: Optional iterable of web references to add.
         decl_refs: Optional iterable of declarator references to add.
     """
     # ``None`` stands in for "no references": a literal ``[]`` default would
     # be one list object shared by every call of this method.
     if web_refs is None:
         web_refs = ()
     if decl_refs is None:
         decl_refs = ()

     src_doc = self.output_dlrobot_human.get_document_maybe(sha256)
     if src_doc is None:
         src_doc = TSourceDocument(file_extension)
         # Store the freshly created document right away; the final update
         # below writes it again once the references are attached.
         self.output_dlrobot_human.update_source_document(sha256, src_doc)
     for web_ref in web_refs:
         src_doc.add_web_reference(web_ref)
     for decl_ref in decl_refs:
         src_doc.add_decl_reference(decl_ref)
     self.output_dlrobot_human.update_source_document(sha256, src_doc)
コード例 #2
0
    def export_files(self):
        """Export declarator files that the dlrobot_human database does not reference yet.

        Steps:
          1. Open the database at ``self.args.dlrobot_human_json`` (created
             from scratch when ``self.args.start_from_empty`` is set).
          2. Collect every ``document_file_id`` already referenced by a stored
             document, so those records can be skipped.
          3. For each remaining record from ``get_all_file_sql_records()``,
             obtain its local files via
             ``download_unzip_and_send_file_source_doc_server`` and store one
             ``TSourceDocument`` with a ``TDeclaratorReference`` per file.
          4. Close the database and call ``send_new_pdfs_to_smart_parser()``.

        ``self.args.max_files_count`` (when not ``None``) stops the export
        once that many files were added; it is checked once per SQL record,
        so a record that yields several files may overshoot the limit.

        The database is closed even if the export fails midway; in that case
        the exception propagates and ``send_new_pdfs_to_smart_parser()`` is
        not called.
        """
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()

        try:
            # Declarator file ids that already have a reference in the database.
            document_file_ids = set()
            for _sha256, doc in human_files_db.get_all_documents():
                for ref in doc.decl_references:
                    if ref.document_file_id is not None:
                        document_file_ids.add(ref.document_file_id)

            files_count = 0
            for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
                if document_file_id in document_file_ids:
                    continue

                # Check the limit before the busy-wait below, so we never
                # sleep for minutes only to stop right afterwards.
                if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                    break

                while self.pdf_conversion_client.server_is_too_busy():
                    self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                        self.pdf_conversion_client.last_pdf_conversion_queue_length
                    ))
                    time.sleep(5*60)

                # Host part of the link, without a leading "www.".
                web_site = urlsplit_pro(link).netloc
                if web_site.startswith('www.'):
                    web_site = web_site[len('www.'):]

                self.logger.debug("export document_file_id={}".format(document_file_id))
                for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path,
                                                                                                        document_file_id):
                    sha256 = build_dislosures_sha256(local_file_path)
                    self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                    source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                    ref = TDeclaratorReference()
                    ref.document_id = document_id
                    ref.document_file_id = document_file_id
                    ref._site_url = web_site
                    ref.office_id = self.fix_list(sha256, office_id)
                    ref.income_year = income_year
                    ref.document_file_url = declarator_url
                    source_document.add_decl_reference(ref)
                    # NOTE(review): a fresh document is written under sha256;
                    # presumably this replaces any document already stored
                    # under the same hash - confirm against the DB class.
                    human_files_db.update_source_document(sha256, source_document)
                    files_count += 1
            self.logger.debug('added files count: {}'.format(files_count))
        finally:
            # Always release the database handle, even when the export fails.
            human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()