def write_and_read(self, file_name):
    # write a new file to storage
    output_file = self.process_with_client([file_name])[0]
    self.assertTrue(os.path.exists(output_file))
    file_size = os.stat(output_file).st_size
    hash_code = build_dislosures_sha256(output_file)

    # delete the local copy and read the file back from storage
    os.unlink(output_file)
    output_file_copy = self.process_with_client([file_name])[0]
    self.assertTrue(os.path.exists(output_file_copy))
    self.assertEqual(os.stat(output_file_copy).st_size, file_size)
    self.assertEqual(hash_code, build_dislosures_sha256(output_file_copy))
    return file_size, hash_code
def get_text_of_a_document(source_file, keep_txt=False, reuse_txt=False, output_folder=None):
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder, os.path.basename(source_file) + ".txt")

    if reuse_txt and os.path.exists(txt_file):
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
            # xls2csv failed: retry via the xlsx converter under an .xlsx name
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]))
        if conversion_client.retrieve_document(sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
            # the worst case: fall back to calibre
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
            # catdoc failed: retry via office2txt under a .docx name
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        return None

    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        return None
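# A minimal usage sketch for get_text_of_a_document defined above (the file name is
# hypothetical, not from the source). The function returns the extracted text, or
# None when the extension is unsupported or no txt file was produced.
def _example_get_text():
    text = get_text_of_a_document("declaration.xlsx", keep_txt=False, reuse_txt=False)
    if text is None:
        print("conversion failed or the extension is not supported")
    else:
        print(text[:200])
# With keep_txt=True the intermediate "declaration.xlsx.txt" is preserved, and
# reuse_txt=True skips reconversion when that txt file already exists.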
def test_many_bin_files(self):
    file_data1 = b"12345_1"
    with open("test1.txt", "wb") as outp:
        outp.write(file_data1)
    file_data2 = b"12345_2"
    with open("test2.txt", "wb") as outp:
        outp.write(file_data2)

    self.assertTrue(self.env.client.send_file("test1.txt"))
    self.assertTrue(self.env.client.send_file("test2.txt"))
    stats = self.env.client.get_stats()
    self.assertEqual(stats['bin_files_count'], 2)

    file_data_, _ = self.env.client.retrieve_file_data_by_sha256(build_dislosures_sha256("test1.txt"))
    self.assertEqual(file_data1, file_data_)
    file_data_, _ = self.env.client.retrieve_file_data_by_sha256(build_dislosures_sha256("test2.txt"))
    self.assertEqual(file_data2, file_data_)
def test_send_file_and_retrieve(self):
    file_data = b"12345"
    with open("test.txt", "wb") as outp:
        outp.write(file_data)
    self.assertTrue(self.env.client.send_file("test.txt"))
    stats = self.env.client.get_stats()
    self.assertEqual(stats['source_doc_count'], 1)
    sha256 = build_dislosures_sha256("test.txt")
    file_data1, file_extension = self.env.client.retrieve_file_data_by_sha256(sha256)
    self.assertEqual(file_data1, file_data)
    self.assertEqual(file_extension, ".txt")
def test_reload(self):
    file_data1 = b"12345_1"
    file_path = "test8484.txt"
    with open(file_path, "wb") as outp:
        outp.write(file_data1)
    self.assertTrue(self.env.client.send_file(file_path))
    stats = self.env.client.get_stats()

    # close the storage and reload it from disk, then check the file is still retrievable
    self.env.server.file_storage.close_file_storage()
    self.env.server.file_storage.load_from_disk()
    file_data_, _ = self.env.client.retrieve_file_data_by_sha256(build_dislosures_sha256(file_path))
    self.assertEqual(file_data1, file_data_)
def export_files(self):
    human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
    if self.args.start_from_empty:
        human_files_db.create_db()
    else:
        human_files_db.open_write_mode()

    # collect document_file_ids that are already present in the db
    document_file_ids = set()
    for sha256, doc in human_files_db.get_all_documents():
        for ref in doc.decl_references:
            if ref.document_file_id is not None:
                document_file_ids.add(ref.document_file_id)

    files_count = 0
    for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
        if document_file_id in document_file_ids:
            continue
        while self.pdf_conversion_client.server_is_too_busy():
            self.logger.error("waiting 5 minutes for the pdf conversion server, last_pdf_conversion_queue_length={}".format(
                self.pdf_conversion_client.last_pdf_conversion_queue_length))
            time.sleep(5 * 60)
        web_site = urlsplit_pro(link).netloc
        if web_site.startswith('www.'):
            web_site = web_site[len('www.'):]
        if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
            break
        self.logger.debug("export document_file_id={}".format(document_file_id))
        for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
            sha256 = build_dislosures_sha256(local_file_path)
            self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
            source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
            ref = TDeclaratorReference()
            ref.document_id = document_id
            ref.document_file_id = document_file_id
            ref._site_url = web_site
            ref.office_id = self.fix_list(sha256, office_id)
            ref.income_year = income_year
            ref.document_file_url = declarator_url
            source_document.add_decl_reference(ref)
            human_files_db.update_source_document(sha256, source_document)
            files_count += 1

    self.logger.debug('added files count: {}'.format(files_count))
    human_files_db.close_db()
    self.send_new_pdfs_to_smart_parser()
def receive_files(self, sent_files):
    errors_count = 0
    for filepath in sent_files:
        self.logger.debug("download docx for {}".format(filepath))
        sha256hash = build_dislosures_sha256(filepath)
        outfile = filepath + ".docx"
        if self.args.output_folder is not None:
            outfile = os.path.join(self.args.output_folder, os.path.basename(outfile))
        if self.retrieve_document(sha256hash, outfile):
            if os.path.exists(outfile):
                self.logger.debug("save {}".format(outfile))
        else:
            self.logger.error("cannot download docx for file {}".format(filepath))
            errors_count += 1
    return errors_count == 0
def _send_file_to_conversion_db(self, filename, file_extension, rebuild):
    with open(filename, "rb") as f:
        file_contents = f.read()

    # a well-formed pdf must start with the "%PDF-" magic bytes
    starter = file_contents[:5].decode('latin', errors="ignore")
    if starter != '%PDF-':
        self.logger.debug("{} has a bad pdf starter, do not send it".format(filename))
        return

    hashcode = build_dislosures_sha256(filename)
    if hashcode in self._sent_tasks:
        return
    if not rebuild:
        if self.check_file_was_converted(hashcode):
            return
    self.logger.debug("register conversion task for {}".format(filename))
    if self._register_task(file_extension, file_contents, hashcode, rebuild):
        self.all_pdf_size_sent_to_conversion += Path(filename).stat().st_size
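# A standalone sketch of the magic-byte check above, assuming nothing beyond the
# standard library: a well-formed pdf starts with the ascii bytes b"%PDF-", so
# reading the first five bytes is enough to reject obvious non-pdf inputs.
def _looks_like_pdf(filename):
    with open(filename, "rb") as f:
        return f.read(5) == b"%PDF-"
# Comparing raw bytes avoids the decode('latin', errors="ignore") round-trip used
# above; since latin-1 maps every byte to a codepoint, both checks accept the same files.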
def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
    path, declarator_filename = os.path.split(declarator_url_path)
    _, ext = os.path.splitext(declarator_filename)
    ext = ext.lower()
    temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
    declarator_url = os.path.join(DECLARATOR_DOMAIN, "media", urllib.parse.quote(declarator_url_path))
    declarator_url = declarator_url.replace('\\', '/')

    for file_name in self.download_file_and_unzip(declarator_url, temp_file):
        self.source_doc_client.send_file(file_name)
        if file_name.lower().endswith('.pdf'):
            _, extension = os.path.splitext(file_name)
            self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
            self.new_pdfs.add(build_dislosures_sha256(file_name))
        else:
            self.smart_parser_server_client.send_file(file_name)
        yield file_name, declarator_url

    self.pdf_conversion_client.wait_all_tasks_to_be_sent()
    # delete all temp files after the generator is exhausted
    for f in os.listdir(self.args.tmp_folder):
        os.unlink(os.path.join(self.args.tmp_folder, f))
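# An illustrative sketch of the url-building step above (the domain and path are
# stand-ins for DECLARATOR_DOMAIN and a real media path, not taken from the source):
# urllib.parse.quote percent-encodes unsafe path characters, and the subsequent
# replace('\\', '/') compensates for os.path.join producing backslashes on Windows.
def _example_declarator_url():
    import urllib.parse
    url_path = "files/1234/declaration with spaces.pdf"  # hypothetical media path
    return "/".join(("https://declarator.example", "media", urllib.parse.quote(url_path)))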
def test_winword_hangs(self):
    file_path = "../files/winword2019_hangs.pdf"
    output_files = self.process_with_client([file_path], timeout=240, skip_receiving=True)
    sha256 = build_dislosures_sha256(file_path)
    for x in range(120):
        time.sleep(1)
        # the server must answer and accept requests while winword is working (hanging) in the background
        self.assertTrue(self.client.assert_declarator_conv_alive(raise_exception=False))
        if self.client.check_file_was_converted(sha256):
            self.client.retrieve_document(sha256, output_files[0])
            break
    self.assertTrue(os.path.exists(output_files[0]))
    file_size = os.stat(output_files[0]).st_size
    self.assertGreater(file_size, 5000)
    stats = self.server.get_stats()
    self.assertEqual(1, stats['finished_ocr_tasks'])
import sys

from common.primitives import build_dislosures_sha256

if __name__ == '__main__':
    for file_path in sys.argv[1:]:
        print("{} -> {}".format(file_path, build_dislosures_sha256(file_path)))
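# build_dislosures_sha256 comes from common.primitives and its body is not shown
# here; a plausible equivalent (an assumption, not the project's code) hashes the
# file in fixed-size chunks so large files need not fit into memory:
import hashlib

def _sha256_of_file(path, chunk_size=1024 * 1024):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Example invocation of the script above (the script name is hypothetical):
#     python print_sha256.py file1.pdf file2.docx
# prints one "<path> -> <sha256>" line per argument.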
def retrieve_json_by_source_file(self, file_path):
    return self.retrieve_json_by_sha256(build_dislosures_sha256(file_path))
def test_import_second_passport(self):
    self.assertGreater(models.Office.objects.count(), 0)
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()
    permalinks_folder = os.path.dirname(__file__)
    TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_empty_dbs()
    domains_folder = os.path.join(os.path.dirname(__file__), "domains_1")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, domains_folder):
        importer.handle(None, dlrobot_human="dlrobot_human_1.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(models.Section.objects.count(), 1)
    self.assertEqual(models.RealEstate.objects.count(), 6)
    self.assertEqual(models.Vehicle.objects.count(), 1)
    section_id1 = list(models.Section.objects.all())[0].id

    # run the import one more time, but now the smart_parser json for the same person
    # (same document) contains no vehicles, as though smart_parser has become more intelligent
    TPermalinksManager(setup_logging(), {'directory': permalinks_folder}).create_permalinks()

    # clear the db
    models.Vehicle.objects.all().delete()
    models.RealEstate.objects.all().delete()
    models.Income.objects.all().delete()
    models.Section.objects.all().delete()
    models.Source_Document.objects.all().delete()

    domains_folder = os.path.join(os.path.dirname(__file__), "domains_1")
    sp_workdir = os.path.join(os.path.dirname(__file__), "smart_parser_server")
    importer = ImportJsonCommand(None, None)
    os.chdir(os.path.dirname(__file__))
    with SmartParserServerForTesting(sp_workdir, domains_folder) as sp_wrapper:
        sha256 = build_dislosures_sha256(os.path.join(os.path.dirname(__file__), "domains_1/test1.ru/fsin.docx"))
        sp_json = json.loads(sp_wrapper.server.get_smart_parser_json(sha256))
        assert len(sp_json['persons'][0]['vehicles']) == 1
        sp_json['persons'][0]['vehicles'] = list()
        sp_wrapper.server.register_built_smart_parser_json(sha256, json.dumps(sp_json).encode('utf8'))
        importer.handle(None, dlrobot_human="dlrobot_human_1.json",
                        smart_parser_human_json="human_jsons",
                        permalinks_folder=permalinks_folder)
    self.assertEqual(models.Section.objects.count(), 1)
    self.assertEqual(models.RealEstate.objects.count(), 6)
    self.assertEqual(models.Vehicle.objects.count(), 0)

    # the section must keep its permalink id across reimports
    section_id2 = list(models.Section.objects.all())[0].id
    self.assertEqual(section_id1, section_id2)