예제 #1
0
    def write_and_read(self, file_name):
        """Process ``file_name`` twice and check that both outputs match.

        The output of the first ``process_with_client`` call is measured
        (size and disclosures sha256) and then deleted; the second call must
        produce a file with the same size and the same hash.

        Returns:
            A ``(file_size, hash_code)`` tuple describing the output file.
        """
        # first pass: per the original note, this writes a new file to storage
        first_output = self.process_with_client([file_name])[0]
        self.assertTrue(os.path.exists(first_output))
        file_size = os.stat(first_output).st_size
        hash_code = build_dislosures_sha256(first_output)

        # second pass: remove the local result and request the same input again
        os.unlink(first_output)
        second_output = self.process_with_client([file_name])[0]
        self.assertTrue(os.path.exists(second_output))
        second_size = os.stat(second_output).st_size
        self.assertEqual(second_size, file_size)
        second_hash = build_dislosures_sha256(second_output)
        self.assertEqual(hash_code, second_hash)
        return file_size, hash_code
예제 #2
0
def get_text_of_a_document(source_file,
                           keep_txt=False,
                           reuse_txt=False,
                           output_folder=None):
    """Convert a document to plain text with the external converters.

    The converter is chosen by the lower-cased file extension
    (.xlsx, .xls, .docx, .pdf, .html, .htm, .rtf, .doc).  The intermediate
    text is written to ``<source_file>.txt`` (or to the same base name inside
    ``output_folder``) and then read back with ``read_input_text``.

    Args:
        source_file: path to the document to convert.
        keep_txt: when false, the intermediate .txt file is deleted after
            it has been read.
        reuse_txt: when true and the .txt file already exists, no converter
            is run and the existing file is read.
        output_folder: folder for the intermediate .txt file; ``None`` puts
            it next to ``source_file``.

    Returns:
        The value returned by ``read_input_text`` for the .txt file, or
        ``None`` if the extension is not supported or no .txt file exists
        after conversion.
    """
    # The module-level converter registry is only read, so no ``global``
    # statement is needed; it is looked up only in branches that use it.
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder,
                                os.path.basename(source_file) + ".txt")

    if reuse_txt and os.path.exists(txt_file):
        pass
    elif file_extension == ".xlsx":
        EXTERNAl_CONVERTORS.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = EXTERNAl_CONVERTORS.run_xls2csv(source_file, txt_file)
        if res != 0:
            # fallback: retry the same bytes through the xlsx converter
            temp_fname = source_file + ".xlsx"
            try:
                shutil.copy(source_file, temp_fname)
                EXTERNAl_CONVERTORS.run_xlsx2csv(temp_fname, txt_file)
            finally:
                # do not leave the temporary copy behind if conversion raises
                if os.path.exists(temp_fname):
                    os.unlink(temp_fname)
    elif file_extension == ".docx":
        EXTERNAl_CONVERTORS.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        conversion_client = TDocConversionClient(
            TDocConversionClient.parse_args([]))
        try:
            if conversion_client.retrieve_document(
                    sha256, temp_file) and os.path.exists(temp_file):
                EXTERNAl_CONVERTORS.run_office2txt(temp_file, txt_file)
            else:
                # the worst case, let's use calibre
                EXTERNAl_CONVERTORS.run_calibre(source_file, txt_file)
        finally:
            # the retrieved docx is temporary whatever the outcome
            if os.path.exists(temp_file):
                os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        EXTERNAl_CONVERTORS.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = EXTERNAl_CONVERTORS.run_catdoc(source_file, txt_file)
        if res != 0:
            # fallback: retry the same bytes through the docx converter
            temp_fname = source_file + ".docx"
            try:
                shutil.copy(source_file, temp_fname)
                EXTERNAl_CONVERTORS.run_office2txt(temp_fname, txt_file)
            finally:
                # do not leave the temporary copy behind if conversion raises
                if os.path.exists(temp_fname):
                    os.unlink(temp_fname)
    else:
        return None

    if not os.path.exists(txt_file):
        return None
    doc_text = read_input_text(txt_file)
    if not keep_txt:
        os.unlink(txt_file)
    return doc_text
예제 #3
0
    def test_many_bin_files(self):
        """Send two small binary files and fetch each back by its sha256."""
        samples = (
            ("test1.txt", b"12345_1"),
            ("test2.txt", b"12345_2"),
        )
        # create both files before anything is sent
        for path, payload in samples:
            with open(path, "wb") as outp:
                outp.write(payload)

        for path, _ in samples:
            self.assertTrue(self.env.client.send_file(path))
        stats = self.env.client.get_stats()
        self.assertEqual(stats['bin_files_count'], 2)

        # every file must come back with exactly the bytes that were written
        for path, payload in samples:
            sha256 = build_dislosures_sha256(path)
            fetched, _ = self.env.client.retrieve_file_data_by_sha256(sha256)
            self.assertEqual(payload, fetched)
예제 #4
0
 def test_send_file_and_retrieve(self):
     """Send one file, then fetch it by sha256 and check bytes and extension."""
     source_path = "test.txt"
     payload = b"12345"
     with open(source_path, "wb") as outp:
         outp.write(payload)

     sent = self.env.client.send_file(source_path)
     self.assertTrue(sent)
     self.assertEqual(self.env.client.get_stats()['source_doc_count'], 1)

     sha256 = build_dislosures_sha256(source_path)
     fetched = self.env.client.retrieve_file_data_by_sha256(sha256)
     fetched_data, fetched_extension = fetched
     self.assertEqual(fetched_data, payload)
     self.assertEqual(fetched_extension, ".txt")
예제 #5
0
    def test_reload(self):
        """Check that a sent file is still retrievable after the server's
        file storage is closed and loaded from disk again."""
        file_path = "test8484.txt"
        payload = b"12345_1"
        with open(file_path, "wb") as outp:
            outp.write(payload)

        self.assertTrue(self.env.client.send_file(file_path))

        # stats are requested before the reload; the result is not inspected
        self.env.client.get_stats()
        storage = self.env.server.file_storage
        storage.close_file_storage()
        self.env.server.file_storage.load_from_disk()

        sha256 = build_dislosures_sha256(file_path)
        fetched, _ = self.env.client.retrieve_file_data_by_sha256(sha256)
        self.assertEqual(payload, fetched)
예제 #6
0
    def export_files(self):
        """Export file records that are not yet referenced in the human files db.

        The db is created from scratch when ``args.start_from_empty`` is set,
        otherwise opened in write mode.  Every record from
        ``get_all_file_sql_records`` whose document_file_id is not already
        referenced is downloaded and sent, and each resulting local file is
        stored in the db under its sha256 with a new declarator reference.
        The loop stops once ``args.max_files_count`` files have been added
        (when that limit is set).  Finally the db is closed and
        ``send_new_pdfs_to_smart_parser`` is called.
        """
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if not self.args.start_from_empty:
            human_files_db.open_write_mode()
        else:
            human_files_db.create_db()

        # document_file_ids that are already referenced by some stored document
        known_file_ids = set()
        for _, stored_doc in human_files_db.get_all_documents():
            known_file_ids.update(
                stored_ref.document_file_id
                for stored_ref in stored_doc.decl_references
                if stored_ref.document_file_id is not None)

        files_count = 0
        for record in self.get_all_file_sql_records():
            document_file_id, document_id, file_path, link, office_id, income_year = record
            if document_file_id in known_file_ids:
                continue

            # do not flood the pdf conversion server: poll it every 5 minutes
            while self.pdf_conversion_client.server_is_too_busy():
                queue_length = self.pdf_conversion_client.last_pdf_conversion_queue_length
                self.logger.error(
                    "wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                        queue_length))
                time.sleep(5*60)

            # site name without the leading "www."
            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            limit = self.args.max_files_count
            if limit is not None and files_count >= limit:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            sent_files = self.download_unzip_and_send_file_source_doc_server(
                file_path, document_file_id)
            for local_file_path, declarator_url in sent_files:
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                extension = os.path.splitext(local_file_path)[1]
                source_document = TSourceDocument(extension)
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1

        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()
예제 #7
0
 def receive_files(self, sent_files):
     """Download the converted .docx for every path in ``sent_files``.

     The target is ``<filepath>.docx``; when ``args.output_folder`` is set the
     same base name is placed inside that folder instead.

     Returns:
         True when ``retrieve_document`` succeeded for every file (also for
         an empty ``sent_files``), False otherwise.
     """
     failures = 0
     for filepath in sent_files:
         self.logger.debug("download docx for {}".format(filepath))
         sha256hash = build_dislosures_sha256(filepath)
         target = filepath + ".docx"
         output_folder = self.args.output_folder
         if output_folder is not None:
             target = os.path.join(output_folder, os.path.basename(target))

         if not self.retrieve_document(sha256hash, target):
             self.logger.error(
                 "cannot download docx for file {}".format(filepath))
             failures += 1
             continue
         if os.path.exists(target):
             self.logger.debug("save {}".format(target))
     return failures == 0
예제 #8
0
    def _send_file_to_conversion_db(self, filename, file_extension, rebuild):
        """Register a conversion task for ``filename`` unless it can be skipped.

        Nothing is registered when the file does not start with ``%PDF-``,
        when its hash is already in ``_sent_tasks``, or when ``rebuild`` is
        false and ``check_file_was_converted`` reports the hash as converted.
        After a successful ``_register_task`` the file size is added to
        ``all_pdf_size_sent_to_conversion``.
        """
        with open(filename, "rb") as f:
            file_contents = f.read()

        # only files carrying the pdf magic prefix are sent
        magic = file_contents[:5].decode('latin', errors="ignore")
        if magic != '%PDF-':
            self.logger.debug(
                "{} has bad pdf starter, do  not send it".format(filename))
            return

        hashcode = build_dislosures_sha256(filename)
        if hashcode in self._sent_tasks:
            return
        if not rebuild and self.check_file_was_converted(hashcode):
            return

        self.logger.debug("register conversion task for {}".format(filename))
        registered = self._register_task(file_extension, file_contents,
                                         hashcode, rebuild)
        if registered:
            sent_size = Path(filename).stat().st_size
            self.all_pdf_size_sent_to_conversion += sent_size
예제 #9
0
    def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
        """Download a declarator file, send every extracted file and yield it.

        Each file produced by ``download_file_and_unzip`` is sent to the
        source doc client.  Files ending in ``.pdf`` additionally go to the
        pdf conversion client and their sha256 is added to ``new_pdfs``; all
        other files go to the smart parser server client.

        Yields:
            ``(file_name, declarator_url)`` for every processed file.

        Once the generator is exhausted it waits for all conversion tasks to
        be sent and removes every entry of ``args.tmp_folder``.
        """
        declarator_filename = os.path.basename(declarator_url_path)
        ext = os.path.splitext(declarator_filename)[1].lower()
        temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
        quoted_path = urllib.parse.quote(declarator_url_path)
        # os.path.join may insert backslashes; the url needs forward slashes
        declarator_url = os.path.join(DECLARATOR_DOMAIN, "media", quoted_path).replace('\\', '/')

        for file_name in self.download_file_and_unzip(declarator_url, temp_file):
            self.source_doc_client.send_file(file_name)
            if not file_name.lower().endswith('.pdf'):
                self.smart_parser_server_client.send_file(file_name)
            else:
                extension = os.path.splitext(file_name)[1]
                self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
                self.new_pdfs.add(build_dislosures_sha256(file_name))
            yield file_name, declarator_url

        self.pdf_conversion_client.wait_all_tasks_to_be_sent()
        # clean up the temporary folder
        for leftover in os.listdir(self.args.tmp_folder):
            os.unlink(os.path.join(self.args.tmp_folder, leftover))
예제 #10
0
    def test_winword_hangs(self):
        """Send a pdf without receiving the result, poll the server for up to
        120 seconds and check that the converted file finally arrives."""
        file_path = "../files/winword2019_hangs.pdf"
        output_files = self.process_with_client(
            [file_path], timeout=240, skip_receiving=True)
        sha256 = build_dislosures_sha256(file_path)

        for _ in range(120):
            time.sleep(1)
            # server must answer and accept requests while winword is working (hanging) in background
            alive = self.client.assert_declarator_conv_alive(
                raise_exception=False)
            self.assertTrue(alive)
            if not self.client.check_file_was_converted(sha256):
                continue
            self.client.retrieve_document(sha256, output_files[0])
            break

        self.assertTrue(os.path.exists(output_files[0]))
        converted_size = os.stat(output_files[0]).st_size
        self.assertGreater(converted_size, 5000)
        server_stats = self.server.get_stats()
        self.assertEqual(1, server_stats['finished_ocr_tasks'])
from common.primitives import build_dislosures_sha256
import sys

if __name__ == '__main__':
    # Print "<path> -> <sha256>" for every path given on the command line.
    for path in sys.argv[1:]:
        digest = build_dislosures_sha256(path)
        print("{} -> {}".format(path, digest))
예제 #12
0
 def retrieve_json_by_source_file(self, file_path):
     """Return the json stored for ``file_path``, looked up by the file's
     disclosures sha256 via ``retrieve_json_by_sha256``."""
     sha256 = build_dislosures_sha256(file_path)
     return self.retrieve_json_by_sha256(sha256)
    def test_import_second_passport(self):
        """Import the same document twice and check the section id is stable.

        The first import is expected to give one section, six real estates
        and one vehicle.  Permalinks are then created, the db is cleared, the
        stored smart parser json is rewritten with an empty vehicle list and
        the import is repeated; the single resulting section must keep the
        id it had after the first import.
        """
        self.assertGreater(models.Office.objects.count(), 0)
        models.Section.objects.all().delete()
        models.Source_Document.objects.all().delete()

        test_folder = os.path.dirname(__file__)
        permalinks_folder = test_folder
        logger = setup_logging()
        TPermalinksManager(logger, {'directory': permalinks_folder}).create_empty_dbs()

        domains_folder = os.path.join(test_folder, "domains_1")
        sp_workdir = os.path.join(test_folder, "smart_parser_server")

        # the same keyword arguments are used for both import runs
        import_options = {
            'dlrobot_human': "dlrobot_human_1.json",
            'smart_parser_human_json': "human_jsons",
            'permalinks_folder': permalinks_folder,
        }

        importer = ImportJsonCommand(None, None)
        os.chdir(test_folder)

        with SmartParserServerForTesting(sp_workdir, domains_folder):
            importer.handle(None, **import_options)

        self.assertEqual(models.Section.objects.count(), 1)
        self.assertEqual(models.RealEstate.objects.count(), 6)
        self.assertEqual(models.Vehicle.objects.count(), 1)
        section_id1 = list(models.Section.objects.all())[0].id

        # one more time, but now the stored smart_parser json for the same
        # person (same document) is changed before the import, as though
        # smart_parser is more intelligent
        logger = setup_logging()
        TPermalinksManager(logger, {'directory': permalinks_folder}).create_permalinks()

        # clear the db
        for model in (models.Vehicle, models.RealEstate, models.Income,
                      models.Section, models.Source_Document):
            model.objects.all().delete()

        domains_folder = os.path.join(test_folder, "domains_1")
        sp_workdir = os.path.join(test_folder, "smart_parser_server")

        importer = ImportJsonCommand(None, None)
        os.chdir(test_folder)

        with SmartParserServerForTesting(sp_workdir,
                                         domains_folder) as sp_wrapper:
            docx_path = os.path.join(test_folder,
                                     "domains_1/test1.ru/fsin.docx")
            sha256 = build_dislosures_sha256(docx_path)
            raw_json = sp_wrapper.server.get_smart_parser_json(sha256)
            sp_json = json.loads(raw_json)
            assert len(sp_json['persons'][0]['vehicles']) == 1
            # drop the only vehicle and store the modified json back
            sp_json['persons'][0]['vehicles'] = []
            patched_json = json.dumps(sp_json).encode('utf8')
            sp_wrapper.server.register_built_smart_parser_json(
                sha256, patched_json)
            importer.handle(None, **import_options)

        self.assertEqual(models.Section.objects.count(), 1)
        self.assertEqual(models.RealEstate.objects.count(), 6)
        self.assertEqual(models.Vehicle.objects.count(), 0)
        section_id2 = list(models.Section.objects.all())[0].id

        self.assertEqual(section_id1, section_id2)