Example #1
def main(args, logger):
    if args.conversion_server is not None:
        TDocConversionClient.DECLARATOR_CONV_URL = args.conversion_server
    conv_tasks = TDocConversionClient(logger)
    conv_tasks.start_conversion_thread()

    sent_files = []  # initialized before the try block so the receive step below cannot hit an unbound name if send_files throws
    try:
        sent_files = send_files(args, logger, conv_tasks)
        if args.receive_files and len(sent_files) > 0:
            conv_tasks.wait_doc_conversion_finished(args.conversion_timeout)
        else:
            logger.debug("nothing to receive, stopping the conversion thread")
            conv_tasks.stop_conversion_thread()
    except Exception as exp:
        logger.error("exception: {}, stop_conversion_thread".format(exp))
        conv_tasks.stop_conversion_thread()
    if args.receive_files:
        if not receive_files(logger, conv_tasks, sent_files):
            return 1
    return 0
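
A minimal sketch of how this entry point might be wired up. The flag names (--conversion-server, --receive-files, --conversion-timeout) are assumptions inferred from the attributes that main() reads, not confirmed options of the real CLI, and send_files/receive_files are project helpers not shown here.

import sys
import argparse
import logging

def parse_args():
    # hypothetical flags mirroring the attributes that main() reads
    parser = argparse.ArgumentParser()
    parser.add_argument("--conversion-server", dest="conversion_server", default=None)
    parser.add_argument("--receive-files", dest="receive_files", action="store_true", default=False)
    parser.add_argument("--conversion-timeout", dest="conversion_timeout", type=int, default=60 * 60)
    return parser.parse_args()

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    sys.exit(main(parse_args(), logging.getLogger("conversion_client")))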
Example #2
class TExportHumanFiles:

    @staticmethod
    def parse_args(arg_list):
        parser = argparse.ArgumentParser()
        parser.add_argument("--table", dest='table', default="declarations_documentfile")
        parser.add_argument("--document-file-id", dest='document_file_id', required=False)
        parser.add_argument("--tmp-folder", dest='tmp_folder', default=None)
        parser.add_argument("--dlrobot-human-json", dest='dlrobot_human_json', default="human_files.json")
        parser.add_argument("--start-from-an-empty-file", dest='start_from_empty', action="store_true", default=False)
        parser.add_argument("--max-files-count", dest='max_files_count', type=int)
        parser.add_argument("--mysql-port", dest='mysql_port', type=int, default=None)
        parser.add_argument("--pdf-conversion-timeout", dest='pdf_conversion_timeout',
                                default=1*60*60,
                                type=int,
                                help="pdf conversion timeout")
        parser.add_argument("--pdf-conversion-queue-limit", dest='pdf_conversion_queue_limit', type=int,
                            default=100 * 2 ** 20, help="max sum size of al pdf files that are in pdf conversion queue",
                            required=False)

        return parser.parse_args(arg_list)

    def __init__(self, args):
        self.logger = setup_logging(log_file_name="export_human_files.log")
        self.args = args
        if self.args.tmp_folder is None:
            self.args.tmp_folder = tempfile.mkdtemp("export_human")
            self.logger.debug("create folder {}".format(self.args.tmp_folder))
        else:
            self.logger.debug("rm folder {}".format(self.args.tmp_folder))
            shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
            os.mkdir(self.args.tmp_folder)
        self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
        self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
        self.new_pdfs = set()

    def __enter__(self):
        self.pdf_conversion_client.start_conversion_thread()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.pdf_conversion_client.stop_conversion_thread()
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)

    def unarchive(self, input_file):
        base_name, file_extension = os.path.splitext(os.path.basename(input_file))
        output_folder = os.path.dirname(input_file)
        dearchiver = TDearchiver(self.logger, output_folder)
        for _, _, filename in dearchiver.dearchive_one_archive(file_extension, input_file, base_name):
            yield filename

    def download_file_and_unzip(self, file_url, filename):
        file_without_extension, extension = os.path.splitext(filename)
        if not os.path.isfile(filename):
            self.logger.debug("download {0}  to {1}".format(file_url, filename))
            result = requests.get(file_url)
            with open(filename, 'wb') as fd:
                fd.write(result.content)
            if extension == '.zip':
                try:
                    for archive_filename in self.unarchive(filename):
                        yield archive_filename
                except Exception as e:
                    self.logger.error("cannot unzip  {}, exception={}".format(filename, e))
            else:
                yield filename
        else:
            if extension == '.zip':
                for archive_filename in glob.glob("{}_*".format(file_without_extension)):
                    yield archive_filename
            else:
                yield filename

    def get_all_file_sql_records(self):
        if self.args.mysql_port is None:
            db = pymysql.connect(db="declarator", user="******", password="******", unix_socket="/var/run/mysqld/mysqld.sock" )
        else:
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 port=self.args.mysql_port)
        cursor = db.cursor()
        if self.args.document_file_id is not None:
            where_clause = "where f.id = {}\n".format(self.args.document_file_id)
        else:
            where_clause = ""
        query = ("""
                    select f.id, d.id, f.file, f.link, d.office_id, d.income_year 
                    from {} f
                    join declarations_document d on f.document_id=d.id
                    {} 
                 """.format(self.args.table, where_clause))
        self.logger.debug(query.replace("\n", " "))
        cursor.execute(query)
        for (document_file_id, document_id, filename, link, office_id, income_year) in cursor:
            if filename is not None and len(filename) > 0:
                yield document_file_id, document_id, filename, link, office_id, income_year

        cursor.close()
        db.close()

    def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
        # download the declarator file (unzipping archives), push each resulting file to the
        # source-doc server, and route it to either the pdf conversion queue or the smart-parser cache
        path, declarator_filename = os.path.split(declarator_url_path)
        _, ext = os.path.splitext(declarator_filename)
        ext = ext.lower()
        temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
        declarator_url = os.path.join(DECLARATOR_DOMAIN, "media", urllib.parse.quote(declarator_url_path))
        declarator_url = declarator_url.replace('\\', '/')

        for file_name in self.download_file_and_unzip(declarator_url, temp_file):
            self.source_doc_client.send_file(file_name)
            if file_name.lower().endswith('.pdf'):
                _, extension = os.path.splitext(file_name)
                self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
                self.new_pdfs.add(build_dislosures_sha256(file_name))
            else:
                self.smart_parser_server_client.send_file(file_name)
            yield file_name, declarator_url

        self.pdf_conversion_client.wait_all_tasks_to_be_sent()
        # the temp folder is reused for every document, so clear it after each batch
        for f in os.listdir(self.args.tmp_folder):
            os.unlink(os.path.join(self.args.tmp_folder, f))

    def fix_list(self, sha256, office_id):
        fixed_office_id = FIX_LIST.get(sha256)
        if fixed_office_id is not None:
            return fixed_office_id
        else:
            return office_id

    def export_files(self):
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)

        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue

            # back off while the conversion server's queue is over the configured limit
            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("pdf conversion server is too busy, waiting 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5 * 60)

            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(
                    file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()

    def send_new_pdfs_to_smart_parser(self):
        self.logger.debug("wait pdf conversion for {} seconds".format(self.args.pdf_conversion_timeout))
        self.pdf_conversion_client.wait_doc_conversion_finished(self.args.pdf_conversion_timeout)

        missed_pdf_count = 0
        received_pdf_count = 0
        for sha256 in self.new_pdfs:
            self.logger.debug("try to converted file for {}".format(sha256))
            handle, temp_filename = tempfile.mkstemp(suffix=".docx")
            os.close(handle)
            if self.pdf_conversion_client.retrieve_document(sha256, temp_filename):
                received_pdf_count += 1
                self.logger.debug("send the converted file to smart parser")
                self.smart_parser_server_client.send_file(temp_filename)
            else:
                self.logger.error("converted file is not received")
                missed_pdf_count += 1
            os.unlink(temp_filename)
        if missed_pdf_count > 0:
            self.logger.error('received_pdf_count = {}, missed_pdf_count={}'.format(received_pdf_count, missed_pdf_count))
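
Since TExportHumanFiles implements __enter__/__exit__, a natural way to drive it is as a context manager. This is a minimal sketch, assuming the surrounding services (source-doc server, pdf conversion server, smart-parser cache) are already running; it uses only the methods shown above.

import sys

if __name__ == "__main__":
    args = TExportHumanFiles.parse_args(sys.argv[1:])
    # the context manager starts the pdf conversion thread and cleans up the temp folder on exit
    with TExportHumanFiles(args) as exporter:
        exporter.export_files()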
Example #3
class TTestConvBase(TestCase):
    def __init__(self, methodName='runTest'):
        super().__init__(methodName)
        self.port = 8081
        self.name = None
        self.data_folder = None
        self.server_address = "localhost:{}".format(self.port)
        self.server = None
        self.server_thread = None
        self.server_process = None
        self.client = None
        self.converters = TExternalConverters(enable_smart_parser=False,
                                              enable_calibre=False,
                                              enable_cat_doc=False,
                                              enable_xls2csv=False,
                                              enable_office_2_txt=False)

        self.pdf_ocr_folder = os.path.join(os.path.dirname(__file__),
                                           "pdf.ocr")
        self.pdf_ocr_out_folder = os.path.join(os.path.dirname(__file__),
                                               "pdf.ocr.out")
        if not os.path.exists(self.pdf_ocr_folder) or not os.path.exists(
                self.pdf_ocr_out_folder):
            raise Exception(
                "run python update_finereader_task.py and upload test.hft to the finereader hot folder"
            )
        self.project_file = "converted_file_storage.json"
        self.server_args = None
        self.client_count = 0

    def start_server_thread(self):
        self.server = TConvertProcessor(
            TConvertProcessor.parse_args(self.server_args))
        self.server_thread = threading.Thread(target=start_server,
                                              args=(self.server, ))
        self.server_thread.start()

    def setup_server(self,
                     name,
                     addit_server_args=list(),
                     start_process=False):
        self.name = name
        self.data_folder = os.path.join(os.path.dirname(__file__),
                                        "data.{}".format(name))

        recreate_folder(self.data_folder)

        os.chdir(self.data_folder)
        input_files = "input_files"
        recreate_folder(input_files)

        db_converted_files = os.path.join(self.data_folder,
                                          "db_converted_files")
        recreate_folder(db_converted_files)

        db_input_files = os.path.join(self.data_folder, "db_input_files")
        recreate_folder(db_input_files)

        log_file = "db_conv.log"
        if os.path.exists(log_file):
            os.unlink(log_file)

        clear_folder_with_retry(self.pdf_ocr_folder)
        clear_folder_with_retry(self.pdf_ocr_out_folder)
        TConvertStorage.create_empty_db(db_input_files, db_converted_files,
                                        self.project_file)

        self.server_args = [
            "--server-address", self.server_address, '--logfile', log_file,
            '--db-json', self.project_file, '--disable-killing-winword',
            '--ocr-input-folder', self.pdf_ocr_folder, '--ocr-output-folder',
            self.pdf_ocr_out_folder, '--disable-telegram'
        ] + addit_server_args

        if start_process:
            server_script = os.path.join(os.path.dirname(__file__), "..",
                                         "conv_storage_server.py")
            args = ["python", server_script] + self.server_args
            self.server_process = subprocess.Popen(args,
                                                   stderr=subprocess.DEVNULL,
                                                   stdout=subprocess.DEVNULL)
        else:
            self.start_server_thread()

    def restart_server(self):
        self.server.stop_http_server()
        self.server_thread.join(0)
        self.start_server_thread()

    def process_with_client(self,
                            input_files,
                            timeout=None,
                            rebuild=False,
                            skip_receiving=False,
                            log_name="client",
                            input_task_timeout=5):
        output_files = list(os.path.basename(i) + ".docx" for i in input_files)
        for o in output_files:
            if os.path.exists(o):
                os.unlink(o)
        client_args = [
            "--server-address",
            self.server_address,
            "--conversion-timeout",
            "180",
            "--output-folder",
            ".",
        ] + input_files
        if timeout is not None:
            # argparse keeps the last occurrence, so this overrides the default "180" above
            client_args.extend(['--conversion-timeout', str(timeout)])
        if rebuild:
            client_args.append('--rebuild')
        if skip_receiving:
            client_args.append('--skip-receiving')
        if self.client_count >= 0 and log_name == "client":
            log_name = log_name + str(self.client_count)
        logger = setup_logging(logger_name=log_name)
        try:
            self.client_count += 1
            self.client = TDocConversionClient(
                TDocConversionClient.parse_args(client_args), logger=logger)
            self.client.input_task_timeout = input_task_timeout
            self.client.start_conversion_thread()
            self.client.process_files()
            return output_files
        finally:
            close_logger(logger)

    def list2reason(self, exc_list):
        if exc_list and exc_list[-1][0] is self:
            return exc_list[-1][1]

    def tear_down(self):
        result = self.defaultTestResult()
        self._feedErrorsToResult(result, self._outcome.errors)
        error = self.list2reason(result.errors)
        failure = self.list2reason(result.failures)
        delete_temp_files = not error and not failure

        if self.client is not None:
            self.client.stop_conversion_thread(1)
            self.client = None

        if self.server is not None:
            self.server.stop_http_server()
            self.server_thread.join(0)
            self.server = None
        else:
            self.server_process.kill()
            self.server_process = None

        time.sleep(5)

        os.chdir(os.path.dirname(__file__))
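
A concrete test would subclass TTestConvBase, call setup_server in setUp, and call tear_down in tearDown. The sketch below is hypothetical: the test name "ping_example" and the fixture "files/sample.pdf" are assumptions, not files from the real test suite, and it relies on the same module-level imports (os, TestCase) as the base class.

class TTestPingExample(TTestConvBase):
    def setUp(self):
        # creates data.ping_example, recreates the storage DBs, and starts the server thread
        self.setup_server("ping_example")

    def tearDown(self):
        self.tear_down()

    def test_convert_one_pdf(self):
        # "files/sample.pdf" is an assumed fixture path for illustration only
        input_file = os.path.join(os.path.dirname(__file__), "files", "sample.pdf")
        output_files = self.process_with_client([input_file], timeout=240)
        # process_with_client writes <basename>.docx into the current data folder
        self.assertTrue(os.path.exists(output_files[0]))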