Example no. 1
def get_text_of_a_document(source_file,
                           keep_txt=False,
                           reuse_txt=False,
                           output_folder=None):
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder,
                                os.path.basename(source_file) + ".txt")

    if reuse_txt and os.path.exists(txt_file):
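        # reuse the .txt produced by a previous run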
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
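            # xls2csv failed: retry by copying to an .xlsx name and converting with xlsx2csv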
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        if TDocConversionClient(
                TDocConversionClient.parse_args([])).retrieve_document(
                    sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
            # the worst case: fall back to calibre
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
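            # catdoc failed: retry the office converter on a copy renamed to .docx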
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        return None
    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        return None
Example no. 2
    def __init__(self, args):
        self.logger = setup_logging(log_file_name="export_human_files.log")
        self.args = args
        if self.args.tmp_folder is None:
            self.args.tmp_folder = tempfile.mkdtemp("export_human")
            self.logger.debug("create folder {}".format(self.args.tmp_folder))
        else:
            self.logger.debug("rm folder {}".format(self.args.tmp_folder))
            shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
            os.mkdir(self.args.tmp_folder)
        self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
        self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
        self.new_pdfs = set()
Example no. 3
    def __init__(self, args):
        self.register_task_result_error_count = 0
        self.logger = setup_logging(log_file_name=args.log_file_name, append_mode=True)
        self.conversion_client = TDocConversionClient(
            TDocConversionClient.parse_args([]), self.logger)
        self.args = args
        rounds = TDeclarationRounds(args.round_file)
        self.dlrobot_remote_calls = TRemoteDlrobotCallList(
            logger=self.logger,
            file_name=args.remote_calls_file,
            min_start_time_stamp=rounds.start_time_stamp)
        self.worker_2_running_tasks = defaultdict(list)
        self.worker_2_continuous_failures_count = defaultdict(int)
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.offices_file)
        self.web_sites_db = TDeclarationWebSiteList(self.logger, offices=offices)
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        self.web_sites_to_process = self.find_projects_to_process()
        self.cloud_id_to_worker_ip = dict()
        self.config = TRobotConfig.read_by_config_type(self.args.dlrobot_config_type)
        self.last_remote_call = None  # for testing
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.service_action_count = 0
        self.decl_sender = TDeclarationSender(
            self.logger, self.args.enable_smart_parser, self.args.enable_source_doc_server)
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(
                str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.logger.debug("init complete")
        self.send_to_telegram("start dlrobot central with {} tasks".format(
            len(self.web_sites_to_process)))
Example no. 4
    def process_all_tasks(self):
        if len(self.ocr_tasks) == 0:
            self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.try_convert_with_winword()
        new_files_from_ocr = self.process_docx_from_ocr()
        if new_files_from_ocr:
            self.got_ocred_file_last_time_stamp = time.time()
        # file garbage tasks
        current_time = time.time()
        if current_time - self.file_garbage_collection_timestamp >= 60:  # just not too often
            self.file_garbage_collection_timestamp = current_time
            if not self.http_server_is_working:
                return
            self.process_ocr_logs()
            if not self.http_server_is_working:
                return
            self.process_stalled_files()

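        # about once an hour, ping our own server through a regular conversion
        # client; if the ping fails, stop the HTTP server and exit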
        if current_time - self.self_server_ping_timestamp >= 3600:  # just not too often
            args = TDocConversionClient.parse_args(
                ["--server-address", self.args.server_address])
            client = TDocConversionClient(args, self.logger)
            if not client.assert_declarator_conv_alive(raise_exception=False):
                self.logger.error("cannot ping itself, exit")
                self.stop_http_server(run_shutdown=False)
                sys.exit(1)
            self.self_server_ping_timestamp = current_time

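        # restart OCR when no converted file has arrived for ocr_restart_time
        # seconds while the OCR queue stayed non-empty the whole time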
        current_time = time.time()
        if current_time - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time and \
                current_time - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time:
            self.logger.debug(
                "last ocr file was received long ago and all this time the ocr queue was not empty"
            )
            if not self.http_server_is_working:
                return
            self.restart_ocr()
            self.got_ocred_file_last_time_stamp = time.time()  # otherwise restart will be too often
Example no. 5
    def process_with_client(self,
                            input_files,
                            timeout=None,
                            rebuild=False,
                            skip_receiving=False,
                            log_name="client",
                            input_task_timeout=5):
        output_files = list(os.path.basename(i) + ".docx" for i in input_files)
        for o in output_files:
            if os.path.exists(o):
                os.unlink(o)
        client_args = [
            "--server-address",
            self.server_address,
            "--conversion-timeout",
            "180",
            "--output-folder",
            ".",
        ] + input_files
        if timeout is not None:
            client_args.extend(['--conversion-timeout', str(timeout)])
        if rebuild:
            client_args.append('--rebuild')
        if skip_receiving:
            client_args.append('--skip-receiving')
        if self.client_count >= 0 and log_name == "client":
            log_name = log_name + str(self.client_count)
        logger = setup_logging(logger_name=log_name)
        try:
            self.client_count += 1
            self.client = TDocConversionClient(
                TDocConversionClient.parse_args(client_args), logger=logger)
            self.client.input_task_timeout = input_task_timeout
            self.client.start_conversion_thread()
            self.client.process_files()
            return output_files
        finally:
            close_logger(logger)
Example no. 6
import json
from ConvStorage.conversion_client import TDocConversionClient
import argparse
import logging
import os
import time


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    conv_client = TDocConversionClient(TDocConversionClient.parse_args([]), logging)
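    # ask the conversion server for its current statistics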
    stats = conv_client.get_stats()
    if args.history_file is None:
        print(json.dumps(stats))
    else:
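        # append a timestamped stats record and keep only the last 400 history lines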
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
Example no. 7
from ConvStorage.conversion_client import TDocConversionClient
from common.logging_wrapper import setup_logging

import sys

if __name__ == '__main__':
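    # send the documents given on the command line to the conversion server and
    # exit with the return code of process_files()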
    logger = setup_logging(log_file_name="convert_pdf.log")
    client = TDocConversionClient(
        TDocConversionClient.parse_args(sys.argv[1:]), logger)
    client.start_conversion_thread()
    exit_code = client.process_files()
    sys.exit(exit_code)