Esempio n. 1
0
    def process_ocr_logs(self):
        """Process finished OCR log files found in the OCR output folder.

        Every "*.txt" file in args.ocr_output_folder is read as utf-16-le.
        Lines containing 'Error:' are searched for a "<64 hex chars>.pdf"
        name; the matching pdfs (looked up in args.ocr_input_folder) are
        treated as broken. A log is complete once a line contains
        'Pages processed'; incomplete or unreadable logs are skipped.
        A complete log is moved to args.ocr_logs_folder with a timestamp
        suffix, then every broken pdf that still exists is deleted (and,
        for normal input file names, registered as a failed ocr task with
        a broken stub saved in the storage).
        """
        # The dot is escaped and the pattern is a raw string: only a literal
        # ".pdf" suffix may follow the sha256, not an arbitrary character.
        broken_pdf_pattern = re.compile(r'[a-f0-9]{64}\.pdf')
        for log_file in os.listdir(self.args.ocr_output_folder):
            if not log_file.endswith(".txt"):
                continue
            broken_files = list()
            log_is_completed = False
            log_file_full_path = os.path.join(self.args.ocr_output_folder,
                                              log_file)
            try:
                with open(log_file_full_path,
                          "r",
                          encoding="utf-16-le",
                          errors="ignore") as inp:
                    for line in inp:
                        if 'Error:' in line:
                            m = broken_pdf_pattern.search(line)
                            if m is not None:
                                file_path = os.path.join(
                                    self.args.ocr_input_folder, m.group(0))
                                broken_files.append(file_path)
                        if 'Pages processed' in line:
                            log_is_completed = True
            except Exception as exp:
                self.logger.error("fail to read \"{}\", exception: {}".format(
                    log_file, exp))
                continue

            if not log_is_completed:
                self.logger.debug(
                    "skip incomplete log_file \"{}\"".format(log_file))
                continue
            self.logger.debug(
                "process log_file \"{}\" with {} broken files".format(
                    log_file, len(broken_files)))
            try:
                # archive the log under a unique (timestamped) name
                shutil.move(
                    log_file_full_path,
                    os.path.join(self.args.ocr_logs_folder,
                                 log_file + "." + str(time.time())))
            except Exception as exp:
                self.logger.error("exception: {}".format(exp))

            for filename in broken_files:
                if not os.path.exists(filename):
                    continue
                if not TConvertStorage.is_normal_input_file_name(filename):
                    self.convert_storage.delete_file_silently(filename)
                else:
                    sha256 = TConvertStorage.get_sha256_from_filename(
                        filename)
                    self.register_ocr_process_finish(
                        self.ocr_tasks.get(sha256), False)
                    self.convert_storage.save_converted_file_broken_stub(
                        sha256, True)
                    self.logger.debug(
                        "remove {}, since ocr cannot process it (\"{}\")".
                        format(filename, log_file))
                    self.convert_storage.delete_file_silently(filename)
    def setup_server(self,
                     name,
                     addit_server_args=None,
                     start_process=False):
        """Prepare a clean data folder for a test server and start the server.

        Recreates "data.<name>" next to this file, makes it the current
        directory, recreates the input and db folders inside it, removes a
        stale log file, clears the OCR folders, creates an empty database and
        builds self.server_args.

        :param name: suffix of the data folder; also stored in self.name
        :param addit_server_args: optional extra command line tokens appended
            to the server arguments (None means no extra arguments)
        :param start_process: if True run conv_storage_server.py as a child
            process, otherwise call self.start_server_thread()
        """
        # None instead of a list() default: a default list object is created
        # once and shared between all calls.
        extra_server_args = list(
            addit_server_args) if addit_server_args is not None else []

        self.name = name
        self.data_folder = os.path.join(os.path.dirname(__file__),
                                        "data.{}".format(name))

        recreate_folder(self.data_folder)

        # relative paths below are resolved against the data folder
        os.chdir(self.data_folder)
        input_files = "input_files"
        recreate_folder(input_files)

        db_converted_files = os.path.join(self.data_folder,
                                          "db_converted_files")
        recreate_folder(db_converted_files)

        db_input_files = os.path.join(self.data_folder, "db_input_files")
        recreate_folder(db_input_files)

        log_file = "db_conv.log"
        if os.path.exists(log_file):
            os.unlink(log_file)

        clear_folder_with_retry(self.pdf_ocr_folder)
        clear_folder_with_retry(self.pdf_ocr_out_folder)
        TConvertStorage.create_empty_db(db_input_files, db_converted_files,
                                        self.project_file)

        self.server_args = [
            "--server-address", self.server_address, '--logfile', log_file,
            '--db-json', self.project_file, '--disable-killing-winword',
            '--ocr-input-folder', self.pdf_ocr_folder, '--ocr-output-folder',
            self.pdf_ocr_out_folder, '--disable-telegram'
        ] + extra_server_args

        if start_process:
            server_script = os.path.join(os.path.dirname(__file__), "..",
                                         "conv_storage_server.py")
            args = ["python", server_script] + self.server_args
            self.server_process = subprocess.Popen(args,
                                                   stderr=subprocess.DEVNULL,
                                                   stdout=subprocess.DEVNULL)
        else:
            self.start_server_thread()
Esempio n. 3
0
    def __init__(self, args):
        """Initialise counters, storage and folders, then bind the http server.

        :param args: parsed command line options of the conversion server
        :raises Exception: on win32 when no running HotFolder.exe is found,
            or when qpdf or pdfcrack cannot be found on PATH
        """
        self.args = args
        self.logger = setup_logging(args.logfile)
        self.convert_storage = None

        self.server_actions_thread = None
        self.stop_input_thread = False
        self.input_task_queue = queue.Queue()
        self.ocr_tasks = dict()
        # statistics counters
        self.all_put_files_count = 0
        self.input_files_size = 0
        self.processed_files_size = 0
        self.failed_files_size = 0
        self.successful_get_requests = 0
        self.finished_ocr_tasks = 0
        self.hot_folder_path = None
        if sys.platform.startswith('win32'):
            self.hot_folder_path = self.get_hot_folder_path_from_running_tasks(
            )
            if self.hot_folder_path is None:
                raise Exception("cannot find running HotFolder.exe")

        self.file_garbage_collection_timestamp = 0
        self.self_server_ping_timestamp = 0
        self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.got_ocred_file_last_time_stamp = time.time()
        self.http_server_is_working = False
        self.convert_storage = TConvertStorage(self.logger, args.db_json,
                                               args.user_bin_file_size)
        self.continuous_winword_failures_count = 0
        if args.clear_json:
            self.convert_storage.clear_database()
        self.create_folders()
        host, port = self.args.server_address.split(":")
        super().__init__((host, int(port)), THttpServerRequestHandler)

        required_tools = (
            ("qpdf", "cannot find qpdf, sudo apt install qpdf"),
            ("pdfcrack",
             "cannot find pdfcrack\nsee https://sourceforge.net/projects/pdfcrack/files/"
             ),
        )
        for tool, msg in required_tools:
            if shutil.which(tool) is None:
                self.logger.error(msg)
                # the server socket was opened by super().__init__ above;
                # release it, otherwise it stays open after the failure
                self.server_close()
                raise Exception(msg)
        self.send_to_telegram("conversion server started on {}".format(
            self.args.server_address))
Esempio n. 4
0
 def process_stalled_files(self):
     """Delete files that stay in the OCR input folder longer than args.ocr_timeout.

     The age of a file is measured by its modification time. A stalled file
     is deleted and its ocr task (looked up by the sha256 taken from the
     file name) is registered as failed. Entries that cannot be stat'ed
     (for example removed after the folder was listed) are skipped.
     """
     current_time = time.time()
     for pdf_file in os.listdir(self.args.ocr_input_folder):
         fpath = os.path.join(self.args.ocr_input_folder, pdf_file)
         try:
             timestamp = Path(fpath).stat().st_mtime
         except OSError as exp:
             # the entry may disappear between listdir() and stat();
             # one such entry must not abort the whole sweep
             self.logger.debug("cannot stat {}: {}".format(fpath, exp))
             continue
         if current_time - timestamp > self.args.ocr_timeout:
             self.logger.error(
                 "delete orphan file {} after stalling {} seconds".format(
                     fpath, self.args.ocr_timeout))
             self.convert_storage.delete_file_silently(fpath)
             sha256 = TConvertStorage.get_sha256_from_filename(pdf_file)
             self.register_ocr_process_finish(self.ocr_tasks.get(sha256),
                                              False)
Esempio n. 5
0
    def process_docx_from_ocr(self):
        """Move docx files produced by OCR into the converted-file storage.

        Every "*.docx" in args.ocr_output_folder is matched to a pending ocr
        task by the sha256 taken from its file name. Files without a task are
        deleted. For known tasks the file is saved to the storage (up to
        three tries, 60 seconds apart), the temporary stripped pdf in
        args.ocr_input_folder is deleted, and a docx that could not be saved
        is registered as failed and removed.

        :return: True if at least one file was saved to the storage
        """
        new_files_in_db = False
        for docx_file in os.listdir(self.args.ocr_output_folder):
            if not docx_file.endswith(".docx"):
                continue
            docx_file = os.path.join(self.args.ocr_output_folder, docx_file)
            input_task = self.ocr_tasks.get(
                TConvertStorage.get_sha256_from_filename(docx_file))
            if input_task is None:
                # the original message had no placeholder, so the file name
                # was silently dropped from the log
                self.logger.debug(
                    "remove a converted file from unknown sources {}".format(
                        docx_file))
                self.convert_storage.delete_file_silently(docx_file)
                continue

            saved = False
            for try_index in [1, 2, 3]:
                self.logger.info(
                    "got file {} from ocr try to move it, trial No {}".
                    format(docx_file, try_index))
                try:
                    self.convert_storage.save_converted_file(
                        docx_file, input_task.sha256, "ocr",
                        input_task.force)
                    self.register_ocr_process_finish(input_task, True)
                    new_files_in_db = True
                    saved = True
                    break
                except Exception as exp:
                    # under windows it should raise an exception if ocr is still writing to this file
                    self.logger.error(
                        "Exception {}, sleep 60 seconds ...".format(
                            str(exp)))
                    time.sleep(60)

            # delete tmp stripped pdf file, the input file is in storage
            self.convert_storage.delete_file_silently(
                os.path.join(self.args.ocr_input_folder,
                             input_task.sha256 + ".pdf"))

            if os.path.exists(docx_file):
                if not saved:
                    # report a failure only when saving really failed, so a
                    # task is never registered as both success and failure
                    self.logger.debug(
                        "cannot access {} in 3 tries, remove it".format(
                            docx_file))
                    self.register_ocr_process_finish(input_task, False)
                self.convert_storage.delete_file_silently(docx_file)

        return new_files_in_db
Esempio n. 6
0

def parse_args():
    """Parse the command line (sys.argv) of the storage checking script.

    :return: argparse namespace with db_json, file_no, fix_file_offset,
        check_converted_storage and check_input_file_storage
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--db-json", dest='db_json', required=True)
    cli.add_argument("--file-no", dest='file_no', type=int)

    # (flag, destination, action); every switch defaults to the opposite of
    # the value its action stores
    switches = (
        ("--fix-file-offset", 'fix_file_offset', "store_true"),
        ("--disable-converted-storage-check", 'check_converted_storage',
         "store_false"),
        ("--disable-input-file-storage-check", 'check_input_file_storage',
         "store_false"),
    )
    for flag, destination, action in switches:
        cli.add_argument(flag,
                         dest=destination,
                         action=action,
                         default=(action == "store_false"))
    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: open the storage named by --db-json and call its
    # check_storage() with the options given on the command line.
    args = parse_args()
    logger = setup_logging(log_file_name="check_snowball.log")
    convert_storage = TConvertStorage(logger, args.db_json)
    check_options = {
        "fix_file_offset": args.fix_file_offset,
        "check_converted_storage": args.check_converted_storage,
        "check_input_file_storage": args.check_input_file_storage,
    }
    convert_storage.check_storage(args.file_no, **check_options)
Esempio n. 7
0
class TConvertProcessor(http.server.HTTPServer):
    pause_service_actions_file_path = ".pause"

    @staticmethod
    def parse_args(arglist):
        """Parse the command line of the conversion server.

        :param arglist: list of command line tokens handed to argparse
        :return: argparse namespace; ocr_timeout, ocr_restart_time and
            winword_timeout are passed through convert_to_seconds, and
            server_address falls back to the environment variable
            DECLARATOR_CONV_URL when it is not given
        """
        cli = argparse.ArgumentParser()
        add = cli.add_argument

        add("--server-address",
            dest='server_address',
            default=None,
            help="by default read it from environment variable DECLARATOR_CONV_URL")
        add("--logfile", dest='logfile', default='db_conv.log')
        add("--db-json", dest='db_json', required=True)
        add("--clear-db", dest='clear_json', action="store_true")
        add("--disable-ocr",
            dest='enable_ocr',
            default=True,
            action="store_false")
        add("--use-abiword",
            dest='use_abiword',
            default=False,
            action="store_true",
            help="use abiword to convert pdf to docx (test purposes)")
        add("--disable-winword",
            dest='enable_winword',
            default=True,
            action="store_false")

        # working folders
        for flag, destination, default_folder in (
            ("--input-folder", 'input_folder', "input_files"),
            ("--input-folder-cracked", 'input_folder_cracked',
             "input_files_cracked"),
            ("--ocr-input-folder", 'ocr_input_folder', "pdf.ocr"),
            ("--ocr-output-folder", 'ocr_output_folder', "pdf.ocr.out"),
            ("--ocr-logs-folder", 'ocr_logs_folder', "ocr.logs"),
        ):
            add(flag, dest=destination, default=default_folder)

        # max time between putting file to ocr queue and getting the result
        add("--ocr-timeout",
            dest='ocr_timeout',
            help="delete file if ocr cannot process it in this timeout, default 3h",
            default="3h")
        add("--winword-timeout",
            dest='winword_timeout',
            help="stop winword (that was called by MicrosoftPdf2Docx) if it processes file longer than timeout",
            default="60s")
        add("--microsoft-pdf-2-docx",
            dest='microsoft_pdf_2_docx',
            default="C:/tmp/smart_parser/smart_parser/tools/MicrosoftPdf2Docx/bin/Debug/MicrosoftPdf2Docx.exe")
        add("--disable-killing-winword",
            dest='use_winword_exlusively',
            default=True,
            action="store_false")
        add("--request-rate-serialize",
            dest='request_rate_serialize',
            default=100,
            type=int,
            help="save db on each Nth get request")

        # if the ocr queue is not empty and ocr produces no results for this
        # long (3h by default), ocr has to be restarted
        add("--ocr-restart-time",
            dest='ocr_restart_time',
            help="restart ocr if it produces no results",
            default="3h")
        add("--central-heart-rate",
            dest='central_heart_rate',
            type=int,
            default='10')
        add("--bin-file-size", dest='user_bin_file_size', type=int)
        add("--disable-telegram",
            dest="enable_telegram",
            default=True,
            action="store_false")

        args = cli.parse_args(arglist)
        # durations are given as strings like "3h" or "60s"
        for duration_option in ("ocr_timeout", "ocr_restart_time",
                                "winword_timeout"):
            setattr(args, duration_option,
                    convert_to_seconds(getattr(args, duration_option)))
        if args.server_address is None:
            args.server_address = os.environ['DECLARATOR_CONV_URL']
        return args

    def __init__(self, args):
        """Initialise counters, storage and folders, then bind the http server.

        :param args: parsed command line options of the conversion server
        :raises Exception: on win32 when no running HotFolder.exe is found,
            or when qpdf or pdfcrack cannot be found on PATH
        """
        self.args = args
        self.logger = setup_logging(args.logfile)
        self.convert_storage = None

        self.server_actions_thread = None
        self.stop_input_thread = False
        self.input_task_queue = queue.Queue()
        self.ocr_tasks = dict()
        # statistics counters
        self.all_put_files_count = 0
        self.input_files_size = 0
        self.processed_files_size = 0
        self.failed_files_size = 0
        self.successful_get_requests = 0
        self.finished_ocr_tasks = 0
        self.hot_folder_path = None
        if sys.platform.startswith('win32'):
            self.hot_folder_path = self.get_hot_folder_path_from_running_tasks(
            )
            if self.hot_folder_path is None:
                raise Exception("cannot find running HotFolder.exe")

        self.file_garbage_collection_timestamp = 0
        self.self_server_ping_timestamp = 0
        self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.got_ocred_file_last_time_stamp = time.time()
        self.http_server_is_working = False
        self.convert_storage = TConvertStorage(self.logger, args.db_json,
                                               args.user_bin_file_size)
        self.continuous_winword_failures_count = 0
        if args.clear_json:
            self.convert_storage.clear_database()
        self.create_folders()
        host, port = self.args.server_address.split(":")
        super().__init__((host, int(port)), THttpServerRequestHandler)

        required_tools = (
            ("qpdf", "cannot find qpdf, sudo apt install qpdf"),
            ("pdfcrack",
             "cannot find pdfcrack\nsee https://sourceforge.net/projects/pdfcrack/files/"
             ),
        )
        for tool, msg in required_tools:
            if shutil.which(tool) is None:
                self.logger.error(msg)
                # the server socket was opened by super().__init__ above;
                # release it, otherwise it stays open after the failure
                self.server_close()
                raise Exception(msg)
        self.send_to_telegram("conversion server started on {}".format(
            self.args.server_address))

    def get_hot_folder_path_from_running_tasks(self):
        """Find the executable path of a running HotFolder.exe via wmic.

        :return: the first line of the wmic process listing that mentions
            HotFolder.exe (stripped of spaces and line ends), or None
        """
        wmic = subprocess.run(['wmic', 'process', 'get', 'ExecutablePath'],
                              capture_output=True)
        listing = wmic.stdout.decode('utf8', errors="ignore")
        for row in listing.split("\n"):
            if 'HotFolder.exe' in row:
                return row.strip(" \r\n")
        return None

    def send_to_telegram(self, message):
        """Log the message and send it with telegram_send; a no-op when args.enable_telegram is false."""
        if not self.args.enable_telegram:
            return
        self.logger.debug("send to telegram: {}".format(message))
        telegram_send.send(messages=[message])

    def start_http_server(self):
        """Start the service-actions thread and serve http requests.

        Blocks in serve_forever(); an exception escaping from it is logged
        and followed by stop_http_server().
        """
        self.logger.debug("myServer.serve_forever(): {}".format(
            self.args.server_address))
        self.http_server_is_working = True
        # A daemon thread is meant to end together with the main thread; for
        # the caveats see
        # https://stackoverflow.com/questions/21843916/python-daemon-thread-does-not-exit-when-parent-thread-exits
        self.server_actions_thread = threading.Thread(
            target=self.service_actions_in_a_thread, daemon=True)
        self.server_actions_thread.start()
        try:
            self.serve_forever()
        except Exception as serve_error:
            self.logger.error("exit due exception {}".format(serve_error))
            self.stop_http_server()

    def stop_http_server(self, run_shutdown=True):
        """Stop the http server and release its resources; a no-op if it is not working.

        :param run_shutdown: if True also call shutdown() and wait up to
            60 seconds for the service-actions thread
        :raises Exception: if that thread is still alive after the wait
        """
        if not self.http_server_is_working:
            return

        self.logger.debug("try to stop http server  ")
        self.http_server_is_working = False
        self.server_close()

        if run_shutdown:
            self.shutdown()
            stop_timeout = 60
            self.logger.debug(
                "try to join server_actions_thread in {} seconds".format(
                    stop_timeout))
            worker = self.server_actions_thread
            worker.join(stop_timeout)
            if worker.is_alive():
                raise Exception(
                    "cannot stop server_actions_thread in {} seconds".format(
                        stop_timeout))

        # the temporary folder with stripped pdfs is not needed anymore
        try:
            cracked_folder = self.args.input_folder_cracked
            if os.path.exists(cracked_folder):
                shutil.rmtree(cracked_folder, ignore_errors=False)
        except Exception as cleanup_error:
            self.logger.error(cleanup_error)

        self.logger.debug("http server was stopped")
        self.convert_storage.close_storage()
        self.logger.debug("storage was closed")
        close_logger(self.logger)
        print("stop_http_server ends")

    def save_new_file(self,
                      sha256,
                      file_bytes,
                      file_extension,
                      force,
                      only_winword_conversion=False,
                      only_ocr=False):
        """Write an uploaded file to the input folder and queue it as a task.

        :param sha256: name of the file without extension
        :param file_bytes: file content
        :param file_extension: extension appended to sha256
        :param force: passed on to the created TInputTask
        :param only_winword_conversion: passed on to the created TInputTask
        :param only_ocr: passed on to the created TInputTask
        :return: False if a file with this name is already in the input
            folder (nothing is written), True otherwise
        """
        target_path = os.path.join(self.args.input_folder,
                                   sha256 + file_extension)
        # an existing file means it is already registered as an input task
        if os.path.exists(target_path):
            return False

        with open(target_path, 'wb') as sink:
            sink.write(file_bytes)
        self.logger.debug("save new file {} ".format(target_path))

        new_task = TInputTask(target_path,
                              sha256,
                              len(file_bytes),
                              force,
                              only_winword_conversion=only_winword_conversion,
                              only_ocr=only_ocr)
        self.input_files_size += new_task.file_size
        self.input_task_queue.put(new_task)
        return True

    def register_file_process_finish(self, input_task: TInputTask,
                                     process_result):
        """Move the task's file size from the pending counter to the processed or failed counter.

        :param input_task: finished task; only its file_size is read
        :param process_result: truthy for success, falsy for failure
        """
        size = input_task.file_size
        self.input_files_size -= size
        if not process_result:
            self.failed_files_size += size
            return
        self.processed_files_size += size

    def register_ocr_process_finish(self, input_task: TInputTask,
                                    process_result):
        """Register the end of an ocr task and forget it; a no-op for a None task.

        :param input_task: the finished task or None
        :param process_result: passed on to register_file_process_finish
        """
        if input_task is None:
            return
        self.register_file_process_finish(input_task, process_result)
        task_key = input_task.sha256
        if task_key in self.ocr_tasks:
            del self.ocr_tasks[task_key]
            self.finished_ocr_tasks += 1

    def kill_winword(self):
        """Kill pdfreflow.exe and, when args.use_winword_exlusively is set, winword.exe before it."""
        process_names = ['pdfreflow.exe']
        if self.args.use_winword_exlusively:
            process_names.insert(0, 'winword.exe')
        for process_name in process_names:
            taskkill_windows(process_name)

    def convert_with_microsoft_word(self, filename):
        """Convert a pdf to docx with the external args.microsoft_pdf_2_docx tool.

        The tool is run with the file name as its only argument and with
        args.winword_timeout as the time limit; kill_winword() is called
        before and after the run. The result is expected at
        "<filename>.docx".

        :param filename: path of the pdf to convert
        :return: path of the docx on success; None if winword is disabled
            or the conversion failed
        """
        if not self.args.enable_winword:
            return None
        self.logger.info("convert {} with microsoft word".format(filename))
        self.kill_winword()
        docx_file = filename + ".docx"
        try:
            status = subprocess.run([self.args.microsoft_pdf_2_docx, filename],
                                    timeout=self.args.winword_timeout,
                                    capture_output=True)
            success = (status.returncode == 0 and os.path.exists(docx_file))
            if not success:
                # stderr is not guaranteed to be valid utf8; a strict decode
                # would raise and replace the real tool output in the log
                # with an unrelated UnicodeDecodeError
                winword_errors = status.stderr.decode(
                    "utf8", errors="replace").replace("\n", " ").strip()
                self.logger.debug(winword_errors)
        except Exception as exp:
            success = False
            self.logger.error(
                "Exception {} in winword while processing {}".format(
                    exp, filename))
        self.kill_winword()

        if success:
            self.continuous_winword_failures_count = 0
            return docx_file

        # count only failures that left no usable docx behind
        if not os.path.exists(docx_file) or os.path.getsize(docx_file) == 0:
            self.continuous_winword_failures_count += 1
            if self.continuous_winword_failures_count > 20:
                self.send_to_telegram(
                    "pdf conversion server:continuous_winword_failures_count = {}"
                    .format(self.continuous_winword_failures_count))
        return None

    def process_one_input_file(self, input_task: TInputTask):
        """Convert one queued input file, or hand it over to OCR.

        The file is first passed to strip_drm, which writes into
        args.input_folder_cracked. Unless the task is ocr-only, winword
        conversion is tried; on success the result and the input file are
        saved to the storage. Otherwise the file is either dropped (ocr
        disabled or winword-only task), converted with abiword
        (args.use_abiword) or moved to the OCR input folder and remembered
        in self.ocr_tasks. Returns early, right after strip_drm, when the
        http server is not working anymore.
        """
        source_path = input_task.file_path
        cracked_path = os.path.join(self.args.input_folder_cracked,
                                    os.path.basename(source_path))
        self.logger.debug("process input file {}, pwd={}".format(
            source_path, os.getcwd()))
        strip_drm(self.logger, source_path, cracked_path)

        if not self.http_server_is_working:
            return

        if input_task.only_ocr:
            word_docx = None
        else:
            word_docx = self.convert_with_microsoft_word(cracked_path)

        # 1. winword succeeded
        if word_docx is not None:
            self.convert_storage.delete_file_silently(cracked_path)
            self.convert_storage.save_converted_file(word_docx,
                                                     input_task.sha256, "word",
                                                     input_task.force)
            self.convert_storage.save_input_file(source_path)
            self.register_file_process_finish(input_task, True)
            return

        # 2. no further converter may be used: give up
        if not self.args.enable_ocr or input_task.only_winword_conversion:
            self.logger.info("cannot process {}, delete it".format(source_path))
            self.convert_storage.delete_file_silently(source_path)
            self.convert_storage.delete_file_silently(cracked_path)
            self.register_file_process_finish(input_task, False)
            return

        # 3. abiword instead of ocr
        if self.args.use_abiword:
            abiword_docx = cracked_path + ".docx"
            self.logger.debug("abiword {} to {}".format(
                cracked_path, abiword_docx))
            convert_pdf_to_docx_with_abiword(cracked_path, abiword_docx)
            self.convert_storage.save_converted_file(abiword_docx,
                                                     input_task.sha256,
                                                     "abiword",
                                                     input_task.force)
            self.convert_storage.save_input_file(source_path)
            return

        # 4. pass the stripped file to ocr and wait for its result
        self.logger.info("move {} to {}".format(cracked_path,
                                                self.args.ocr_input_folder))
        move_file_with_retry(self.logger, cracked_path,
                             self.args.ocr_input_folder)
        self.convert_storage.save_input_file(source_path)
        self.ocr_tasks[input_task.sha256] = input_task

    def create_cracked_folder(self):
        """Replace old "input_files_cracked*" entries of the current directory with a fresh temp folder.

        The path of the new folder is stored in args.input_folder_cracked.
        """
        cracked_prefix = 'input_files_cracked'
        leftovers = [
            entry for entry in os.listdir('.')
            if entry.startswith(cracked_prefix)
        ]
        for leftover in leftovers:
            self.logger.debug("rm {}".format(leftover))
            shutil.rmtree(leftover, ignore_errors=True)

        fresh_folder = tempfile.mkdtemp(prefix=cracked_prefix, dir=".")
        self.args.input_folder_cracked = fresh_folder
        self.logger.debug("input_folder_cracked = {}".format(fresh_folder))
        assert os.path.isdir(fresh_folder)

    def create_folders(self):
        """Create the working folders of the server.

        The input folder is emptied first, the ocr logs/output/input folders
        are created if missing, and a fresh cracked folder is made. With
        winword enabled the converter executable must exist.
        """
        options = self.args
        self.logger.debug("use {} as  microsoft word converter".format(
            options.microsoft_pdf_2_docx))

        def make_if_missing(folder):
            if not os.path.exists(folder):
                os.mkdir(folder)

        # files left in the input folder cannot be processed without the
        # queue, so the folder is wiped
        if os.path.exists(options.input_folder):
            shutil.rmtree(options.input_folder, ignore_errors=True)
        make_if_missing(options.input_folder)
        make_if_missing(options.ocr_logs_folder)
        self.logger.debug("input folder for new files: {} ".format(
            options.input_folder))

        make_if_missing(options.ocr_output_folder)
        make_if_missing(options.ocr_input_folder)
        if options.enable_winword:
            assert os.path.exists(options.microsoft_pdf_2_docx)
        self.create_cracked_folder()

    def process_ocr_logs(self):
        """Process finished OCR log files found in the OCR output folder.

        Every "*.txt" file in args.ocr_output_folder is read as utf-16-le.
        Lines containing 'Error:' are searched for a "<64 hex chars>.pdf"
        name; the matching pdfs (looked up in args.ocr_input_folder) are
        treated as broken. A log is complete once a line contains
        'Pages processed'; incomplete or unreadable logs are skipped.
        A complete log is moved to args.ocr_logs_folder with a timestamp
        suffix, then every broken pdf that still exists is deleted (and,
        for normal input file names, registered as a failed ocr task with
        a broken stub saved in the storage).
        """
        # The dot is escaped and the pattern is a raw string: only a literal
        # ".pdf" suffix may follow the sha256, not an arbitrary character.
        broken_pdf_pattern = re.compile(r'[a-f0-9]{64}\.pdf')
        for log_file in os.listdir(self.args.ocr_output_folder):
            if not log_file.endswith(".txt"):
                continue
            broken_files = list()
            log_is_completed = False
            log_file_full_path = os.path.join(self.args.ocr_output_folder,
                                              log_file)
            try:
                with open(log_file_full_path,
                          "r",
                          encoding="utf-16-le",
                          errors="ignore") as inp:
                    for line in inp:
                        if 'Error:' in line:
                            m = broken_pdf_pattern.search(line)
                            if m is not None:
                                file_path = os.path.join(
                                    self.args.ocr_input_folder, m.group(0))
                                broken_files.append(file_path)
                        if 'Pages processed' in line:
                            log_is_completed = True
            except Exception as exp:
                self.logger.error("fail to read \"{}\", exception: {}".format(
                    log_file, exp))
                continue

            if not log_is_completed:
                self.logger.debug(
                    "skip incomplete log_file \"{}\"".format(log_file))
                continue
            self.logger.debug(
                "process log_file \"{}\" with {} broken files".format(
                    log_file, len(broken_files)))
            try:
                # archive the log under a unique (timestamped) name
                shutil.move(
                    log_file_full_path,
                    os.path.join(self.args.ocr_logs_folder,
                                 log_file + "." + str(time.time())))
            except Exception as exp:
                self.logger.error("exception: {}".format(exp))

            for filename in broken_files:
                if not os.path.exists(filename):
                    continue
                if not TConvertStorage.is_normal_input_file_name(filename):
                    self.convert_storage.delete_file_silently(filename)
                else:
                    sha256 = TConvertStorage.get_sha256_from_filename(
                        filename)
                    self.register_ocr_process_finish(
                        self.ocr_tasks.get(sha256), False)
                    self.convert_storage.save_converted_file_broken_stub(
                        sha256, True)
                    self.logger.debug(
                        "remove {}, since ocr cannot process it (\"{}\")".
                        format(filename, log_file))
                    self.convert_storage.delete_file_silently(filename)

    def try_convert_with_winword(self):
        """Process queued input tasks, at most 80 successful ones per call.

        Stops early when the http server is not working anymore. A task whose
        processing raises is dropped: it is registered as failed and its
        input file is deleted.
        """
        files_count = 0
        while not self.input_task_queue.empty():
            task = self.input_task_queue.get()
            if not self.http_server_is_working:
                return

            try:
                self.process_one_input_file(task)
                files_count += 1
                if files_count >= 80:
                    break  # just give a chance to accomplish other tasks, then return to these tasks
            except Exception as exp:
                self.logger.error("Exception: {}".format(exp))
                # The task will not be retried, so it must leave the
                # pending-size statistics; otherwise input_files_size
                # (increased in save_new_file) is never decreased for it.
                self.register_file_process_finish(task, False)
                if os.path.exists(task.file_path):
                    self.logger.error("delete {}".format(task.file_path))
                    os.unlink(task.file_path)

    def process_docx_from_ocr(self):
        """Move docx files produced by OCR into the converted-file storage.

        Every "*.docx" in args.ocr_output_folder is matched to a pending ocr
        task by the sha256 taken from its file name. Files without a task are
        deleted. For known tasks the file is saved to the storage (up to
        three tries, 60 seconds apart), the temporary stripped pdf in
        args.ocr_input_folder is deleted, and a docx that could not be saved
        is registered as failed and removed.

        :return: True if at least one file was saved to the storage
        """
        new_files_in_db = False
        for docx_file in os.listdir(self.args.ocr_output_folder):
            if not docx_file.endswith(".docx"):
                continue
            docx_file = os.path.join(self.args.ocr_output_folder, docx_file)
            input_task = self.ocr_tasks.get(
                TConvertStorage.get_sha256_from_filename(docx_file))
            if input_task is None:
                # the original message had no placeholder, so the file name
                # was silently dropped from the log
                self.logger.debug(
                    "remove a converted file from unknown sources {}".format(
                        docx_file))
                self.convert_storage.delete_file_silently(docx_file)
                continue

            saved = False
            for try_index in [1, 2, 3]:
                self.logger.info(
                    "got file {} from ocr try to move it, trial No {}".
                    format(docx_file, try_index))
                try:
                    self.convert_storage.save_converted_file(
                        docx_file, input_task.sha256, "ocr",
                        input_task.force)
                    self.register_ocr_process_finish(input_task, True)
                    new_files_in_db = True
                    saved = True
                    break
                except Exception as exp:
                    # under windows it should raise an exception if ocr is still writing to this file
                    self.logger.error(
                        "Exception {}, sleep 60 seconds ...".format(
                            str(exp)))
                    time.sleep(60)

            # delete tmp stripped pdf file, the input file is in storage
            self.convert_storage.delete_file_silently(
                os.path.join(self.args.ocr_input_folder,
                             input_task.sha256 + ".pdf"))

            if os.path.exists(docx_file):
                if not saved:
                    # report a failure only when saving really failed, so a
                    # task is never registered as both success and failure
                    self.logger.debug(
                        "cannot access {} in 3 tries, remove it".format(
                            docx_file))
                    self.register_ocr_process_finish(input_task, False)
                self.convert_storage.delete_file_silently(docx_file)

        return new_files_in_db

    def get_stats(self):
        """Return a dict snapshot of the service counters and queue sizes.

        Any exception raised while the numbers are being gathered is caught and
        reported as ``{"exception": <message>}`` instead of propagating.
        """
        try:
            pending_bytes = 0
            for ocr_task in self.ocr_tasks.values():
                pending_bytes += ocr_task.file_size
            queued_inputs = self.input_task_queue.qsize()
            ocr_task_total = len(self.ocr_tasks)

            stats = dict()
            stats['all_put_files_count'] = self.all_put_files_count
            stats['successful_get_requests_count'] = self.successful_get_requests

            # normally input_task_queue == input_folder_files_count
            stats['input_task_queue'] = queued_inputs
            stats['input_folder_files_count'] = len(
                os.listdir(self.args.input_folder))

            # normally ocr_pending_files_count == ocr_tasks_count
            stats['ocr_pending_files_count'] = len(
                os.listdir(self.args.ocr_input_folder))
            stats['ocr_tasks_count'] = ocr_task_total
            stats['ocr_pending_all_file_size'] = pending_bytes

            stats['is_converting'] = queued_inputs > 0 or ocr_task_total > 0
            stats['processed_files_size'] = self.processed_files_size
            stats['failed_files_size'] = self.failed_files_size
            stats['finished_ocr_tasks'] = self.finished_ocr_tasks
            stats['snow_ball_os_error_count'] = \
                self.convert_storage.snow_ball_os_error_count
            stats["pause_service_actions"] = self.pause_service_actions()
            return stats
        except Exception as exp:
            return {"exception": str(exp)}

    def process_stalled_files(self):
        """Remove files that stayed in the OCR input folder for too long.

        Every entry of ``self.args.ocr_input_folder`` whose modification time is
        more than ``self.args.ocr_timeout`` seconds in the past is deleted via
        ``convert_storage.delete_file_silently``, and
        ``register_ocr_process_finish`` is called with ``False`` for the task
        found in ``self.ocr_tasks`` under the sha256 derived from the file name.
        """
        current_time = time.time()
        for pdf_file in os.listdir(self.args.ocr_input_folder):
            fpath = os.path.join(self.args.ocr_input_folder, pdf_file)
            try:
                timestamp = Path(fpath).stat().st_mtime
            except FileNotFoundError:
                # The entry disappeared between listdir() and stat() (the folder
                # is modified concurrently); nothing is left to clean up, and
                # letting the error escape would abort the whole service loop.
                self.logger.debug(
                    "skip {}, it vanished before it could be checked".format(
                        fpath))
                continue
            if current_time - timestamp <= self.args.ocr_timeout:
                continue
            self.logger.error(
                "delete orphan file {} after stalling {} seconds".format(
                    fpath, self.args.ocr_timeout))
            self.convert_storage.delete_file_silently(fpath)
            sha256 = TConvertStorage.get_sha256_from_filename(pdf_file)
            self.register_ocr_process_finish(self.ocr_tasks.get(sha256),
                                             False)

    def restart_ocr(self):
        """Kill the OCR processes and launch ``self.hot_folder_path`` again.

        ``HotFolder.exe`` and ``FineExec.exe`` are terminated with
        ``taskkill_windows``; then the hot-folder executable is started as a
        detached process with all standard streams redirected to DEVNULL.
        """
        self.logger.debug("restart ocr")

        kill_plan = (
            ("taskkill HotFolder.exe", 'HotFolder.exe'),
            ("taskkill fineexec.exe", 'FineExec.exe'),
        )
        for log_message, image_name in kill_plan:
            self.logger.debug(log_message)
            taskkill_windows(image_name)

        self.logger.debug("start HotFolder.exe")
        # NOTE(review): SW_HIDE is a show-window constant rather than a creation
        # flag; it is kept in the expression to leave the value untouched.
        detached_flags = (subprocess.DETACHED_PROCESS
                          | subprocess.CREATE_NEW_PROCESS_GROUP
                          | subprocess.CREATE_BREAKAWAY_FROM_JOB
                          | subprocess.SW_HIDE)
        null_stream = subprocess.DEVNULL
        subprocess.Popen([self.hot_folder_path],
                         creationflags=detached_flags,
                         stdin=null_stream,
                         stderr=null_stream,
                         stdout=null_stream,
                         cwd="c:/")

    def process_all_tasks(self):
        """Run one round of service work.

        Steps, in order: process queued input files, collect docx files produced
        by ocr, run the file clean-up routines (at most once per 60 seconds),
        ping the own http server (at most once per 3600 seconds; the process
        exits when the ping fails) and restart ocr when no ocr result has arrived
        and the ocr task list has not been empty for more than
        ``self.args.ocr_restart_time`` seconds.
        """
        if not self.ocr_tasks:
            self.ocr_queue_is_empty_last_time_stamp = time.time()

        self.try_convert_with_winword()
        if self.process_docx_from_ocr():
            self.got_ocred_file_last_time_stamp = time.time()

        # file garbage tasks
        now = time.time()
        if now - self.file_garbage_collection_timestamp >= 60:  # just not too often
            self.file_garbage_collection_timestamp = now
            for cleanup in (self.process_ocr_logs, self.process_stalled_files):
                # stop immediately once the http server has been shut down
                if not self.http_server_is_working:
                    return
                cleanup()

        if now - self.self_server_ping_timestamp >= 3600:  # just not too often
            client_args = TDocConversionClient.parse_args(
                ["--server-address", self.args.server_address])
            ping_client = TDocConversionClient(client_args, self.logger)
            server_is_alive = ping_client.assert_declarator_conv_alive(
                raise_exception=False)
            if not server_is_alive:
                self.logger.error("cannot ping itself, exit")
                self.stop_http_server(run_shutdown=False)
                sys.exit(1)
            self.self_server_ping_timestamp = now

        now = time.time()
        ocr_looks_stuck = (
            now - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time
            and now - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time
        )
        if not ocr_looks_stuck:
            return

        self.logger.debug(
            "last ocr file was received long ago and all this time the ocr queue was not empty"
        )
        if not self.http_server_is_working:
            return
        self.restart_ocr()
        # otherwise the restart would happen too often
        self.got_ocred_file_last_time_stamp = time.time()

    def pause_service_actions(self):
        """Return True when the file at ``self.pause_service_actions_file_path`` exists."""
        marker_path = self.pause_service_actions_file_path
        return os.path.exists(marker_path)

    def service_actions_in_a_thread(self):
        """Loop that runs ``process_all_tasks`` while the http server is working.

        A round is started once ``self.args.central_heart_rate`` seconds have
        passed since the previous one; between rounds the loop sleeps in
        one-second steps.  Rounds are skipped while ``pause_service_actions()``
        returns True.  If a round raises, the error is logged, the http server is
        stopped and ``sys.exit(1)`` is called.
        """
        previous_beat = time.time()
        while self.http_server_is_working:
            if time.time() - previous_beat < self.args.central_heart_rate:
                # not due yet: wait a little and re-check the server flag
                time.sleep(1)
                continue

            if not self.pause_service_actions():
                try:
                    self.process_all_tasks()
                except Exception as exp:
                    if self.logger is not None:
                        self.logger.error(exp)
                    self.stop_http_server(run_shutdown=False)
                    sys.exit(1)
            previous_beat = time.time()