def process_ocr_logs(self):
    """Scan OCR log files (*.txt) in ocr_output_folder.

    For each *completed* log (one containing a 'Pages processed' summary line):
      * collect input files the OCR engine reported as broken ('Error:' lines),
      * archive the log into ocr_logs_folder with a timestamp suffix,
      * delete the broken inputs and mark their OCR tasks as failed.
    Incomplete or unreadable logs are left in place and retried on the next pass.
    """
    # Input files are named <sha256>.pdf.
    # FIX: the '.' before "pdf" was unescaped and matched any character.
    broken_name_re = re.compile(r'[a-f0-9]{64}\.pdf')
    for log_file in os.listdir(self.args.ocr_output_folder):
        if not log_file.endswith(".txt"):
            continue
        broken_files = []
        log_is_completed = False
        log_file_full_path = os.path.join(self.args.ocr_output_folder, log_file)
        try:
            # HotFolder logs are written in UTF-16LE
            with open(log_file_full_path, "r", encoding="utf-16-le", errors="ignore") as inp:
                for line in inp:
                    if 'Error:' in line:
                        m = broken_name_re.search(line)
                        if m is not None:
                            file_path = os.path.join(self.args.ocr_input_folder, m.group(0))
                            broken_files.append(file_path)
                    if 'Pages processed' in line:
                        # final summary line => OCR has finished writing this log
                        log_is_completed = True
        except Exception as exp:
            self.logger.error("fail to read \"{}\", exception: {}".format(log_file, exp))
            continue
        if not log_is_completed:
            self.logger.debug("skip incomplete log_file \"{}\"".format(log_file))
            continue
        self.logger.debug("process log_file \"{}\" with {} broken files".format(
            log_file, len(broken_files)))
        try:
            # archive the log; timestamp suffix keeps archived names unique
            shutil.move(
                log_file_full_path,
                os.path.join(self.args.ocr_logs_folder, log_file + "." + str(time.time())))
        except Exception as exp:
            self.logger.error("exception: {}".format(exp))
        for filename in broken_files:
            if os.path.exists(filename):
                if not TConvertStorage.is_normal_input_file_name(filename):
                    self.convert_storage.delete_file_silently(filename)
                else:
                    sha256 = TConvertStorage.get_sha256_from_filename(filename)
                    self.register_ocr_process_finish(self.ocr_tasks.get(sha256), False)
                    # remember the failure so this document is not re-queued for OCR
                    self.convert_storage.save_converted_file_broken_stub(sha256, True)
                    self.logger.debug("remove {}, since ocr cannot process it (\"{}\")".format(
                        filename, log_file))
                    self.convert_storage.delete_file_silently(filename)
def setup_server(self, name, addit_server_args=None, start_process=False):
    """Create a clean per-test data folder plus an empty converter DB, then
    start the conversion server.

    :param name: test name, used to build the data folder name ("data.<name>")
    :param addit_server_args: optional extra command-line args for the server
    :param start_process: if True, run conv_storage_server.py as a child
        process; otherwise start the server in a thread of this process.
    """
    # FIX: the previous default was a mutable list() shared between all calls;
    # use the None-sentinel idiom instead (behavior for callers is unchanged).
    if addit_server_args is None:
        addit_server_args = []
    self.name = name
    self.data_folder = os.path.join(os.path.dirname(__file__), "data.{}".format(name))
    recreate_folder(self.data_folder)
    # the server resolves relative paths against the current directory
    os.chdir(self.data_folder)
    input_files = "input_files"
    recreate_folder(input_files)
    db_converted_files = os.path.join(self.data_folder, "db_converted_files")
    recreate_folder(db_converted_files)
    db_input_files = os.path.join(self.data_folder, "db_input_files")
    recreate_folder(db_input_files)
    log_file = "db_conv.log"
    if os.path.exists(log_file):
        os.unlink(log_file)
    clear_folder_with_retry(self.pdf_ocr_folder)
    clear_folder_with_retry(self.pdf_ocr_out_folder)
    TConvertStorage.create_empty_db(db_input_files, db_converted_files, self.project_file)
    self.server_args = [
        "--server-address", self.server_address,
        '--logfile', log_file,
        '--db-json', self.project_file,
        '--disable-killing-winword',
        '--ocr-input-folder', self.pdf_ocr_folder,
        '--ocr-output-folder', self.pdf_ocr_out_folder,
        '--disable-telegram'
    ] + addit_server_args
    if start_process:
        server_script = os.path.join(os.path.dirname(__file__), "..", "conv_storage_server.py")
        args = ["python", server_script] + self.server_args
        self.server_process = subprocess.Popen(args,
                                               stderr=subprocess.DEVNULL,
                                               stdout=subprocess.DEVNULL)
    else:
        self.start_server_thread()
def __init__(self, args):
    """Build the conversion server: counters, storage, folders, HTTP socket.

    Raises if HotFolder.exe is not running (on Windows) or if the external
    tools qpdf / pdfcrack cannot be found on PATH.
    """
    self.args = args
    self.logger = setup_logging(args.logfile)
    self.convert_storage = None
    self.server_actions_thread = None
    self.stop_input_thread = False
    # queue of TInputTask objects waiting for winword/ocr conversion
    self.input_task_queue = queue.Queue()
    # sha256 -> TInputTask for files currently sitting in the OCR queue
    self.ocr_tasks = dict()
    # statistics counters, reported by get_stats()
    self.all_put_files_count = 0
    self.input_files_size = 0
    self.processed_files_size = 0
    self.failed_files_size = 0
    self.successful_get_requests = 0
    self.finished_ocr_tasks = 0
    self.hot_folder_path = None
    if sys.platform.startswith('win32'):
        # OCR is driven by ABBYY HotFolder; it must already be running
        self.hot_folder_path = self.get_hot_folder_path_from_running_tasks()
        if self.hot_folder_path is None:
            raise Exception("cannot find running HotFolder.exe")
    # timestamps that throttle periodic maintenance in process_all_tasks()
    self.file_garbage_collection_timestamp = 0
    self.self_server_ping_timestamp = 0
    self.ocr_queue_is_empty_last_time_stamp = time.time()
    self.got_ocred_file_last_time_stamp = time.time()
    self.http_server_is_working = False
    self.convert_storage = TConvertStorage(self.logger, args.db_json, args.user_bin_file_size)
    self.continuous_winword_failures_count = 0
    if args.clear_json:
        self.convert_storage.clear_database()
    self.create_folders()
    host, port = self.args.server_address.split(":")
    # bind the HTTP server socket (http.server.HTTPServer)
    super().__init__((host, int(port)), THttpServerRequestHandler)
    if shutil.which("qpdf") is None:
        msg = "cannot find qpdf, sudo apt install qpdf"
        self.logger.error(msg)
        raise Exception(msg)
    if shutil.which("pdfcrack") is None:
        msg = "cannot find pdfcrack\nsee https://sourceforge.net/projects/pdfcrack/files/"
        self.logger.error(msg)
        raise Exception(msg)
    self.send_to_telegram("conversion server started on {}".format(self.args.server_address))
def process_stalled_files(self):
    """Delete OCR input files whose mtime is older than the OCR timeout
    and mark the corresponding OCR tasks as failed."""
    now = time.time()
    ocr_in = self.args.ocr_input_folder
    for entry in os.listdir(ocr_in):
        full_path = os.path.join(ocr_in, entry)
        age = now - Path(full_path).stat().st_mtime
        if age <= self.args.ocr_timeout:
            continue
        self.logger.error(
            "delete orphan file {} after stalling {} seconds".format(
                full_path, self.args.ocr_timeout))
        self.convert_storage.delete_file_silently(full_path)
        digest = TConvertStorage.get_sha256_from_filename(entry)
        self.register_ocr_process_finish(self.ocr_tasks.get(digest), False)
def process_docx_from_ocr(self):
    """Move *.docx files produced by OCR from ocr_output_folder into the
    converted-files storage.

    Returns True if at least one new file was saved to the DB.
    Files whose sha256 is not in self.ocr_tasks come from unknown sources
    and are deleted.  Saving is retried 3 times because OCR may still be
    writing the file (Windows raises on opening a locked file).
    """
    new_files_in_db = False
    for docx_file in os.listdir(self.args.ocr_output_folder):
        if not docx_file.endswith(".docx"):
            continue
        docx_file = os.path.join(self.args.ocr_output_folder, docx_file)
        input_task = self.ocr_tasks.get(TConvertStorage.get_sha256_from_filename(docx_file))
        if input_task is None:
            # FIX: the format string was missing its {} placeholder,
            # so the file name was never written to the log
            self.logger.debug(
                "remove a converted file from unknown sources {}".format(docx_file))
            self.convert_storage.delete_file_silently(docx_file)
        else:
            for try_index in (1, 2, 3):
                self.logger.info(
                    "got file {} from ocr try to move it, trial No {}".format(
                        docx_file, try_index))
                try:
                    self.convert_storage.save_converted_file(
                        docx_file, input_task.sha256, "ocr", input_task.force)
                    self.register_ocr_process_finish(input_task, True)
                    new_files_in_db = True
                    break
                except Exception as exp:
                    # under windows it should raise an exception if ocr is still writing to this file
                    self.logger.error("Exception {}, sleep 60 seconds ...".format(str(exp)))
                    time.sleep(60)
            # delete tmp stripped pdf file, the input file is in storage
            self.convert_storage.delete_file_silently(
                os.path.join(self.args.ocr_input_folder, input_task.sha256 + ".pdf"))
            if os.path.exists(docx_file):
                self.logger.debug("cannot access {} in 3 tries, remove it".format(docx_file))
                self.register_ocr_process_finish(input_task, False)
                self.convert_storage.delete_file_silently(docx_file)
    return new_files_in_db
def parse_args():
    """Command-line options for the snowball storage checker."""
    p = argparse.ArgumentParser()
    p.add_argument("--db-json", dest="db_json", required=True)
    p.add_argument("--file-no", dest="file_no", type=int)
    p.add_argument("--fix-file-offset",
                   dest="fix_file_offset",
                   default=False,
                   action="store_true")
    # the "disable" flags store False into positively-named destinations
    p.add_argument("--disable-converted-storage-check",
                   dest="check_converted_storage",
                   default=True,
                   action="store_false")
    p.add_argument("--disable-input-file-storage-check",
                   dest="check_input_file_storage",
                   default=True,
                   action="store_false")
    return p.parse_args()


if __name__ == '__main__':
    args = parse_args()
    logger = setup_logging(log_file_name="check_snowball.log")
    convert_storage = TConvertStorage(logger, args.db_json)
    convert_storage.check_storage(
        args.file_no,
        fix_file_offset=args.fix_file_offset,
        check_converted_storage=args.check_converted_storage,
        check_input_file_storage=args.check_input_file_storage)
class TConvertProcessor(http.server.HTTPServer):
    """HTTP server that converts PDF documents to DOCX.

    Conversion pipeline: input files are first tried with Microsoft Word
    (MicrosoftPdf2Docx.exe), and on failure queued for OCR (ABBYY HotFolder)
    or abiword; results are stored in a TConvertStorage database.
    A background thread (service_actions_in_a_thread) drives the pipeline.
    """

    # if this file exists in the working folder, periodic service actions pause
    pause_service_actions_file_path = ".pause"

    @staticmethod
    def parse_args(arglist):
        """Parse server command-line arguments; duration options ("3h", "60s")
        are converted to seconds."""
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--server-address",
            dest='server_address',
            default=None,
            help="by default read it from environment variable DECLARATOR_CONV_URL")
        parser.add_argument("--logfile", dest='logfile', default='db_conv.log')
        parser.add_argument("--db-json", dest='db_json', required=True)
        parser.add_argument("--clear-db", dest='clear_json', required=False, action="store_true")
        parser.add_argument("--disable-ocr", dest='enable_ocr', default=True, required=False,
                            action="store_false")
        parser.add_argument(
            "--use-abiword",
            dest='use_abiword',
            default=False,
            required=False,
            action="store_true",
            help="use abiword to convert pdf to docx (test purposes)")
        parser.add_argument("--disable-winword", dest='enable_winword', default=True,
                            required=False, action="store_false")
        parser.add_argument("--input-folder", dest='input_folder', required=False,
                            default="input_files")
        parser.add_argument("--input-folder-cracked", dest='input_folder_cracked',
                            required=False, default="input_files_cracked")
        parser.add_argument("--ocr-input-folder", dest='ocr_input_folder', required=False,
                            default="pdf.ocr")
        parser.add_argument("--ocr-output-folder", dest='ocr_output_folder', required=False,
                            default="pdf.ocr.out")
        parser.add_argument("--ocr-logs-folder", dest='ocr_logs_folder', required=False,
                            default="ocr.logs")
        # max time between putting file to ocr queue and getting the result
        parser.add_argument(
            "--ocr-timeout",
            dest='ocr_timeout',
            required=False,
            help="delete file if ocr cannot process it in this timeout, default 3h",
            default="3h")
        parser.add_argument(
            "--winword-timeout",
            dest='winword_timeout',
            required=False,
            help="stop winword (that was called by MicrosoftPdf2Docx) if it processes file longer than timeout",
            default="60s")
        parser.add_argument(
            "--microsoft-pdf-2-docx",
            dest='microsoft_pdf_2_docx',
            required=False,
            default="C:/tmp/smart_parser/smart_parser/tools/MicrosoftPdf2Docx/bin/Debug/MicrosoftPdf2Docx.exe")
        # NOTE(review): dest keeps the original spelling 'use_winword_exlusively'
        # (sic) — kill_winword() reads the same misspelled name, so it is consistent
        parser.add_argument("--disable-killing-winword", dest='use_winword_exlusively',
                            default=True, required=False, action="store_false")
        parser.add_argument(
            "--request-rate-serialize",
            dest='request_rate_serialize',
            default=100,
            required=False,
            type=int,
            help="save db on each Nth get request")
        # if the ocr queue is not empty and ocr produces no results in 1 hour, we have to restart ocr
        parser.add_argument("--ocr-restart-time", dest='ocr_restart_time', required=False,
                            help="restart ocr if it produces no results", default="3h")
        parser.add_argument("--central-heart-rate", dest='central_heart_rate', type=int,
                            required=False, default='10')
        parser.add_argument("--bin-file-size", dest='user_bin_file_size', type=int,
                            required=False)
        parser.add_argument("--disable-telegram", dest="enable_telegram", default=True,
                            required=False, action="store_false")
        args = parser.parse_args(arglist)
        # human-readable durations are normalized to seconds
        args.ocr_timeout = convert_to_seconds(args.ocr_timeout)
        args.ocr_restart_time = convert_to_seconds(args.ocr_restart_time)
        args.winword_timeout = convert_to_seconds(args.winword_timeout)
        if args.server_address is None:
            args.server_address = os.environ['DECLARATOR_CONV_URL']
        return args

    def __init__(self, args):
        """Build the server: counters, storage, folders, HTTP socket.

        Raises if HotFolder.exe is not running (on Windows) or if qpdf /
        pdfcrack cannot be found on PATH.
        """
        self.args = args
        self.logger = setup_logging(args.logfile)
        self.convert_storage = None
        self.server_actions_thread = None
        self.stop_input_thread = False
        # queue of TInputTask objects waiting for winword/ocr conversion
        self.input_task_queue = queue.Queue()
        # sha256 -> TInputTask for files currently in the OCR queue
        self.ocr_tasks = dict()
        # statistics counters, reported by get_stats()
        self.all_put_files_count = 0
        self.input_files_size = 0
        self.processed_files_size = 0
        self.failed_files_size = 0
        self.successful_get_requests = 0
        self.finished_ocr_tasks = 0
        self.hot_folder_path = None
        if sys.platform.startswith('win32'):
            # OCR is driven by ABBYY HotFolder; it must already be running
            self.hot_folder_path = self.get_hot_folder_path_from_running_tasks()
            if self.hot_folder_path is None:
                raise Exception("cannot find running HotFolder.exe")
        # timestamps that throttle periodic maintenance in process_all_tasks()
        self.file_garbage_collection_timestamp = 0
        self.self_server_ping_timestamp = 0
        self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.got_ocred_file_last_time_stamp = time.time()
        self.http_server_is_working = False
        self.convert_storage = TConvertStorage(self.logger, args.db_json,
                                               args.user_bin_file_size)
        self.continuous_winword_failures_count = 0
        if args.clear_json:
            self.convert_storage.clear_database()
        self.create_folders()
        host, port = self.args.server_address.split(":")
        # bind the HTTP server socket
        super().__init__((host, int(port)), THttpServerRequestHandler)
        if shutil.which("qpdf") is None:
            msg = "cannot find qpdf, sudo apt install qpdf"
            self.logger.error(msg)
            raise Exception(msg)
        if shutil.which("pdfcrack") is None:
            msg = "cannot find pdfcrack\nsee https://sourceforge.net/projects/pdfcrack/files/"
            self.logger.error(msg)
            raise Exception(msg)
        self.send_to_telegram("conversion server started on {}".format(
            self.args.server_address))

    def get_hot_folder_path_from_running_tasks(self):
        """Return the full path of a running HotFolder.exe (Windows only),
        or None if it is not found in the process list."""
        p1 = subprocess.run(['wmic', 'process', 'get', 'ExecutablePath'],
                            capture_output=True)
        for x in p1.stdout.decode('utf8', errors="ignore").split("\n"):
            if x.find('HotFolder.exe') != -1:
                return x.strip(" \r\n")

    def send_to_telegram(self, message):
        """Send a notification message via telegram (no-op if disabled)."""
        if self.args.enable_telegram:
            self.logger.debug("send to telegram: {}".format(message))
            telegram_send.send(messages=[message])

    def start_http_server(self):
        """Start the background service thread and serve HTTP until stopped."""
        self.logger.debug("myServer.serve_forever(): {}".format(
            self.args.server_address))
        self.http_server_is_working = True
        self.server_actions_thread = threading.Thread(
            target=self.service_actions_in_a_thread)
        # Exit the server thread when the main thread terminates
        # eah, I remember when I used to believe in things too.
        # https://stackoverflow.com/questions/21843916/python-daemon-thread-does-not-exit-when-parent-thread-exits
        self.server_actions_thread.daemon = True
        self.server_actions_thread.start()
        try:
            self.serve_forever()
        except Exception as exp:
            self.logger.error("exit due exception {}".format(exp))
            self.stop_http_server()

    def stop_http_server(self, run_shutdown=True):
        """Stop serving, join the service thread, and close the storage.

        :param run_shutdown: set False when called from inside the service
            thread itself (shutdown() would deadlock there).
        """
        if self.http_server_is_working:
            self.logger.debug("try to stop http server ")
            self.http_server_is_working = False
            self.server_close()
            if run_shutdown:
                self.shutdown()
            stop_timeout = 60
            self.logger.debug(
                "try to join server_actions_thread in {} seconds".format(stop_timeout))
            self.server_actions_thread.join(stop_timeout)
            if self.server_actions_thread.is_alive():
                raise Exception(
                    "cannot stop server_actions_thread in {} seconds".format(stop_timeout))
        # best-effort cleanup of the temporary cracked-files folder
        try:
            if os.path.exists(self.args.input_folder_cracked):
                shutil.rmtree(self.args.input_folder_cracked, ignore_errors=False)
        except Exception as exp:
            self.logger.error(exp)
        self.logger.debug("http server was stopped")
        self.convert_storage.close_storage()
        self.logger.debug("storage was closed")
        close_logger(self.logger)
        print("stop_http_server ends")

    def save_new_file(self, sha256, file_bytes, file_extension, force,
                      only_winword_conversion=False, only_ocr=False):
        """Write an uploaded file into input_folder and queue a conversion task.

        Returns False if a file with this sha256/extension is already queued.
        """
        filename = os.path.join(self.args.input_folder, sha256 + file_extension)
        if os.path.exists(filename):
            # already registered as an input task
            return False
        with open(filename, 'wb') as output_file:
            output_file.write(file_bytes)
        self.logger.debug("save new file {} ".format(filename))
        task = TInputTask(filename, sha256, len(file_bytes), force,
                          only_winword_conversion=only_winword_conversion,
                          only_ocr=only_ocr)
        self.input_files_size += task.file_size
        self.input_task_queue.put(task)
        return True

    def register_file_process_finish(self, input_task: TInputTask, process_result):
        """Account a finished task's size in the processed/failed counters."""
        self.input_files_size -= input_task.file_size
        if process_result:
            self.processed_files_size += input_task.file_size
        else:
            self.failed_files_size += input_task.file_size

    def register_ocr_process_finish(self, input_task: TInputTask, process_result):
        """Like register_file_process_finish, but also drops the task from
        the OCR task map (input_task may be None for unknown files)."""
        if input_task is not None:
            self.register_file_process_finish(input_task, process_result)
            if input_task.sha256 in self.ocr_tasks:
                del self.ocr_tasks[input_task.sha256]
                self.finished_ocr_tasks += 1

    def kill_winword(self):
        """Kill stray winword/pdfreflow processes (unless running alongside
        other winword users, see --disable-killing-winword)."""
        if self.args.use_winword_exlusively:
            taskkill_windows('winword.exe')
            taskkill_windows('pdfreflow.exe')

    def convert_with_microsoft_word(self, filename):
        """Convert a PDF to DOCX via MicrosoftPdf2Docx.exe.

        Returns the produced docx path on success, otherwise None.
        Tracks consecutive failures and alerts via telegram after 20.
        """
        if not self.args.enable_winword:
            return
        self.logger.info("convert {} with microsoft word".format(filename))
        self.kill_winword()
        docx_file = filename + ".docx"
        try:
            status = subprocess.run([self.args.microsoft_pdf_2_docx, filename],
                                    timeout=self.args.winword_timeout,
                                    capture_output=True)
            success = (status.returncode == 0 and os.path.exists(docx_file))
            if not success:
                winword_errors = status.stderr.decode("utf8").replace("\n", " ").strip()
                self.logger.debug(winword_errors)
        except Exception as exp:
            success = False
            self.logger.error(
                "Exception {} in winword while processing {}".format(exp, filename))
            pass
        self.kill_winword()
        if success:
            # a success resets the consecutive-failure alarm counter
            self.continuous_winword_failures_count = 0
            return docx_file
        else:
            if not os.path.exists(docx_file) or os.path.getsize(docx_file) == 0:
                self.continuous_winword_failures_count += 1
                if self.continuous_winword_failures_count > 20:
                    self.send_to_telegram(
                        "pdf conversion server:continuous_winword_failures_count = {}"
                        .format(self.continuous_winword_failures_count))
            return None

    def process_one_input_file(self, input_task: TInputTask):
        """Process a single queued file: strip DRM, try winword, then fall
        back to abiword or the OCR queue depending on configuration."""
        input_file = input_task.file_path
        basename = os.path.basename(input_file)
        stripped_file = os.path.join(self.args.input_folder_cracked, basename)
        self.logger.debug("process input file {}, pwd={}".format(
            input_file, os.getcwd()))
        strip_drm(self.logger, input_file, stripped_file)
        if not self.http_server_is_working:
            return
        docxfile = None if input_task.only_ocr else self.convert_with_microsoft_word(
            stripped_file)
        if docxfile is not None:
            self.convert_storage.delete_file_silently(stripped_file)
            self.convert_storage.save_converted_file(docxfile, input_task.sha256,
                                                     "word", input_task.force)
            self.convert_storage.save_input_file(input_file)
            self.register_file_process_finish(input_task, True)
        else:
            if not self.args.enable_ocr or input_task.only_winword_conversion:
                self.logger.info("cannot process {}, delete it".format(input_file))
                self.convert_storage.delete_file_silently(input_file)
                self.convert_storage.delete_file_silently(stripped_file)
                self.register_file_process_finish(input_task, False)
            else:
                if self.args.use_abiword:
                    docx_path = stripped_file + ".docx"
                    self.logger.debug("abiword {} to {}".format(stripped_file, docx_path))
                    convert_pdf_to_docx_with_abiword(stripped_file, docx_path)
                    self.convert_storage.save_converted_file(
                        docx_path, input_task.sha256, "abiword", input_task.force)
                    self.convert_storage.save_input_file(input_file)
                else:
                    # hand over to the OCR hot folder; completion is detected
                    # later by process_docx_from_ocr / process_ocr_logs
                    self.logger.info("move {} to {}".format(
                        stripped_file, self.args.ocr_input_folder))
                    move_file_with_retry(self.logger, stripped_file,
                                         self.args.ocr_input_folder)
                    self.convert_storage.save_input_file(input_file)
                    self.ocr_tasks[input_task.sha256] = input_task

    def create_cracked_folder(self):
        """Delete stale cracked-file folders and create a fresh temp one."""
        cracked_prefix = 'input_files_cracked'
        for x in os.listdir('.'):
            if x.startswith(cracked_prefix):
                self.logger.debug("rm {}".format(x))
                shutil.rmtree(x, ignore_errors=True)
        self.args.input_folder_cracked = tempfile.mkdtemp(prefix=cracked_prefix, dir=".")
        self.logger.debug("input_folder_cracked = {}".format(
            self.args.input_folder_cracked))
        assert os.path.isdir(self.args.input_folder_cracked)

    def create_folders(self):
        """(Re)create all working folders used by the conversion pipeline."""
        self.logger.debug("use {} as microsoft word converter".format(
            self.args.microsoft_pdf_2_docx))
        if os.path.exists(self.args.input_folder):  # no way to process the input files without queue
            shutil.rmtree(self.args.input_folder, ignore_errors=True)
        if not os.path.exists(self.args.input_folder):
            os.mkdir(self.args.input_folder)
        if not os.path.exists(self.args.ocr_logs_folder):
            os.mkdir(self.args.ocr_logs_folder)
        self.logger.debug("input folder for new files: {} ".format(
            self.args.input_folder))
        if not os.path.exists(self.args.ocr_output_folder):
            os.mkdir(self.args.ocr_output_folder)
        if not os.path.exists(self.args.ocr_input_folder):
            os.mkdir(self.args.ocr_input_folder)
        if self.args.enable_winword:
            assert os.path.exists(self.args.microsoft_pdf_2_docx)
        self.create_cracked_folder()

    def process_ocr_logs(self):
        """Parse completed OCR logs: archive them, delete inputs reported
        as broken, and mark the corresponding tasks as failed."""
        for log_file in os.listdir(self.args.ocr_output_folder):
            if not log_file.endswith(".txt"):
                continue
            broken_files = list()
            log_is_completed = False
            log_file_full_path = os.path.join(self.args.ocr_output_folder, log_file)
            try:
                # HotFolder logs are UTF-16LE encoded
                with open(log_file_full_path, "r", encoding="utf-16-le",
                          errors="ignore") as inp:
                    for line in inp:
                        if line.find('Error:') != -1:
                            # NOTE(review): the '.' before "pdf" is unescaped and
                            # matches any character; likely meant r'[a-f0-9]{64}\.pdf'
                            m = re.search('[a-f0-9]{64}.pdf', line)
                            if m is not None:
                                file_path = os.path.join(
                                    self.args.ocr_input_folder, m.group(0))
                                broken_files.append(file_path)
                        if line.find('Pages processed') != -1:
                            # final summary line => OCR finished this log
                            log_is_completed = True
            except Exception as exp:
                self.logger.error("fail to read \"{}\", exception: {}".format(
                    log_file, exp))
                continue
            if not log_is_completed:
                self.logger.debug(
                    "skip incomplete log_file \"{}\"".format(log_file))
                continue
            self.logger.debug(
                "process log_file \"{}\" with {} broken files".format(
                    log_file, len(broken_files)))
            try:
                # archive the log with a unique timestamp suffix
                shutil.move(
                    log_file_full_path,
                    os.path.join(self.args.ocr_logs_folder,
                                 log_file + "." + str(time.time())))
            except Exception as exp:
                self.logger.error("exception: {}".format(exp))
            for filename in broken_files:
                if os.path.exists(filename):
                    if not TConvertStorage.is_normal_input_file_name(filename):
                        self.convert_storage.delete_file_silently(filename)
                    else:
                        sha256 = TConvertStorage.get_sha256_from_filename(filename)
                        self.register_ocr_process_finish(
                            self.ocr_tasks.get(sha256), False)
                        # remember the failure so the file is not OCRed again
                        self.convert_storage.save_converted_file_broken_stub(
                            sha256, True)
                        self.logger.debug(
                            "remove {}, since ocr cannot process it (\"{}\")".
                            format(filename, log_file))
                        self.convert_storage.delete_file_silently(filename)

    def try_convert_with_winword(self):
        """Drain (up to 80 items of) the input queue through
        process_one_input_file; failed tasks' files are deleted."""
        files_count = 0
        while not self.input_task_queue.empty():
            task = self.input_task_queue.get()
            if not self.http_server_is_working:
                return
            try:
                self.process_one_input_file(task)
                files_count += 1
                if files_count >= 80:
                    break  # just give a chance to accomplish other tasks, then return to these tasks
            except Exception as exp:
                self.logger.error("Exception: {}".format(exp))
                if os.path.exists(task.file_path):
                    self.logger.error("delete {}".format(task.file_path))
                    os.unlink(task.file_path)

    def process_docx_from_ocr(self):
        """Store *.docx produced by OCR into the converted-files DB.

        Returns True if at least one new file reached the DB.
        NOTE(review): the 'unknown sources' debug message below lacks a {}
        placeholder, so the file name is never interpolated.
        """
        new_files_in_db = False
        for docx_file in os.listdir(self.args.ocr_output_folder):
            if not docx_file.endswith(".docx"):
                continue
            docx_file = os.path.join(self.args.ocr_output_folder, docx_file)
            input_task = self.ocr_tasks.get(
                TConvertStorage.get_sha256_from_filename(docx_file))
            if input_task is None:
                self.logger.debug(
                    "remove a converted file from unknown sources ".format(
                        docx_file))
                self.convert_storage.delete_file_silently(docx_file)
            else:
                for try_index in [1, 2, 3]:
                    self.logger.info(
                        "got file {} from ocr try to move it, trial No {}".
                        format(docx_file, try_index))
                    try:
                        self.convert_storage.save_converted_file(
                            docx_file, input_task.sha256, "ocr", input_task.force)
                        self.register_ocr_process_finish(input_task, True)
                        new_files_in_db = True
                        break
                    except Exception as exp:
                        # under windows it should raise an exception if ocr is still writing to this file
                        self.logger.error(
                            "Exception {}, sleep 60 seconds ...".format(str(exp)))
                        time.sleep(60)
                # delete tmp stripped pdf file, the input file is in storage
                self.convert_storage.delete_file_silently(
                    os.path.join(self.args.ocr_input_folder,
                                 input_task.sha256 + ".pdf"))
                if os.path.exists(docx_file):
                    self.logger.debug(
                        "cannot access {} in 3 tries, remove it".format(docx_file))
                    self.register_ocr_process_finish(input_task, False)
                    self.convert_storage.delete_file_silently(docx_file)
        return new_files_in_db

    def get_stats(self):
        """Return a dict of server statistics (never raises: exceptions are
        reported as {'exception': ...})."""
        try:
            ocr_pending_all_file_size = sum(x.file_size
                                            for x in self.ocr_tasks.values())
            input_task_queue = self.input_task_queue.qsize()
            ocr_tasks_count = len(self.ocr_tasks)
            return {
                'all_put_files_count': self.all_put_files_count,
                'successful_get_requests_count': self.successful_get_requests,
                # normally input_task_queue == input_folder_files_count
                'input_task_queue': input_task_queue,
                'input_folder_files_count': len(os.listdir(self.args.input_folder)),
                # normally ocr_pending_files_count == ocr_tasks_count
                'ocr_pending_files_count': len(os.listdir(self.args.ocr_input_folder)),
                'ocr_tasks_count': ocr_tasks_count,
                'ocr_pending_all_file_size': ocr_pending_all_file_size,
                'is_converting': input_task_queue > 0 or ocr_tasks_count > 0,
                'processed_files_size': self.processed_files_size,
                'failed_files_size': self.failed_files_size,
                'finished_ocr_tasks': self.finished_ocr_tasks,
                'snow_ball_os_error_count': self.convert_storage.snow_ball_os_error_count,
                "pause_service_actions": self.pause_service_actions(),
            }
        except Exception as exp:
            return {"exception": str(exp)}

    def process_stalled_files(self):
        """Delete OCR input files older than ocr_timeout and fail their tasks."""
        current_time = time.time()
        for pdf_file in os.listdir(self.args.ocr_input_folder):
            fpath = os.path.join(self.args.ocr_input_folder, pdf_file)
            timestamp = Path(fpath).stat().st_mtime
            if current_time - timestamp > self.args.ocr_timeout:
                self.logger.error(
                    "delete orphan file {} after stalling {} seconds".format(
                        fpath, self.args.ocr_timeout))
                self.convert_storage.delete_file_silently(fpath)
                sha256 = TConvertStorage.get_sha256_from_filename(pdf_file)
                self.register_ocr_process_finish(self.ocr_tasks.get(sha256), False)

    def restart_ocr(self):
        """Kill and relaunch HotFolder.exe (detached, Windows only)."""
        self.logger.debug("restart ocr")
        self.logger.debug("taskkill HotFolder.exe")
        taskkill_windows('HotFolder.exe')
        self.logger.debug("taskkill fineexec.exe")
        taskkill_windows('FineExec.exe')
        self.logger.debug("start HotFolder.exe")
        # detach fully so HotFolder survives this process and shows no window
        creationflags = subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP | \
            subprocess.CREATE_BREAKAWAY_FROM_JOB | subprocess.SW_HIDE
        subprocess.Popen([self.hot_folder_path],
                         creationflags=creationflags,
                         stdin=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL,
                         stdout=subprocess.DEVNULL,
                         cwd="c:/")

    def process_all_tasks(self):
        """One heartbeat of the pipeline: convert queued files, collect OCR
        results, run throttled garbage collection / self-ping / OCR restart."""
        if len(self.ocr_tasks) == 0:
            self.ocr_queue_is_empty_last_time_stamp = time.time()
        self.try_convert_with_winword()
        new_files_from_ocr = self.process_docx_from_ocr()
        if new_files_from_ocr:
            self.got_ocred_file_last_time_stamp = time.time()
        # file garbage tasks
        current_time = time.time()
        if current_time - self.file_garbage_collection_timestamp >= 60:  # just not too often
            self.file_garbage_collection_timestamp = current_time
            if not self.http_server_is_working:
                return
            self.process_ocr_logs()
            if not self.http_server_is_working:
                return
            self.process_stalled_files()
        if current_time - self.self_server_ping_timestamp >= 3600:  # just not too often
            # ping our own HTTP endpoint; exit hard if we cannot reach it
            args = TDocConversionClient.parse_args(
                ["--server-address", self.args.server_address])
            client = TDocConversionClient(args, self.logger)
            if not client.assert_declarator_conv_alive(raise_exception=False):
                self.logger.error("cannot ping itself, exit")
                self.stop_http_server(run_shutdown=False)
                sys.exit(1)
            self.self_server_ping_timestamp = current_time
        current_time = time.time()
        if current_time - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time and \
                current_time - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time:
            self.logger.debug(
                "last ocr file was received long ago and all this time the ocr queue was not empty"
            )
            if not self.http_server_is_working:
                return
            self.restart_ocr()
            self.got_ocred_file_last_time_stamp = time.time()  # otherwize restart will be too often

    def pause_service_actions(self):
        """True while the '.pause' marker file exists in the working folder."""
        return os.path.exists(self.pause_service_actions_file_path)

    def service_actions_in_a_thread(self):
        """Background loop: every central_heart_rate seconds run
        process_all_tasks (unless paused); exit the process on failure."""
        last_heart_beat = time.time()
        while self.http_server_is_working:
            if time.time() - last_heart_beat >= self.args.central_heart_rate:
                if not self.pause_service_actions():
                    try:
                        self.process_all_tasks()
                    except Exception as exp:
                        if self.logger is not None:
                            self.logger.error(exp)
                        self.stop_http_server(run_shutdown=False)
                        sys.exit(1)
                last_heart_beat = time.time()
            else:
                time.sleep(1)