def __init__(self, args, logger):
    """Initialize the dlrobot central HTTP server: task queues, helper
    clients, network binding and (optionally) the IP allow-list.

    Args:
        args: parsed command-line namespace (server_address,
            enable_smart_parser, crawl_epoch_id, enable_ip_checking, ...).
        logger: shared logger instance.
    """
    # socketserver polling timeout: 10 minutes
    self.timeout = 60 * 10
    self.conversion_client = TDocConversionClient(logger)
    self.args = args
    self.logger = logger
    # project_file -> list of remote-call results
    self.dlrobot_remote_calls = defaultdict(list)
    self.input_files = list()
    # worker ip -> list of currently running remote calls
    self.worker_2_running_tasks = defaultdict(list)
    self.initialize_tasks()
    self.cloud_id_to_worker_ip = dict()
    host, port = self.args.server_address.split(":")
    self.logger.debug("start server on {}:{}".format(host, port))
    # bind and activate the underlying http.server.HTTPServer
    super().__init__((host, int(port)), TDlrobotRequestHandler)
    self.last_service_action_time_stamp = time.time()
    self.smart_parser_cache_client = None
    if self.args.enable_smart_parser:
        self.smart_parser_cache_client = TSmartParserCacheClient(self.logger)
    self.crawl_epoch_id = self.args.crawl_epoch_id
    self.stop_process = False
    if self.args.enable_ip_checking:
        # NOTE: permitted_hosts only exists when enable_ip_checking is set;
        # verify_request must be guarded by the same flag
        self.permitted_hosts = set(
            str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
        self.permitted_hosts.add('127.0.0.1')
        self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
    self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()
def get_text_of_a_document(source_file, keep_txt=False, reuse_txt=False, output_folder=None):
    """Convert *source_file* to plain text and return its contents.

    The converter is chosen by file extension.  An intermediate ``.txt``
    file is written next to the source (or into *output_folder*), reused
    when *reuse_txt* is set and kept on disk only when *keep_txt* is set.
    Returns ``None`` for unsupported extensions or when conversion
    produced no output file.
    """
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    extension = os.path.splitext(source_file)[1].lower()
    if output_folder is None:
        txt_file = source_file + ".txt"
    else:
        txt_file = os.path.join(output_folder, os.path.basename(source_file) + ".txt")

    if reuse_txt and os.path.exists(txt_file):
        pass  # reuse previously extracted text
    elif extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif extension == ".xls":
        if ec.run_xls2csv(source_file, txt_file) != 0:
            # xls2csv failed; retry through xlsx2csv on a renamed copy
            renamed_copy = source_file + ".xlsx"
            shutil.copy(source_file, renamed_copy)
            ec.run_xlsx2csv(renamed_copy, txt_file)
            os.unlink(renamed_copy)
    elif extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif extension == ".pdf":
        temp_file = source_file + ".docx"
        sha256 = build_dislosures_sha256(source_file)
        conv_client = TDocConversionClient(TDocConversionClient.parse_args([]))
        if conv_client.retrieve_document(sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
            # the worse case, let's use calibre
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif extension == ".doc":
        if ec.run_catdoc(source_file, txt_file) != 0:
            # catdoc failed; retry via office2txt on a .docx-named copy
            renamed_copy = source_file + ".docx"
            shutil.copy(source_file, renamed_copy)
            ec.run_office2txt(renamed_copy, txt_file)
            os.unlink(renamed_copy)
    else:
        return None

    if not os.path.exists(txt_file):
        return None
    doc_text = read_input_text(txt_file)
    if not keep_txt:
        os.unlink(txt_file)
    return doc_text
def __init__(self, args):
    """Prepare the human-files export: logging, a (re)created temp folder
    and the three backend service clients."""
    self.logger = setup_logging(log_file_name="export_human_files.log")
    self.args = args
    if self.args.tmp_folder is None:
        # no folder given: create a fresh unique one
        self.args.tmp_folder = tempfile.mkdtemp("export_human")
        self.logger.debug("create folder {}".format(self.args.tmp_folder))
    else:
        # folder given: wipe and recreate it so the export starts clean
        self.logger.debug("rm folder {}".format(self.args.tmp_folder))
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
        os.mkdir(self.args.tmp_folder)
    self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
    self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
    self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
    # pdfs first seen during this run; starts empty, populated outside __init__
    self.new_pdfs = set()
def __init__(self, args):
    """Initialize dlrobot central: logging, round bookkeeping, the web-site
    database, network binding and a telegram start notification."""
    self.register_task_result_error_count = 0
    self.logger = setup_logging(log_file_name=args.log_file_name, append_mode=True)
    self.conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
    self.args = args
    rounds = TDeclarationRounds(args.round_file)
    # remote-call history restricted to the current declaration round
    self.dlrobot_remote_calls = TRemoteDlrobotCallList(
        logger=self.logger,
        file_name=args.remote_calls_file,
        min_start_time_stamp=rounds.start_time_stamp)
    self.worker_2_running_tasks = defaultdict(list)
    # consecutive failures per worker (used elsewhere to ban flaky workers)
    self.worker_2_continuous_failures_count = defaultdict(int)
    offices = TOfficeTableInMemory()
    offices.read_from_local_file(self.args.offices_file)
    self.web_sites_db = TDeclarationWebSiteList(self.logger, offices=offices)
    if not os.path.exists(self.args.result_folder):
        os.makedirs(self.args.result_folder)
    self.web_sites_to_process = self.find_projects_to_process()
    self.cloud_id_to_worker_ip = dict()
    self.config = TRobotConfig.read_by_config_type(self.args.dlrobot_config_type)
    self.last_remote_call = None  # for testing
    host, port = self.args.server_address.split(":")
    self.logger.debug("start server on {}:{}".format(host, port))
    # bind and activate the underlying HTTP server
    super().__init__((host, int(port)), TDlrobotRequestHandler)
    self.last_service_action_time_stamp = time.time()
    self.service_action_count = 0
    self.decl_sender = TDeclarationSender(
        self.logger, self.args.enable_smart_parser, self.args.enable_source_doc_server)
    self.stop_process = False
    if self.args.enable_ip_checking:
        self.permitted_hosts = set(
            str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
        self.permitted_hosts.add('127.0.0.1')
        self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
    self.logger.debug("init complete")
    self.send_to_telegram("start dlrobot central with {} tasks".format(
        len(self.web_sites_to_process)))
def main(args, logger):
    """Send files to the conversion server and optionally wait for and
    receive the converted results.

    Args:
        args: parsed namespace (conversion_server, receive_files,
            conversion_timeout, ...).
        logger: logger instance.
    Returns:
        Process exit code: 0 on success, 1 when receiving failed.
    """
    if args.conversion_server is not None:
        TDocConversionClient.DECLARATOR_CONV_URL = args.conversion_server
    conv_tasks = TDocConversionClient(logger)
    conv_tasks.start_conversion_thread()
    # Bug fix: initialize before the try-block.  If send_files() raised,
    # the receive branch below referenced an unbound local and crashed
    # with NameError instead of reporting the real problem.
    sent_files = []
    try:
        sent_files = send_files(args, logger, conv_tasks)
        if args.receive_files and len(sent_files) > 0:
            conv_tasks.wait_doc_conversion_finished(args.conversion_timeout)
        else:
            logger.debug("stop conversion finished")
            conv_tasks.stop_conversion_thread()
    except Exception as exp:
        logger.error("exception: {}, stop_conversion_thread".format(exp))
        conv_tasks.stop_conversion_thread()
    if args.receive_files:
        if not receive_files(logger, conv_tasks, sent_files):
            return 1
    return 0
def process_all_tasks(self):
    """One scheduler tick: feed winword conversion, collect OCR output,
    run periodic housekeeping and restart OCR when it looks stalled."""
    if len(self.ocr_tasks) == 0:
        self.ocr_queue_is_empty_last_time_stamp = time.time()
    self.try_convert_with_winword()
    new_files_from_ocr = self.process_docx_from_ocr()
    if new_files_from_ocr:
        self.got_ocred_file_last_time_stamp = time.time()
    # file garbage tasks
    current_time = time.time()
    if current_time - self.file_garbage_collection_timestamp >= 60:  # just not too often
        self.file_garbage_collection_timestamp = current_time
        # bail out between steps if the server is shutting down
        if not self.http_server_is_working:
            return
        self.process_ocr_logs()
        if not self.http_server_is_working:
            return
        self.process_stalled_files()
    if current_time - self.self_server_ping_timestamp >= 3600:  # just not too often
        # ping our own HTTP endpoint through a fresh client; exit if dead
        args = TDocConversionClient.parse_args(
            ["--server-address", self.args.server_address])
        client = TDocConversionClient(args, self.logger)
        if not client.assert_declarator_conv_alive(raise_exception=False):
            self.logger.error("cannot ping itself, exit")
            self.stop_http_server(run_shutdown=False)
            sys.exit(1)
        self.self_server_ping_timestamp = current_time
    current_time = time.time()
    # restart OCR only when both conditions held for ocr_restart_time:
    # no file came back AND the queue was never empty in that window
    if current_time - self.got_ocred_file_last_time_stamp > self.args.ocr_restart_time and \
            current_time - self.ocr_queue_is_empty_last_time_stamp > self.args.ocr_restart_time:
        self.logger.debug(
            "last ocr file was received long ago and all this time the ocr queue was not empty")
        if not self.http_server_is_working:
            return
        self.restart_ocr()
        self.got_ocred_file_last_time_stamp = time.time()  # otherwise restart will be too often
def process_with_client(self, input_files, timeout=None, rebuild=False, skip_receiving=False, log_name="client", input_task_timeout=5):
    """Run the conversion client over *input_files* and return the expected
    ``.docx`` output names; stale outputs are removed before the run."""
    expected_outputs = [os.path.basename(path) + ".docx" for path in input_files]
    for name in expected_outputs:
        if os.path.exists(name):
            os.unlink(name)
    # a later --conversion-timeout occurrence overrides the default "180"
    argv = ["--server-address", self.server_address,
            "--conversion-timeout", "180",
            "--output-folder", "."]
    argv += input_files
    if timeout is not None:
        argv += ['--conversion-timeout', str(timeout)]
    if rebuild:
        argv.append('--rebuild')
    if skip_receiving:
        argv.append('--skip-receiving')
    if log_name == "client" and self.client_count >= 0:
        # give each client run a unique logger name
        log_name = log_name + str(self.client_count)
    logger = setup_logging(logger_name=log_name)
    try:
        self.client_count += 1
        self.client = TDocConversionClient(
            TDocConversionClient.parse_args(argv), logger=logger)
        self.client.input_task_timeout = input_task_timeout
        self.client.start_conversion_thread()
        self.client.process_files()
        return expected_outputs
    finally:
        close_logger(logger)
def get_text_of_a_document(source_file, keep_txt=False, reuse_txt=False):
    """Convert *source_file* to plain text and return the text.

    The converter is chosen by file extension; returns None for
    unsupported extensions or when no text file was produced.

    Args:
        source_file: path of the document to convert.
        keep_txt: keep the intermediate ``.txt`` file on disk.
        reuse_txt: reuse an already existing ``.txt`` instead of
            converting again.
    """
    global EXTERNAl_CONVERTORS
    ec = EXTERNAl_CONVERTORS
    _, file_extension = os.path.splitext(source_file)
    file_extension = file_extension.lower()
    txt_file = source_file + ".txt"
    if reuse_txt and os.path.exists(txt_file):
        pass
    elif file_extension == ".xlsx":
        ec.run_xlsx2csv(source_file, txt_file)
    elif file_extension == ".xls":
        res = ec.run_xls2csv(source_file, txt_file)
        if res != 0:
            # xls2csv failed: retry through xlsx2csv on a renamed copy
            temp_fname = source_file + ".xlsx"
            shutil.copy(source_file, temp_fname)
            ec.run_xlsx2csv(temp_fname, txt_file)
            os.unlink(temp_fname)
    elif file_extension == ".docx":
        ec.run_office2txt(source_file, txt_file)
    elif file_extension == ".pdf":
        temp_file = source_file + ".docx"
        with open(source_file, "rb") as f:
            sha256 = hashlib.sha256(f.read()).hexdigest()
        # Robustness fix: also verify the retrieved file actually exists;
        # otherwise a "successful" retrieve with a missing file would make
        # run_office2txt fail instead of falling back to calibre.
        if TDocConversionClient().retrieve_document(sha256, temp_file) and os.path.exists(temp_file):
            ec.run_office2txt(temp_file, txt_file)
        else:
            # the worse case, let's use calibre
            ec.run_calibre(source_file, txt_file)
        if os.path.exists(temp_file):
            os.unlink(temp_file)
    elif file_extension in {".html", ".rtf", ".htm"}:
        ec.run_calibre(source_file, txt_file)
    elif file_extension == ".doc":
        res = ec.run_catdoc(source_file, txt_file)
        if res != 0:
            # catdoc failed: retry via office2txt on a .docx-named copy
            temp_fname = source_file + ".docx"
            shutil.copy(source_file, temp_fname)
            ec.run_office2txt(temp_fname, txt_file)
            os.unlink(temp_fname)
    else:
        return None
    if os.path.exists(txt_file):
        doc_text = read_input_text(txt_file)
        if not keep_txt:
            os.unlink(txt_file)
        return doc_text
    else:
        return None
import json
import logging  # bug fix: "logging" was used below without being imported (NameError at startup)
import argparse
import os
import time

from ConvStorage.conversion_client import TDocConversionClient


def parse_args():
    """Parse command line: optional --history-file to append stats to."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    # the logging module itself is passed as the logger-like object
    conv_client = TDocConversionClient(TDocConversionClient.parse_args([]), logging)
    stats = conv_client.get_stats()
    if args.history_file is None:
        # no history file: just print the current stats as json
        print(json.dumps(stats))
    else:
        # append a timestamped record, keeping only the most recent 400
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
import sys

from ConvStorage.conversion_client import TDocConversionClient
from common.logging_wrapper import setup_logging


def main():
    """Run the pdf conversion client over the command-line file list and
    return its exit code."""
    logger = setup_logging(log_file_name="convert_pdf.log")
    conversion_client = TDocConversionClient(
        TDocConversionClient.parse_args(sys.argv[1:]), logger)
    conversion_client.start_conversion_thread()
    return conversion_client.process_files()


if __name__ == '__main__':
    sys.exit(main())
import json
import logging  # bug fix: "logging" was used below without being imported (NameError at startup)
import argparse
import os
import time

from ConvStorage.conversion_client import TDocConversionClient


def parse_args():
    """Parse command line: optional --history-file to append stats to."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--history-file", dest='history_file', default=None)
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    # the logging module itself is passed as the logger-like object
    conv_client = TDocConversionClient(logging)
    stats = conv_client.get_stats()
    if args.history_file is None:
        # no history file: just print the current stats as json
        print(json.dumps(stats))
    else:
        # append a timestamped record, keeping only the most recent 400
        lines = list()
        if os.path.exists(args.history_file):
            with open(args.history_file, "r", encoding="utf-8") as inp:
                for l in inp:
                    lines.append(l)
        lines.append("{}\t{}\n".format(int(time.time()), json.dumps(stats)))
        lines = lines[-400:]
        with open(args.history_file, "w", encoding="utf-8") as out:
            for l in lines:
                out.write(l)
class TTestConvBase(TestCase):
    """Shared scaffolding for conversion-server tests: creates the data
    folders, starts the server (in-process thread or subprocess), drives
    the conversion client and tears everything down."""

    def __init__(self, methodName='runTest'):
        super().__init__(methodName)
        self.port = 8081
        self.name = None
        self.data_folder = None
        self.server_address = "localhost:{}".format(self.port)
        self.server = None
        self.server_thread = None
        self.server_process = None
        self.client = None
        # only the converters needed by these tests; everything else off
        self.converters = TExternalConverters(enable_smart_parser=False,
                                              enable_calibre=False,
                                              enable_cat_doc=False,
                                              enable_xls2csv=False,
                                              enable_office_2_txt=False)
        self.pdf_ocr_folder = os.path.join(os.path.dirname(__file__), "pdf.ocr")
        self.pdf_ocr_out_folder = os.path.join(os.path.dirname(__file__), "pdf.ocr.out")
        # the OCR hot folders must be prepared manually before running tests
        if not os.path.exists(self.pdf_ocr_folder) or not os.path.exists(self.pdf_ocr_out_folder):
            raise Exception(
                "run python update_finereader_task.py, and upload test.hft to finreader hot folder"
            )
        self.project_file = "converted_file_storage.json"
        self.client = None
        self.server_args = None
        self.client_count = 0

    def start_server_thread(self):
        # run TConvertProcessor inside this process on a separate thread
        self.server = TConvertProcessor(
            TConvertProcessor.parse_args(self.server_args))
        self.server_thread = threading.Thread(target=start_server,
                                              args=(self.server, ))
        self.server_thread.start()

    def setup_server(self, name, addit_server_args=list(), start_process=False):
        # NOTE(review): mutable default argument (list()) is shared across
        # calls — harmless while callers never mutate it, but fragile.
        self.name = name
        self.data_folder = os.path.join(os.path.dirname(__file__),
                                        "data.{}".format(name))
        recreate_folder(self.data_folder)
        # the test runs with cwd inside its own data folder
        os.chdir(self.data_folder)
        input_files = "input_files"
        recreate_folder(input_files)
        db_converted_files = os.path.join(self.data_folder, "db_converted_files")
        recreate_folder(db_converted_files)
        db_input_files = os.path.join(self.data_folder, "db_input_files")
        recreate_folder(db_input_files)
        log_file = "db_conv.log"
        if os.path.exists(log_file):
            os.unlink(log_file)
        clear_folder_with_retry(self.pdf_ocr_folder)
        clear_folder_with_retry(self.pdf_ocr_out_folder)
        TConvertStorage.create_empty_db(db_input_files, db_converted_files,
                                        self.project_file)
        self.server_args = [
            "--server-address", self.server_address,
            '--logfile', log_file,
            '--db-json', self.project_file,
            '--disable-killing-winword',
            '--ocr-input-folder', self.pdf_ocr_folder,
            '--ocr-output-folder', self.pdf_ocr_out_folder,
            '--disable-telegram'
        ] + addit_server_args
        if start_process:
            # run the server as a separate python process
            server_script = os.path.join(os.path.dirname(__file__), "..",
                                         "conv_storage_server.py")
            args = ["python", server_script] + self.server_args
            self.server_process = subprocess.Popen(args,
                                                   stderr=subprocess.DEVNULL,
                                                   stdout=subprocess.DEVNULL)
        else:
            self.start_server_thread()

    def restart_server(self):
        # stop and immediately restart the in-process server thread
        self.server.stop_http_server()
        self.server_thread.join(0)
        self.start_server_thread()

    def process_with_client(self, input_files, timeout=None, rebuild=False,
                            skip_receiving=False, log_name="client",
                            input_task_timeout=5):
        """Run a TDocConversionClient over *input_files*; return the
        expected .docx output names (stale outputs are removed first)."""
        output_files = list(os.path.basename(i) + ".docx" for i in input_files)
        for o in output_files:
            if os.path.exists(o):
                os.unlink(o)
        client_args = [
            "--server-address", self.server_address,
            "--conversion-timeout", "180",
            "--output-folder", ".",
        ] + input_files
        if timeout is not None:
            # argparse takes the last occurrence, overriding "180" above
            client_args.extend(['--conversion-timeout', str(timeout)])
        if rebuild:
            client_args.append('--rebuild')
        if skip_receiving:
            client_args.append('--skip-receiving')
        if self.client_count >= 0 and log_name == "client":
            # give each client run a unique logger name
            log_name = log_name + str(self.client_count)
        logger = setup_logging(logger_name=log_name)
        try:
            self.client_count += 1
            self.client = TDocConversionClient(
                TDocConversionClient.parse_args(client_args), logger=logger)
            self.client.input_task_timeout = input_task_timeout
            self.client.start_conversion_thread()
            self.client.process_files()
            return output_files
        finally:
            close_logger(logger)

    def list2reason(self, exc_list):
        # map unittest outcome records back to this test instance's failure
        if exc_list and exc_list[-1][0] is self:
            return exc_list[-1][1]

    def tear_down(self):
        """Shut down client and server and return to the test directory."""
        result = self.defaultTestResult()
        self._feedErrorsToResult(result, self._outcome.errors)
        error = self.list2reason(result.errors)
        failure = self.list2reason(result.failures)
        # NOTE(review): computed but never used below — presumably meant to
        # guard removal of temp folders; confirm intended behavior
        delete_temp_files = not error and not failure
        if self.client is not None:
            self.client.stop_conversion_thread(1)
            self.client = None
        if self.server is not None:
            self.server.stop_http_server()
            self.server_thread.join(0)
            self.server = None
        else:
            self.server_process.kill()
            self.server_process = None
            # give the killed subprocess time to release files/ports
            time.sleep(5)
        os.chdir(os.path.dirname(__file__))
class TDlrobotHTTPServer(http.server.HTTPServer):
    """Central server that hands dlrobot projects out to remote workers,
    collects and archives their results, and optionally manages
    yandex-cloud worker machines.

    NOTE(review): several methods read module-level ``args``/``logger``
    (and ``PITSTOP_FILE``/``ACCEPTED_DOCUMENT_EXTENSIONS``) instead of
    ``self.args``/``self.logger`` — this works only when the enclosing
    module defines those globals; confirm before reusing the class.
    """

    def initialize_tasks(self):
        # (Re)build the task queue from *.txt project files in input_folder.
        self.dlrobot_remote_calls.clear()
        self.worker_2_running_tasks.clear()
        self.input_files = list(x for x in os.listdir(self.args.input_folder)
                                if x.endswith('.txt'))
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        if args.read_previous_results:  # module-level args (see class note)
            self.read_prev_dlrobot_remote_calls()
        logger.debug("there are {} dlrobot projects to process".format(
            len(self.input_files)))
        self.worker_2_running_tasks.clear()

    def __init__(self, args, logger):
        """Set up queues, helper clients, the IP allow-list and bind the
        HTTP server."""
        # socketserver polling timeout: 10 minutes
        self.timeout = 60 * 10
        self.conversion_client = TDocConversionClient(logger)
        self.args = args
        self.logger = logger
        # project_file -> list of remote-call results
        self.dlrobot_remote_calls = defaultdict(list)
        self.input_files = list()
        # worker ip -> list of currently running remote calls
        self.worker_2_running_tasks = defaultdict(list)
        self.initialize_tasks()
        self.cloud_id_to_worker_ip = dict()
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.smart_parser_cache_client = None
        if self.args.enable_smart_parser:
            self.smart_parser_cache_client = TSmartParserCacheClient(self.logger)
        self.crawl_epoch_id = self.args.crawl_epoch_id
        self.stop_process = False
        if self.args.enable_ip_checking:
            self.permitted_hosts = set(
                str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()

    def verify_request(self, request, client_address):
        # reject connections from IPs outside the allow-list when enabled
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        # NOTE(review): both loops log with the "task stderr" prefix although
        # the first one iterates stdout — likely a copy-paste slip
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def get_dlrobot_remote_calls_filename(self):
        # persistent log of all remote-call results for the current epoch
        return os.path.join(self.args.result_folder, "dlrobot_remote_calls.dat")

    def have_tasks(self):
        return len(self.input_files) > 0 and not self.stop_process

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        """Persist the result and re-queue failed projects until the retry
        budget (args.tries_count) is exhausted."""
        with open(self.get_dlrobot_remote_calls_filename(), "a") as outp:
            outp.write(json.dumps(remote_call.write_to_json()) + "\n")
        self.dlrobot_remote_calls[remote_call.project_file].append(remote_call)
        if remote_call.exit_code != 0:
            max_tries_count = args.tries_count
            tries_count = len(self.dlrobot_remote_calls[remote_call.project_file])
            if remote_call.project_folder is None and tries_count == max_tries_count:
                # if the last result was not obtained, may be,
                # worker is down, so the problem is not in the task but in the worker
                # so give this task one more chance
                max_tries_count += 1
                self.logger.debug("increase max_tries_count for {} to {}".format(
                    remote_call.project_file, max_tries_count))
            if tries_count < max_tries_count:
                self.input_files.append(remote_call.project_file)
                self.logger.debug("register retry for {}".format(
                    remote_call.project_file))

    def input_tasks_exist(self):
        # True if at least one *.txt project file is waiting in input_folder
        with os.scandir(self.args.input_folder) as it:
            for entry in it:
                if entry.name.endswith(".txt"):
                    return True
        return False

    def can_start_new_epoch(self):
        # a new epoch needs: not stopping, fresh input tasks, no running jobs
        if self.stop_process:
            return False
        if not self.input_tasks_exist():
            return False
        if self.get_running_jobs_count() > 0:
            return False
        return True

    def start_new_epoch(self):
        # archive the finished epoch's results file and bump the epoch id
        archive_filename = "{}.{}".format(
            self.get_dlrobot_remote_calls_filename(), self.crawl_epoch_id)
        if os.path.exists(archive_filename):
            self.logger.error("cannot create file {}, already exists".format(
                archive_filename))
            raise Exception("bad crawl epoch id")
        shutil.move(self.get_dlrobot_remote_calls_filename(), archive_filename)
        self.crawl_epoch_id += 1
        self.logger.error("start new epoch {}".format(self.crawl_epoch_id))
        self.initialize_tasks()

    def read_prev_dlrobot_remote_calls(self):
        # reload previous results; drop already-succeeded projects from queue
        if os.path.exists(self.get_dlrobot_remote_calls_filename()):
            self.logger.debug("read {}".format(
                self.get_dlrobot_remote_calls_filename()))
            calls = TRemoteDlrobotCall.read_remote_calls_from_file(
                self.get_dlrobot_remote_calls_filename())
            for remote_call in calls:
                self.dlrobot_remote_calls[remote_call.project_file].append(remote_call)
                if remote_call.exit_code == 0 and remote_call.project_file in self.input_files:
                    self.logger.debug("delete {}, since it is already processed".format(
                        remote_call.project_file))
                    self.input_files.remove(remote_call.project_file)

    def get_running_jobs_count(self):
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        return sum(len(w) for w in self.dlrobot_remote_calls.values())

    def conversion_server_queue_is_short(self):
        # backpressure check against the pdf conversion server
        return self.pdf_conversion_queue_length < self.args.pdf_conversion_queue_limit

    def get_new_job_task(self, worker_host_name, worker_ip):
        # pop the next project and remember it as running on this worker
        project_file = self.input_files.pop(0)
        self.logger.info(
            "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
            .format(project_file, worker_ip, worker_host_name,
                    len(self.input_files), self.get_running_jobs_count()))
        res = TRemoteDlrobotCall(worker_ip, project_file)
        res.worker_host_name = worker_host_name
        self.worker_2_running_tasks[worker_ip].append(res)
        return project_file

    def untar_file(self, project_file, result_archive):
        # unpack the gzipped tar received from a worker into a
        # timestamp-suffixed folder under the result folder
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(
            args.result_folder, base_folder) + ".{}".format(int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        # remove and return the running task record; raises if unknown
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception(
                "{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(
            project_file, worker_ip))

    def send_declaraion_files_to_smart_parser(self, dlrobot_project_folder):
        # push every accepted declaration document to the smart-parser cache
        doc_folder = os.path.join(dlrobot_project_folder, "result")
        if os.path.exists(doc_folder):
            for website in os.listdir(doc_folder):
                website_folder = os.path.join(doc_folder, website)
                for doc in os.listdir(website_folder):
                    _, extension = os.path.splitext(doc)
                    if extension in ACCEPTED_DOCUMENT_EXTENSIONS:
                        self.smart_parser_cache_client.send_file(
                            os.path.join(website_folder, doc))

    def register_task_result(self, worker_host_name, worker_ip, project_file,
                             exit_code, result_archive):
        """Record a finished task: unpack its archive, compute stats,
        forward documents to smart parser and persist the call."""
        if args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            remote_call = self.pop_project_from_running_tasks(
                worker_ip, project_file)
        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        remote_call.project_folder = self.untar_file(project_file, result_archive)
        remote_call.calc_project_stats()
        if self.args.enable_smart_parser:
            self.send_declaraion_files_to_smart_parser(remote_call.project_folder)
        self.save_dlrobot_remote_call(remote_call)
        self.logger.debug(
            "got exitcode {} for task result {} from worker {}".format(
                exit_code, project_file, worker_ip))

    def forget_old_remote_processes(self, current_time):
        # drop tasks that exceeded the project timeout, record exit code 126
        for running_procs in self.worker_2_running_tasks.values():
            for i in range(len(running_procs) - 1, -1, -1):
                rc = running_procs[i]
                if current_time - rc.start_time > args.dlrobot_project_timeout:
                    self.logger.debug(
                        "task {} on worker {} takes {} seconds, probably it failed, stop waiting for a result"
                        .format(rc.project_file, rc.worker_ip,
                                current_time - rc.start_time))
                    running_procs.pop(i)
                    rc.exit_code = 126
                    self.save_dlrobot_remote_call(rc)

    def forget_remote_processes_for_yandex_worker(self, cloud_id, current_time):
        # a cloud machine was stopped: abandon its tasks with exit code 125
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info(
                "I do not remember ip for cloud_id {}, cannot delete processes"
                .format(cloud_id))
            return
        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug(
                "forget task {} on worker {} since the workstation was stopped"
                .format(rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        """Restart stopped yandex-cloud workers and track running ones;
        all errors are logged and swallowed (best effort)."""
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error(
                    "cannot connect to google dns, probably internet is down")
                return None
            current_time = time.time()
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    # forget its tasks, then bring the machine back up
                    self.forget_remote_processes_for_yandex_worker(
                        cloud_id, current_time)
                    self.logger.info(
                        "start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def service_actions(self):
        # called by the socketserver loop between requests; throttled by
        # args.central_heart_rate
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= args.central_heart_rate:
            self.last_service_action_time_stamp = current_time
            self.forget_old_remote_processes(current_time)
            self.check_yandex_cloud()
            if os.path.exists(PITSTOP_FILE):
                # pit-stop protocol: finish running jobs, then exit
                self.stop_process = True
                self.logger.debug("stop sending tasks, exit for a pit stop")
                os.unlink(PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                raise Exception("exit for pit stop")
            self.pdf_conversion_queue_length = self.conversion_client.get_pending_all_file_size()
            if not self.conversion_server_queue_is_short():
                self.logger.debug(
                    "stop sending tasks, because conversion pdf queue length is {}"
                    .format(self.pdf_conversion_queue_length))

    def get_stats(self):
        # snapshot of queue/worker state for monitoring endpoints
        workers = dict((k, list(r.write_to_json() for r in v))
                       for (k, v) in self.worker_2_running_tasks.items())
        return {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.input_files),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers
        }
class TDlrobotHTTPServer(http.server.HTTPServer):
    """Central dispatcher: hands dlrobot crawl projects to workers over HTTP,
    collects their result archives, and performs periodic housekeeping
    (timeouts, yandex-cloud restarts, pit-stop shutdown).

    Fixes vs. previous revision:
      * log_process_result labeled stdout lines as "task stderr";
      * get_new_project_to_process logged a literal "{}" (missing .format);
      * register_task_result used a bare `except:` that also swallowed
        KeyboardInterrupt/SystemExit.
    """

    # a worker with more consecutive failures than this is banned
    max_continuous_failures_count = 7
    # presence of this file triggers a graceful ("pit stop") shutdown
    PITSTOP_FILE = ".dlrobot_pit_stop"

    @staticmethod
    def parse_args(arg_list):
        """Parse command-line arguments; heart rate is normalized to seconds,
        server address falls back to DLROBOT_CENTRAL_SERVER_ADDRESS."""
        parser = argparse.ArgumentParser()
        parser.add_argument("--server-address", dest='server_address', default=None,
                            help="by default read it from environment variable DLROBOT_CENTRAL_SERVER_ADDRESS")
        parser.add_argument("--dlrobot-config-type", dest='dlrobot_config_type', required=False, default="prod",
                            help="can be prod, preliminary or test")
        parser.add_argument("--custom-offices-file", dest='offices_file', required=False)
        parser.add_argument("--log-file-name", dest='log_file_name', required=False,
                            default="dlrobot_central.log")
        parser.add_argument("--remote-calls-file", dest='remote_calls_file', default=None)
        parser.add_argument("--result-folder", dest='result_folder', required=True)
        parser.add_argument("--tries-count", dest='tries_count', required=False, default=2, type=int)
        parser.add_argument("--central-heart-rate", dest='central_heart_rate', required=False, default='60s')
        parser.add_argument("--check-yandex-cloud", dest='check_yandex_cloud', default=False,
                            action='store_true', required=False,
                            help="check yandex cloud health and restart workstations")
        parser.add_argument("--skip-worker-check", dest='skip_worker_check', default=False,
                            action='store_true', required=False,
                            help="skip checking that this task was given to this worker")
        parser.add_argument("--enable-ip-checking", dest='enable_ip_checking', default=False,
                            action='store_true', required=False)
        parser.add_argument("--disable-smart-parser-server", dest="enable_smart_parser",
                            default=True, action="store_false", required=False)
        parser.add_argument("--disable-source-doc-server", dest="enable_source_doc_server",
                            default=True, action="store_false", required=False)
        parser.add_argument("--disable-search-engines", dest="enable_search_engines",
                            default=True, action="store_false", required=False)
        parser.add_argument("--disable-telegram", dest="enable_telegram",
                            default=True, required=False, action="store_false")
        parser.add_argument("--disable-pdf-conversion-server-checking",
                            dest="pdf_conversion_server_checking",
                            default=True, required=False, action="store_false")
        parser.add_argument("--web-site-regexp", dest="web_site_regexp", required=False)
        parser.add_argument("--office-source-id", dest="office_source_id", required=False)
        parser.add_argument("--round-file", dest="round_file",
                            default=TDeclarationRounds.default_dlrobot_round_path)
        args = parser.parse_args(arg_list)
        args.central_heart_rate = convert_timeout_to_seconds(args.central_heart_rate)
        if args.server_address is None:
            args.server_address = os.environ['DLROBOT_CENTRAL_SERVER_ADDRESS']
        if args.check_yandex_cloud:
            assert TYandexCloud.get_yc() is not None
        return args

    def __init__(self, args):
        """Build the input queue from the web-site DB and start listening."""
        self.register_task_result_error_count = 0
        self.logger = setup_logging(log_file_name=args.log_file_name, append_mode=True)
        self.conversion_client = TDocConversionClient(
            TDocConversionClient.parse_args([]), self.logger)
        self.args = args
        rounds = TDeclarationRounds(args.round_file)
        self.dlrobot_remote_calls = TRemoteDlrobotCallList(
            logger=self.logger,
            file_name=args.remote_calls_file,
            min_start_time_stamp=rounds.start_time_stamp)
        self.worker_2_running_tasks = defaultdict(list)
        self.worker_2_continuous_failures_count = defaultdict(int)
        offices = TOfficeTableInMemory()
        offices.read_from_local_file(self.args.offices_file)
        self.web_sites_db = TDeclarationWebSiteList(self.logger, offices=offices)
        if not os.path.exists(self.args.result_folder):
            os.makedirs(self.args.result_folder)
        self.web_sites_to_process = self.find_projects_to_process()
        self.cloud_id_to_worker_ip = dict()
        self.config = TRobotConfig.read_by_config_type(self.args.dlrobot_config_type)
        self.last_remote_call = None  # for testing
        host, port = self.args.server_address.split(":")
        self.logger.debug("start server on {}:{}".format(host, port))
        super().__init__((host, int(port)), TDlrobotRequestHandler)
        self.last_service_action_time_stamp = time.time()
        self.service_action_count = 0
        self.decl_sender = TDeclarationSender(
            self.logger, self.args.enable_smart_parser, self.args.enable_source_doc_server)
        self.stop_process = False
        if self.args.enable_ip_checking:
            # workers live in 192.168.100.0/24; localhost and disclosures.ru added explicitly
            self.permitted_hosts = set(
                str(x) for x in ipaddress.ip_network('192.168.100.0/24').hosts())
            self.permitted_hosts.add('127.0.0.1')
            self.permitted_hosts.add('95.165.96.61')  # disclosures.ru
        self.logger.debug("init complete")
        self.send_to_telegram("start dlrobot central with {} tasks".format(
            len(self.web_sites_to_process)))

    def send_to_telegram(self, message):
        """Forward `message` to telegram unless --disable-telegram was given."""
        if self.args.enable_telegram:
            self.logger.debug("send to telegram: {}".format(message))
            telegram_send.send(messages=[message])

    def stop_server(self):
        """Close the listening socket and stop the serve loop."""
        self.server_close()
        self.shutdown()

    def verify_request(self, request, client_address):
        """HTTPServer hook: reject clients outside the permitted-IP set."""
        if self.args.enable_ip_checking:
            (ip, dummy) = client_address
            if ip not in self.permitted_hosts:
                return False
        return True

    def log_process_result(self, process_result):
        """Log non-empty stdout/stderr of a finished worker process line by line."""
        s = process_result.stdout.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                # fixed: this loop logs stdout, previous label said "stderr"
                self.logger.error("task stdout: {}".format(line))
        s = process_result.stderr.strip("\n\r ")
        if len(s) > 0:
            for line in s.split("\n"):
                self.logger.error("task stderr: {}".format(line))

    def have_tasks(self):
        """True while there is queued work and no pit-stop is pending."""
        return len(self.web_sites_to_process) > 0 and not self.stop_process

    def project_is_to_process(self, project_file):
        """Decide whether `project_file` still deserves a (re)try.

        Never retry after a success; allow one extra try when a previous
        attempt never reported back (the worker, not the task, may be at fault).
        """
        interactions = self.dlrobot_remote_calls.get_interactions(project_file)
        if sum(1 for i in interactions if i.task_was_successful()) > 0:
            return False
        tries_count = self.args.tries_count
        if sum(1 for i in interactions if not i.task_ended()) > 0:
            # if the last result was not obtained, maybe the worker is down,
            # so the problem is not in the task but in the worker —
            # give this task one more chance
            tries_count += 1
            self.logger.debug("increase max_tries_count for {} to {}".format(
                project_file, tries_count))
        return len(interactions) < tries_count

    def save_dlrobot_remote_call(self, remote_call: TRemoteDlrobotCall):
        """Persist a finished call; requeue its web site if a retry is due."""
        self.dlrobot_remote_calls.add_dlrobot_remote_call(remote_call)
        if not remote_call.task_was_successful():
            if self.project_is_to_process(remote_call.project_file):
                self.web_sites_to_process.append(remote_call.web_site)
                self.logger.debug("register retry for {}".format(remote_call.web_site))

    def find_projects_to_process(self):
        """Build the initial work queue: filter reachable sites by the optional
        regexp/source-id filters, keep those still worth processing, and order
        by least-recently-interacted first. Also dumps the queue to a debug file."""
        web_sites_to_process = list()
        self.logger.info("filter web sites")
        web_site_info: TDeclarationWebSite
        for web_site, web_site_info in self.web_sites_db.web_sites.items():
            if self.args.web_site_regexp is not None:
                if re.match(self.args.web_site_regexp, web_site) is None:
                    continue
            if self.args.office_source_id is not None:
                if web_site_info.get_parent_source_id() != self.args.office_source_id:
                    continue
            if TWebSiteReachStatus.can_communicate(web_site_info.reach_status):
                project_file = TRemoteDlrobotCall.web_site_to_project_file(web_site)
                if self.project_is_to_process(project_file):
                    web_sites_to_process.append(web_site)
        self.logger.info("there are {} sites in the input queue".format(
            len(web_sites_to_process)))
        web_sites_to_process.sort(
            key=(lambda x: self.dlrobot_remote_calls.last_interaction[x]))
        with open("web_sites_to_process_debug.txt", "w") as out:
            for w in web_sites_to_process:
                out.write(w + "\n")
        return web_sites_to_process

    def get_running_jobs_count(self):
        """Total tasks currently dispatched to all workers."""
        return sum(len(w) for w in self.worker_2_running_tasks.values())

    def get_processed_jobs_count(self):
        """Total remote calls recorded so far."""
        return len(list(self.dlrobot_remote_calls.get_all_calls()))

    def get_new_project_to_process(self, worker_host_name, worker_ip):
        """Pop the next site off the queue, register it as running on the worker,
        and return (remote_call, utf8-encoded project description)."""
        site_url = self.web_sites_to_process.pop(0)
        project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
        self.logger.info(
            "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
            .format(project_file, worker_ip, worker_host_name,
                    len(self.web_sites_to_process), self.get_running_jobs_count()))
        remote_call = TRemoteDlrobotCall(worker_ip=worker_ip,
                                         project_file=project_file,
                                         web_site=site_url)
        remote_call.worker_host_name = worker_host_name
        web_site_passport = self.web_sites_db.get_web_site(site_url)
        regional_main_pages = list()
        if web_site_passport is None:
            # fixed: the placeholder was never filled (missing .format(site_url))
            self.logger.error(
                "{} is not registered in the web site db, no office information is available for the site"
                .format(site_url))
        project_content_str = TRobotProject.create_project_str(
            site_url,
            regional_main_pages,
            disable_search_engine=not self.args.enable_search_engines)
        self.worker_2_running_tasks[worker_ip].append(remote_call)
        return remote_call, project_content_str.encode("utf8")

    def untar_file(self, project_file, result_archive):
        """Unpack a gzipped-tar result blob into a timestamped folder under
        result_folder and return that folder's path."""
        base_folder, _ = os.path.splitext(project_file)
        output_folder = os.path.join(self.args.result_folder,
                                     base_folder) + ".{}".format(int(time.time()))
        compressed_file = io.BytesIO(result_archive)
        decompressed_file = gzip.GzipFile(fileobj=compressed_file)
        tar = tarfile.open(fileobj=decompressed_file)
        # NOTE(review): extractall on an untrusted archive can write outside
        # output_folder (path traversal) — workers are assumed trusted here.
        tar.extractall(output_folder)
        return output_folder

    def pop_project_from_running_tasks(self, worker_ip, project_file):
        """Remove and return the running task for (worker_ip, project_file);
        raises if the worker or the task is unknown."""
        if worker_ip not in self.worker_2_running_tasks:
            raise Exception("{} is missing in the worker table".format(worker_ip))
        worker_running_tasks = self.worker_2_running_tasks[worker_ip]
        for i in range(len(worker_running_tasks)):
            if worker_running_tasks[i].project_file == project_file:
                return worker_running_tasks.pop(i)
        raise Exception("{} is missing in the worker {} task table".format(
            project_file, worker_ip))

    def worker_is_banned(self, worker_ip, host_name):
        return self.worker_2_continuous_failures_count[(worker_ip, host_name)] > \
               TDlrobotHTTPServer.max_continuous_failures_count

    def update_worker_info(self, worker_host_name, worker_ip, exit_code):
        """Track consecutive failures per (ip, host); announce a ban once the
        threshold is crossed. A success resets the counter."""
        key = (worker_ip, worker_host_name)
        if exit_code == 0:
            self.worker_2_continuous_failures_count[key] = 0
        else:
            self.worker_2_continuous_failures_count[key] += 1
            if self.worker_is_banned(worker_ip, worker_host_name):
                self.send_to_telegram(
                    "too many dlrobot errors from ip {}, hostname={}, the host is banned, "
                    "you have to restart dlrobot_central to unban it".format(
                        worker_ip, worker_host_name))

    def register_task_result(self, worker_host_name, worker_ip, project_file,
                             exit_code, result_archive):
        """Accept a finished task: match it to a dispatched call (or tolerate an
        undispatched one from a private IP), unpack the archive, compute stats,
        forward declaration files and persist the call record."""
        if self.args.skip_worker_check:
            remote_call = TRemoteDlrobotCall(worker_ip, project_file)
        else:
            try:
                remote_call = self.pop_project_from_running_tasks(worker_ip, project_file)
            except Exception:
                # fixed: was a bare `except:` which also caught
                # KeyboardInterrupt/SystemExit
                if ipaddress.ip_address(worker_ip).is_private:
                    self.logger.debug(
                        "try to get a result {} from a local ip {}, though this task was not dispatched"
                        .format(project_file, worker_ip))
                    remote_call = TRemoteDlrobotCall(worker_ip, project_file)
                else:
                    raise
        self.update_worker_info(worker_host_name, worker_ip, exit_code)
        remote_call.worker_host_name = worker_host_name
        remote_call.exit_code = exit_code
        remote_call.end_time = int(time.time())
        project_folder = self.untar_file(project_file, result_archive)
        remote_call.calc_project_stats(self.logger, self.web_sites_db,
                                       project_folder, self.config)
        if not TWebSiteReachStatus.can_communicate(remote_call.reach_status):
            remote_call.exit_code = -1
        self.decl_sender.send_declaraion_files_to_other_servers(project_folder)
        self.save_dlrobot_remote_call(remote_call)
        self.last_remote_call = remote_call
        self.logger.debug(
            "got exitcode {} for task result {} from worker {} (host_name = {})"
            .format(exit_code, project_file, worker_ip, worker_host_name))

    def forget_old_remote_processes(self, current_time):
        """Give up on tasks that exceeded the central kill timeout; record them
        with exit code 126."""
        for running_procs in self.worker_2_running_tasks.values():
            # iterate backwards so pop(i) does not shift unvisited indices
            for i in range(len(running_procs) - 1, -1, -1):
                remote_call = running_procs[i]
                elapsed_seconds = current_time - remote_call.start_time
                if elapsed_seconds > self.config.get_kill_timeout_in_central():
                    self.logger.debug(
                        "task {} on worker {}(host={}) takes {} seconds, probably it failed, stop waiting for a result"
                        .format(remote_call.web_site, remote_call.worker_ip,
                                remote_call.worker_host_name, elapsed_seconds))
                    running_procs.pop(i)
                    remote_call.exit_code = 126  # convention: 126 = central timeout
                    self.save_dlrobot_remote_call(remote_call)

    def forget_remote_processes_for_yandex_worker(self, cloud_id):
        """Drop all tasks of a stopped cloud instance (exit code 125) and remove
        it from the id->ip map."""
        worker_ip = self.cloud_id_to_worker_ip.get(cloud_id)
        if worker_ip is None and len(self.cloud_id_to_worker_ip) > 0:
            self.logger.info(
                "I do not remember ip for cloud_id {}, cannot delete processes"
                .format(cloud_id))
            return
        running_procs = self.worker_2_running_tasks.get(worker_ip, list())
        for i in range(len(running_procs) - 1, -1, -1):
            rc = running_procs[i]
            self.logger.debug(
                "forget task {} on worker {} since the workstation was stopped"
                .format(rc.project_file, rc.worker_ip))
            running_procs.pop(i)
            rc.exit_code = 125  # convention: 125 = workstation stopped
            self.save_dlrobot_remote_call(rc)
        if cloud_id in self.cloud_id_to_worker_ip:
            del self.cloud_id_to_worker_ip[cloud_id]

    def check_yandex_cloud(self):
        """Restart STOPPED cloud workers and refresh the id->ip map for RUNNING
        ones. Best-effort: any exception is logged and swallowed."""
        if not self.args.check_yandex_cloud:
            return None
        try:
            if not check_internet():
                self.logger.error(
                    "cannot connect to google dns, probably internet is down")
                return None
            for m in TYandexCloud.list_instances():
                cloud_id = m['id']
                if m['status'] == 'STOPPED':
                    self.forget_remote_processes_for_yandex_worker(cloud_id)
                    self.logger.info("start yandex cloud worker {}".format(cloud_id))
                    TYandexCloud.start_yandex_cloud_worker(cloud_id)
                elif m['status'] == "RUNNING":
                    worker_ip = TYandexCloud.get_worker_ip(m)
                    if self.args.enable_ip_checking:
                        self.permitted_hosts.add(worker_ip)
                    self.cloud_id_to_worker_ip[cloud_id] = worker_ip
        except Exception as exp:
            self.logger.error(exp)

    def check_pdf_conversion_server(self):
        """True when it is OK to hand out new tasks w.r.t. the pdf conversion
        server load (always True if the check is disabled)."""
        if not self.args.pdf_conversion_server_checking:
            return True
        return not self.conversion_client.server_is_too_busy()

    def service_actions(self):
        """HTTPServer hook, called between requests: throttled housekeeping —
        pit-stop handling, stale-task cleanup, cloud check, pdf-server check."""
        current_time = time.time()
        if current_time - self.last_service_action_time_stamp >= self.args.central_heart_rate:
            self.service_action_count += 1
            if self.service_action_count % 10 == 0:
                self.logger.debug('alive')
            self.last_service_action_time_stamp = current_time
            if os.path.exists(self.PITSTOP_FILE):
                self.stop_process = True
                self.logger.debug(
                    "stop sending tasks, exit for a pit stop after all tasks complete")
                os.unlink(self.PITSTOP_FILE)
            if self.stop_process and self.get_running_jobs_count() == 0:
                self.logger.debug("exit via exception")
                # deliberate: raising here terminates serve_forever()
                raise Exception("exit for pit stop")
            try:
                self.forget_old_remote_processes(current_time)
            except Exception as exp:
                self.logger.error(exp)
            self.check_yandex_cloud()
            if not self.check_pdf_conversion_server():
                self.logger.debug(
                    "stop sending tasks, because conversion pdf queue length is {}"
                    .format(self.conversion_client.last_pdf_conversion_queue_length))

    def get_stats(self):
        """Snapshot of queue/worker state as a JSON-serializable dict."""
        workers = dict((k, list(r.write_to_json() for r in v))
                       for (k, v) in self.worker_2_running_tasks.items())
        stats = {
            'running_count': self.get_running_jobs_count(),
            'input_tasks': len(self.web_sites_to_process),
            'processed_tasks': self.get_processed_jobs_count(),
            'worker_2_running_tasks': workers,
            'last_service_action_time_stamp': self.last_service_action_time_stamp,
            'central_heart_rate': self.args.central_heart_rate,
            'register_task_result_error_count': self.register_task_result_error_count
        }
        if self.stop_process:
            stats['stop_process'] = True
        return stats
def init_conversion(): TDownloadEnv.CONVERSION_CLIENT = TDocConversionClient() TDownloadEnv.CONVERSION_CLIENT.start_conversion_thread()
from ConvStorage.conversion_client import TDocConversionClient from common.logging_wrapper import setup_logging import sys import argparse import os def parse_args(arg_list): parser = argparse.ArgumentParser() parser.add_argument('input', nargs='*') parser.add_argument("--conversion-server", dest='conversion_server', required=False) TDocConversionClient.DECLARATOR_CONV_URL = os.environ.get('DECLARATOR_CONV_URL') # reread for tests return parser.parse_args(arg_list) if __name__ == '__main__': logger = setup_logging(log_file_name="get_docx.log", append_mode=True) client = TDocConversionClient(parse_args(sys.argv[1:]), logger) for sha256 in client.args.input: output_file_path = sha256 + '.docx' if client.retrieve_document(sha256, output_file_path, verbose=True): logger.info("create {}".format(output_file_path)) else: logger.info("cannot find {}".format(sha256)) sys.exit(1)
class TExportHumanFiles:
    """Export human-uploaded declaration files from the declarator MySQL DB:
    download each file, push it to the source-doc server, queue PDFs for
    conversion, send the rest (and later the converted PDFs) to smart parser,
    and record every document in the dlrobot-human DBM."""

    @staticmethod
    def parse_args(arg_list):
        """Parse CLI options for the export run."""
        parser = argparse.ArgumentParser()
        parser.add_argument("--table", dest='table', default="declarations_documentfile")
        parser.add_argument("--document-file-id", dest='document_file_id', required=False)
        parser.add_argument("--tmp-folder", dest='tmp_folder', default=None)
        parser.add_argument("--dlrobot-human-json", dest='dlrobot_human_json',
                            default="human_files.json")
        parser.add_argument("--start-from-an-empty-file", dest='start_from_empty',
                            action="store_true", default=False)
        parser.add_argument("--max-files-count", dest='max_files_count', type=int)
        parser.add_argument("--mysql-port", dest='mysql_port', type=int, default=None)
        parser.add_argument("--pdf-conversion-timeout", dest='pdf_conversion_timeout',
                            default=1*60*60,
                            type=int,
                            help="pdf conversion timeout")
        parser.add_argument("--pdf-conversion-queue-limit", dest='pdf_conversion_queue_limit',
                            type=int,
                            default=100 * 2 ** 20,
                            help="max sum size of al pdf files that are in pdf conversion queue",
                            required=False)
        return parser.parse_args(arg_list)

    def __init__(self, args):
        """Prepare the tmp folder (create a fresh one, or wipe and recreate the
        given one) and construct the three service clients."""
        self.logger = setup_logging(log_file_name="export_human_files.log")
        self.args = args
        if self.args.tmp_folder is None:
            self.args.tmp_folder = tempfile.mkdtemp("export_human")
            self.logger.debug("create folder {}".format(self.args.tmp_folder))
        else:
            self.logger.debug("rm folder {}".format(self.args.tmp_folder))
            shutil.rmtree(self.args.tmp_folder, ignore_errors=True)
            os.mkdir(self.args.tmp_folder)
        self.source_doc_client = TSourceDocClient(TSourceDocClient.parse_args([]), self.logger)
        self.pdf_conversion_client = TDocConversionClient(TDocConversionClient.parse_args([]), self.logger)
        self.smart_parser_server_client = TSmartParserCacheClient(TSmartParserCacheClient.parse_args([]), self.logger)
        # sha256 hashes of PDFs sent for conversion during this run
        self.new_pdfs = set()

    def __enter__(self):
        # context manager: conversion thread runs for the lifetime of the export
        self.pdf_conversion_client.start_conversion_thread()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.pdf_conversion_client.stop_conversion_thread()
        shutil.rmtree(self.args.tmp_folder, ignore_errors=True)

    def unarchive(self, input_file):
        """Yield the file names extracted from `input_file` (extracted next to it)."""
        base_name, file_extension = os.path.splitext(os.path.basename(input_file))
        output_folder = os.path.dirname(input_file)
        dearchiver = TDearchiver(self.logger, output_folder)
        for _, _, filename in dearchiver.dearchive_one_archive(file_extension, input_file, base_name):
            yield filename

    def download_file_and_unzip(self, file_url, filename):
        """Download `file_url` to `filename` unless it already exists; yield the
        resulting file(s) — for .zip, the unpacked members (on a cache hit,
        previously unpacked members are found via glob)."""
        file_without_extension, extension = os.path.splitext(filename)
        if not os.path.isfile(filename):
            self.logger.debug("download {0} to {1}".format(file_url, filename))
            result = requests.get(file_url)
            with open(filename, 'wb') as fd:
                fd.write(result.content)
            if extension == '.zip':
                try:
                    for archive_filename in self.unarchive(filename):
                        yield archive_filename
                except Exception as e:
                    # best-effort: a broken archive is logged and skipped
                    self.logger.error("cannot unzip {}, exception={}".format(filename, e))
            else:
                yield filename
        else:
            if extension == '.zip':
                # archive already unpacked earlier: members are named "<base>_*"
                for archive_filename in glob.glob("{}_*".format(file_without_extension)):
                    yield archive_filename
            else:
                yield filename

    def get_all_file_sql_records(self):
        """Yield (document_file_id, document_id, filename, link, office_id,
        income_year) rows from the declarator DB, skipping empty filenames.

        NOTE(review): table name and document_file_id are interpolated into the
        SQL string — acceptable only because both come from the operator's own
        CLI, not from untrusted input. Connections are not closed if iteration
        is abandoned mid-way (no try/finally).
        """
        if self.args.mysql_port is None:
            # credentials are scrubbed in this copy of the source
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 unix_socket="/var/run/mysqld/mysqld.sock")
        else:
            db = pymysql.connect(db="declarator", user="******", password="******",
                                 port=self.args.mysql_port)
        cursor = db.cursor()
        if self.args.document_file_id is not None:
            where_clause = "where f.id = {}\n".format(self.args.document_file_id)
        else:
            where_clause = ""
        query = ("""
                 select f.id, d.id, f.file, f.link, d.office_id, d.income_year
                 from {} f
                 join declarations_document d on f.document_id=d.id
                 {}
                 """.format(self.args.table, where_clause))
        self.logger.debug(query.replace("\n", " "))
        cursor.execute(query)
        for (document_file_id, document_id, filename, link, office_id, income_year) in cursor:
            if filename is not None and len(filename) > 0:
                yield document_file_id, document_id, filename, link, office_id, income_year
        cursor.close()
        db.close()

    def download_unzip_and_send_file_source_doc_server(self, declarator_url_path, document_file_id):
        """Download one declarator file (unzipping if needed), send each result
        to the source-doc server, route PDFs to conversion and the rest to
        smart parser; yield (local_path, declarator_url). Cleans tmp folder
        when the generator is exhausted."""
        path, declarator_filename = os.path.split(declarator_url_path)
        _, ext = os.path.splitext(declarator_filename)
        ext = ext.lower()
        temp_file = os.path.join(self.args.tmp_folder, "{}{}".format(document_file_id, ext))
        declarator_url = os.path.join(DECLARATOR_DOMAIN, "media",
                                      urllib.parse.quote(declarator_url_path))
        # os.path.join may produce backslashes on Windows; URLs need '/'
        declarator_url = declarator_url.replace('\\', '/')
        for file_name in self.download_file_and_unzip(declarator_url, temp_file):
            self.source_doc_client.send_file(file_name)
            if file_name.lower().endswith('.pdf'):
                _, extension = os.path.splitext(file_name)
                self.pdf_conversion_client.start_conversion_task_if_needed(file_name, extension)
                self.new_pdfs.add(build_dislosures_sha256(file_name))
            else:
                self.smart_parser_server_client.send_file(file_name)
            yield file_name, declarator_url
        self.pdf_conversion_client.wait_all_tasks_to_be_sent()
        for f in os.listdir(self.args.tmp_folder):
            os.unlink(os.path.join(self.args.tmp_folder, f))

    def fix_list(self, sha256, office_id):
        """Return the manually-corrected office id for `sha256` if one exists
        in FIX_LIST, otherwise the original `office_id`."""
        fixed_office_id = FIX_LIST.get(sha256)
        if fixed_office_id is not None:
            return fixed_office_id
        else:
            return office_id

    def export_files(self):
        """Main export loop: walk all DB rows, skip already-known document file
        ids, throttle while the pdf conversion server is busy, download/send
        each file and record it in the human-files DBM; finally flush converted
        PDFs to smart parser."""
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        # ids already present in the DBM — these rows are skipped below
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)
        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue
            # back-pressure: block while the conversion server queue is over limit
            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5*60)
            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]
            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()

    def send_new_pdfs_to_smart_parser(self):
        """Wait for the conversion server, then fetch each converted PDF (by
        sha256) into a temp .docx and forward it to smart parser; log a summary
        if any conversions are missing."""
        self.logger.debug("wait pdf conversion for {} seconds".format(self.args.pdf_conversion_timeout))
        self.pdf_conversion_client.wait_doc_conversion_finished(self.args.pdf_conversion_timeout)
        missed_pdf_count = 0
        received_pdf_count = 0
        for sha256 in self.new_pdfs:
            self.logger.debug("try to converted file for {}".format(sha256))
            handle, temp_filename = tempfile.mkstemp(suffix=".docx")
            os.close(handle)  # only the path is needed; retrieve_document writes it
            if self.pdf_conversion_client.retrieve_document(sha256, temp_filename):
                received_pdf_count += 1
                self.logger.debug("send the converted file to smart parser")
                self.smart_parser_server_client.send_file(temp_filename)
            else:
                self.logger.error("converted file is not received")
                missed_pdf_count += 1
            os.unlink(temp_filename)
        if missed_pdf_count > 0:
            self.logger.error('received_pdf_count = {}, missed_pdf_count={}'.format(received_pdf_count, missed_pdf_count))