def get_pdfs(page_dfs, lang):
    start_time = time.time()
    try:
        p_dfs = []
        pages = len(page_dfs)
        block_configs = config.BLOCK_CONFIGS
        for page_index in range(pages):
            page_df = page_dfs[page_index]
            cols = page_df.columns.values.tolist()
            df = pd.DataFrame(columns=cols)
            for index, row in page_df.iterrows():
                if row['children'] is None:
                    d_tmp = page_df.iloc[index]
                    d_tmp['avg_line_height'] = int(d_tmp['text_height'])
                    df = df.append(d_tmp)
                else:
                    dfs = process_block(page_df.iloc[index], block_configs, lang)
                    df = df.append(dfs)
            p_dfs.append(df)
    except Exception as e:
        log_error('Error in creating p_dfs', app_context.application_context, e)
        return None
    end_time = time.time()
    elapsed_time = end_time - start_time
    log_info('Processing of get_pdfs completed in {}/{}, average per page {}'.format(
        elapsed_time, len(p_dfs), (elapsed_time / len(p_dfs))),
        app_context.application_context)
    return p_dfs
def post(self):
    body = request.get_json()
    if 'annotationId' not in body.keys() or 'score' not in body.keys() or \
            'saved' not in body.keys():
        log_info('Missing params in AnnotationTaskSaveAnnotationResource {}'.format(body),
                 LOG_WITHOUT_CONTEXT)
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getresjson(), 400
    try:
        result = parallelSentenceAnnotationRepo.save_annotation(body)
        if result is None:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            return res.getres()
        else:
            res = CustomResponse(Status.SUCCESS.value, result)
            return res.getres()
    except Exception as e:
        log_exception("Exception at AnnotationTaskSaveAnnotationResource ",
                      LOG_WITHOUT_CONTEXT, e)
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getresjson(), 400
def extract_pdf_metadata(filename, working_dir, base_dir, jobid):
    start_time = time.time()
    pdf_filepath = Path(os.path.join(base_dir, filename))
    try:
        pdf_image_paths = extract_image_paths_from_pdf(pdf_filepath, working_dir)
        pdf_xml_dir = extract_xml_from_digital_pdf(pdf_filepath, working_dir)
    except Exception as e:
        log_error("Service xml_utils", "Error in extracting xml", jobid, e)
    try:
        os.system('pdftohtml -c ' + str(pdf_filepath) + ' ' + str(working_dir) + '/')
    except Exception as e:
        log_error("Service get_xml", "Error in extracting html", jobid, e)
    # try:
    #     pdf_bg_image_dir = extract_html_bg_images_from_digital_pdf(pdf_filepath, working_dir)
    # except Exception as e:
    #     log_error("Service xml_utils", "Error in extracting html of bg images", jobid, e)
    end_time = time.time()
    extraction_time = end_time - start_time
    xml_files = read_directory_files(pdf_xml_dir, pattern='*.xml')
    bg_files = None  # read_directory_files(pdf_bg_image_dir, pattern='*.png')
    log_info('Service get_xml', 'Successfully extracted xml, background images of file:', jobid)
    return xml_files, bg_files, pdf_image_paths
def replace_tags_with_original(text, date_original, url_dict, num_array, num_map):
    '''
    Replaces dates, urls and numbers in the text with the original values
    in place of the tags.
    '''
    res = text  # fallback so the except block can return the tagged input
    try:
        resultant_str = list()
        if len(text) == 0:
            return ""
        for word in text.split():
            if 'UuRrLl' in word:
                word = url_dict[word]
            resultant_str.append(word)
        s = [str(i) for i in resultant_str]
        res = str(" ".join(s))
        log_info("response after url and date replacement: {}".format(res), MODULE_CONTEXT)
        if len(num_map) == 0:
            '''
            Handles the case when the model outputs a tag which is not in
            tagged_src (src is without any number).
            '''
            for char in reversed(hindi_numbers):
                res = re.sub(r'NnUuMm' + char, "", res)
        num_map.reverse()
        for item in num_map:
            res = res.replace(item['tag'], str(item['no.']), 1)
        res = remove_extra_tags(res)
        log_info("response after tags replacement: {}".format(res), MODULE_CONTEXT)
        return res
    except Exception as e:
        log_exception("Error in parent except block of replace_tags_with_original function, "
                      "returning tagged output: {}".format(e), MODULE_CONTEXT, e)
        return res
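# Illustrative call for replace_tags_with_original (values are hypothetical;
# the shapes mirror what the function reads: url_dict maps 'UuRrLl' tokens
# back to URLs, and each num_map entry carries a 'tag' and a 'no.' key):
#
#   url_dict = {'UuRrLl0': 'https://example.org'}
#   num_map = [{'tag': 'NnUuMm0', 'no.': 42}]
#   replace_tags_with_original('Visit UuRrLl0 , see section NnUuMm0',
#                              None, url_dict, [], num_map)
#   # -> roughly 'Visit https://example.org , see section 42'
#   #    (modulo whatever remove_extra_tags strips)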
def start_kafka():
    try:
        t1 = threading.Thread(target=process_fc_kf, name='keep_on_running')
        t1.start()
        log_info("multithread : Kafka running on multithread", None)
    except Exception as e:
        log_error("multithread : Error while running custom threads", None, e)
def consume(self):
    topics = [anu_dp_wf_aligner_in_topic]
    consumer = self.instantiate(topics)
    service = AlignmentService()
    util = AlignmentUtils()
    rand_str = ''.join(random.choice(string.ascii_letters) for i in range(4))
    prefix = "Align-WFM-Consumer(" + rand_str + ")"
    log_info(prefix + " running.......", None)
    while True:
        # thread_count = 0
        for msg in consumer:
            data = {}
            try:
                data = msg.value
                if data:
                    log_info(prefix + " | Received on Topic: " + msg.topic +
                             " | Partition: " + str(msg.partition), data)
                    service.wf_process(data)
                break
            except Exception as e:
                log_exception("Exception while consuming: " + str(e), data, e)
                util.error_handler("ALIGNER_CONSUMER_ERROR",
                                   "Exception while consuming", data, True)
                break
def push_data_to_queue(self, topic_name, push_data):
    producer = self.producer_fn()
    producer.send(topic_name, value=push_data)
    producer.flush()
    log_info("push_data_to_queue : successfully pushed data to output queue", None)
def get_nmt_url_body(self, block_translate_input, nmt_txt):
    model = block_translate_input["input"]["model"]
    nmt_in = {
        "src_list": nmt_txt,
        "source_language_code": model["source_language_code"],
        "target_language_code": model["target_language_code"],
        "model_id": model["model_id"]
    }
    try:
        host = model["connection_details"]["translation"]["host"]
        api_host = os.environ.get(host, 'NA')
        endpoint = model["connection_details"]["translation"]["api_endpoint"]
        api_endpoint = os.environ.get(endpoint, 'NA')
        if api_host == "NA" or api_endpoint == "NA":
            log_info("Falling back to Anuvaad NMT translate URL....", block_translate_input)
            return nmt_translate_url, nmt_in
        url = api_host + api_endpoint
        return url, nmt_in
    except Exception as e:
        log_exception("Exception while fetching API conn details: {}".format(str(e)),
                      block_translate_input, None)
        log_info("Falling back to Anuvaad NMT translate URL....", block_translate_input)
        return nmt_translate_url, nmt_in
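# Note on the lookup above: the model's connection_details hold environment
# variable *names*, not URLs. A hypothetical model fragment and environment:
#
#   model["connection_details"]["translation"] = {
#       "host": "NMT_HOST",                      # env var naming the base URL
#       "api_endpoint": "NMT_TRANSLATE_ENDPOINT"
#   }
#   # With NMT_HOST=http://nmt:5001 and NMT_TRANSLATE_ENDPOINT=/v2/translate,
#   # the request goes to http://nmt:5001/v2/translate; if either variable is
#   # unset, the code falls back to the default nmt_translate_url.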
def process_tokenization_kf():
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)
    # instantiation of consumer for the respective topic
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_tokenization_kf : trying to receive value from consumer ", None)
        for msg in consumer:
            data = msg.value
            log_info("process_tokenization_kf : received input json from input topic consumer ", data)
            task_id = str("TOK-" + str(time.time()).replace('.', '')[0:13])
            task_starttime = eval(str(time.time()).replace('.', '')[0:13])
            input_files, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            if "errorID" not in file_value_response.keys():
                producer = Producer()
                producer.push_data_to_queue(config.output_topic, file_value_response, data, task_id)
            else:
                log_error("process_tokenization_kf : error sent to error handler", data, None)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_tokenization_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_tokenization_kf : response sent to topic %s" % (config.output_topic), data, e)
def post(self):
    inputs = request.get_json(force=True)
    response_list = list()
    if len(inputs) > 0:
        log_info("Making labse-aligner(Resource) API call", MODULE_CONTEXT)
        log_info("Complete request input: {}".format(inputs), MODULE_CONTEXT)
        try:
            for i in inputs:
                if all(v in i for v in ["src_phrases", "tgt"]):
                    log_info("Making labse-aligner service call", MODULE_CONTEXT)
                    res = LabseAlignerService.phrase_aligner(i)
                    response_list.append(res)
                    out = CustomResponse(Status.SUCCESS.value, response_list)
                else:
                    log_info("Missing mandatory parameters for labse-aligner: src_phrases or tgt",
                             MODULE_CONTEXT)
                    out = CustomResponse(Status.MANDATORY_PARAM_MISSING.value, [])
            return out.getres()
        except Exception as e:
            status = Status.SYSTEM_ERR.value
            status['why'] = str(e)
            out = CustomResponse(status, [])
            return out.getres()
    else:
        log_info("null inputs in request in labse-aligner API", MODULE_CONTEXT)
        out = CustomResponse(Status.INVALID_API_REQUEST.value, None)
        return out.getres()
def vision_ocr_request_worker():
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)
    log_info("vision_ocr_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)
    while True:
        data = processQueue.get(block=True)
        task_id = str("vision_ocr" + str(time.time()).replace('.', ''))
        task_starttime = str(time.time()).replace('.', '')
        input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(data)
        log_info("vision_ocr_request_worker processing -- received message " + str(jobid), data)
        try:
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)
            if file_value_response is not None:
                if "errorID" not in file_value_response.keys():
                    push_output(producer_tok, config.output_topic, file_value_response, jobid, task_id, data)
                    log_info("vision_ocr_request_worker : response sent to topic %s" % (config.output_topic),
                             LOG_WITHOUT_CONTEXT)
                else:
                    log_info("vision_ocr_request_worker : error sent to error handler", data)
            log_info('vision_ocr_request_worker - requests in internal queue {}'.format(processQueue.qsize()), data)
            processQueue.task_done()
        except Exception as e:
            log_exception("vision_ocr_request_worker ", LOG_WITHOUT_CONTEXT, e)
            controlQueue.put(1)
def consumer_instantiate(self):
    try:
        consumer = KafkaConsumer(self.topic_name,
                                 bootstrap_servers=list((self.server_address).split(",")),
                                 auto_offset_reset='latest',
                                 group_id=config.CONSUMER_GROUP,
                                 enable_auto_commit=True)
        log_info("consumer_instantiate : Consumer returned for topic: %s" % (self.topic_name), None)
        return consumer
    except Exception as e:
        log_exception("consumer_instantiate : error occurred for consumer topic: %s" % (self.topic_name),
                      None, e)
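# Minimal usage sketch (kafka-python API; topic and broker are placeholders):
#
#   consumer = Consumer('tokeniser-input-v1', 'localhost:9092').consumer_instantiate()
#   for msg in consumer:       # blocks, yielding ConsumerRecord objects
#       handle(msg.value)      # .value is the message payload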
def handle_sentences_wo_stop(language, sentence_array):
    '''
    Adds a sentence-ending punctuation mark to sentences in the array
    that lack one. Used in batch translation.
    '''
    try:
        if language is None:
            return sentence_array, []
        else:
            log_info("Inside handle_sentences_wo_stop", MODULE_CONTEXT)
            stop_puncs = misc.get_language_stop_puncs(language)
            full_stop_or_purnviram = stop_puncs[0]
            sent_indices_wo_stop = []
            for i, sentence in enumerate(sentence_array):
                if misc.is_sentence_wo_stop(sentence, stop_puncs):
                    sent_indices_wo_stop.append(i)
                    sentence_array[i] = misc.add_stop_punc(sentence_array[i], full_stop_or_purnviram)
            return sentence_array, sent_indices_wo_stop
    except Exception as e:
        log_exception("Error in handle_sentences_wo_stop: {}".format(e), MODULE_CONTEXT, e)
        return sentence_array, []
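# Illustrative call (hypothetical sentences; 'en' assumed to be a code that
# misc.get_language_stop_puncs understands):
#
#   sents, fixed = handle_sentences_wo_stop('en', ['First sentence', 'Second one.'])
#   # sents[0] now ends with a full stop and fixed == [0], so the caller can
#   # strip the added punctuation from the translations afterwards.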
def get_segmented_regions(app_context, base_dir):
    try:
        files = get_files(app_context.application_context)
        output = []
        for index, file in enumerate(files):
            file = get_json(base_dir, file['file']['name'])
            file_properties = File(file)
            pages = file_properties.get_pages()
            page_counts = len(pages)
            start_time = time.time()
            for page_index in range(page_counts):
                print('processing for page : ', page_index)
                # page_lines = file_properties.get_lines(page_index)
                # page_regions = file_properties.get_regions(page_index)
                # page_words = file_properties.get_words(page_index)
                # font_meta = font_properties(file_properties.get_page(page_index))
                font_meta = []
                # page_regions = region_unifier.region_unifier(page_lines, page_regions)
                # file_properties.set_regions(page_index, segment_regions(page_words, page_lines, page_regions))
                file_properties.set_font_properties(page_index, font_meta)
            output.append(file_properties.get_file())
            output[index]['status'] = {'message': "block-segmenter successful"}
            end_time = time.time()
            extraction_time = (end_time - start_time) / page_counts
            log_info('block segmentation per page completed in {}'.format(extraction_time),
                     app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed block segmentation", None)
    except Exception as e:
        log_exception("Error occurred during block segmentation ",
                      app_context.application_context, e)
        return None
    return app_context.application_context
def get_stored_hypothesis_ch(self, text_list, text_translate_input):
    sent_map, ch_res, text_for_nmt, ch_response = {}, {}, [], []
    for text in text_list:
        sent_map[text["s_id"]] = text
    api_input = {"sentences": list(sent_map.keys())}
    api_res = utils.call_api(sentence_fetch_url, "POST", api_input, None,
                             text_translate_input["metadata"]["userID"])
    if api_res:
        if api_res["data"]:
            ch_response = api_res["data"]
    if ch_response:
        for translation in ch_response:
            if translation["s_id"] in sent_map.keys():
                tgt_list = []
                if str(translation["tgt"]).startswith(
                        str(sent_map[translation["s_id"]]["taggedPrefix"])):
                    tgt_list.append(translation["tgt"])
                if str(translation["s0_tgt"]).startswith(
                        str(sent_map[translation["s_id"]]["taggedPrefix"])):
                    tgt_list.append(translation["s0_tgt"])
                if tgt_list:
                    translation["tgt"] = tgt_list
                    ch_res[translation["s_id"]] = translation
    for s_id in sent_map.keys():
        if s_id not in ch_res.keys():
            text_for_nmt.append(sent_map[s_id])
    log_info("Translation fetched from CH! Count: " + str(len(ch_res.keys())),
             text_translate_input)
    return text_for_nmt, list(ch_res.values())
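# Shape of the data this walks (hypothetical records): a cached hit survives
# only if the stored translation still starts with the sentence's current
# taggedPrefix; everything else is queued for a fresh NMT call.
#
#   sent_map    = {'s1': {'s_id': 's1', 'src': '...', 'taggedPrefix': 'The court'}}
#   ch_response = [{'s_id': 's1', 'tgt': 'The court held ...', 's0_tgt': 'The bench ...'}]
#   # Here 'tgt' matches the prefix but 's0_tgt' does not, so only 'tgt'
#   # ends up in tgt_list for s1.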
def save_page_res(res, file_name):
    try:
        tmp_file = copy.deepcopy(res['rsp'])
        del tmp_file['input']
        tmp_file['files'] = res['rsp']['outputs']
        del tmp_file['outputs']
        json_file_name = file_name['output'][0]['outputFile']
        for file in [tmp_file]:
            recordID = file['jobID'] + '|' + json_file_name
            page_idx = 0
            total_pages = len(file['files'][0]['pages'])
            file['files'][0]['config'] = copy.deepcopy(file['files'][0]['config']['OCR'])
            save_file = copy.deepcopy(file)
            save_file['recordID'] = recordID
            while page_idx < total_pages:
                pages = file['files'][0]['pages'][page_idx:page_idx + SAVE_NO_PAGE]
                save_file['files'][0]['pages'] = pages
                page_idx = page_idx + SAVE_NO_PAGE
                log_info("started saving data to database with record id: " + str(recordID),
                         app_context.application_context)
                rsp = requests.post(SAVE_URL, json=save_file)
                log_info("successfully saved data to database with record id: " + str(recordID),
                         app_context.application_context)
    except Exception as e:
        log_exception("Error occurred during saving page response",
                      app_context.application_context, e)
def return_loaded_models(self, model_paths, ids):
    loaded_models = {}
    for i, path in enumerate(model_paths):
        translator = ctranslate2.Translator(path, device="auto")
        loaded_models[ids[i]] = translator
        log_info("Model Loaded: {}".format(ids[i]), MODULE_CONTEXT)
    return loaded_models
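# Usage sketch (paths and ids are placeholders; ctranslate2.Translator takes a
# converted-model directory and a device string such as "cpu", "cuda" or "auto"):
#
#   models = loader.return_loaded_models(['/models/en-hi', '/models/hi-en'], [100, 101])
#   en_hi = models[100]   # a ctranslate2.Translator ready for translate_batch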
def process_input(app_context, base_dir):
    try:
        files = get_files(app_context.application_context)
        output_files = []
        langs = []
        for index, file in enumerate(files):
            file_output = {"status": {}}
            file = get_json(base_dir, file['file']['name'])[0]
            file_properties = File(file)
            if "page_info" in file.keys():
                page_paths = file_properties.get_pages()
            else:
                page_paths = doc_pre_processing(file['file']['name'], config.BASE_DIR)
            page_res = text_extraction(file_properties, page_paths, file)
            output_files.append(page_res)
            langs.append(file_properties.get_language())
        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)
    except Exception as e:
        log_exception("Error occurred during google vision ocr",
                      app_context.application_context, e)
        return None, None
    return app_context.application_context, langs
def call_api(self, uri, method, api_input, params, user_id):
    try:
        log_info("URI: " + uri, None)
        response = None
        if method == "POST":
            api_headers = {
                'userid': user_id,
                'x-user-id': user_id,
                'Content-Type': 'application/json'
            }
            response = requests.post(url=uri, json=api_input, headers=api_headers)
        elif method == "GET":
            api_headers = {'userid': user_id}
            response = requests.get(url=uri, params=params, headers=api_headers)
        if response is not None:
            if response.text is not None:
                log_info(response.text, None)
                return json.loads(response.text)
            else:
                log_error("API response was None, URI: " + str(uri), api_input, None)
                return None
        else:
            log_error("API call failed! URI: " + str(uri), api_input, None)
            return None
    except Exception as e:
        log_exception("Exception while making the api call: " + str(e), api_input, e)
        return None
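# Example invocation (URI and payload are illustrative):
#
#   res = utils.call_api('http://nmt:5001/translate', 'POST',
#                        {"src_list": ["..."]}, None, 'user-123')
#   if res is None:
#       # unsupported method, HTTP failure, or non-JSON body; caller must handle it
#       ...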
def extract_images(app_context, base_dir):
    files = get_files(app_context.application_context)
    file_images = []
    try:
        for file in files:
            file_properties = File(file)
            file_format = file_properties.get_format()
            if file_format in ['PDF', 'pdf']:
                filename = file_properties.get_name()
                image_paths = extract_pdf_images(filename, base_dir)
                file_images.append(image_paths)
            elif file_format in ['PNG', 'JPEG', 'BMP', 'jpg', 'png', 'bmp', 'jpeg']:
                filename = file_properties.get_name()
                image_paths = [os.path.join(base_dir, filename)]
                file_images.append(image_paths)
            else:
                log_info("currently we do not support {} files.".format(file_format),
                         app_context.application_context)
                return None
    except Exception as e:
        log_error('error extracting images: ' + str(e),
                  app_context.application_context, e)
        return None
    return file_images
def post(self): body = request.get_json() if "keys" not in body or not body["keys"]: res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None) return res.getresjson(), 400 keys = body["keys"] log_info("Fetching sentences from redis store", AppContext.getContext()) try: result = sentenceRepo.get_sentences_from_store(keys) if result == None: res = CustomResponse( Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None) return res.getresjson(), 400 res = CustomResponse(Status.SUCCESS.value, result) return res.getres() except Exception as e: log_exception( "Exception while fetching sentences from redis store ", AppContext.getContext(), e) res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None) return res.getresjson(), 400
def create_translated_txt_file(self, record_id, dataframes, page_layout):
    try:
        out_translated_txt_filename = os.path.splitext(os.path.basename(record_id.split('|')[0]))[0] \
            + str(uuid.uuid4()) + '_translated.txt'
        output_filepath_txt = os.path.join(self.DOWNLOAD_FOLDER, out_translated_txt_filename)
        out_txt_file_write = open(output_filepath_txt, 'w')
        page_width = page_layout['page_width']
        # assumes roughly 13 px per character when mapping pixel widths to columns
        max_chars_in_line = int(page_width / 13)
        for df_index, df in enumerate(dataframes):
            for index, row in df.iterrows():
                if row['text'] is not None and index + 1 < df.shape[0]:
                    extra_spaces = int(row['text_left'] / 13.5)
                    write_str = re.sub(r'^', ' ' * extra_spaces, row['text'])
                    if row['text_top'] != df.iloc[index + 1]['text_top']:
                        if len(write_str) < max_chars_in_line:
                            out_txt_file_write.write("%s\n" % write_str)
                        else:
                            sub_string_list = self.break_large_sentence(write_str, max_chars_in_line)
                            for item in sub_string_list:
                                out_txt_file_write.write("%s\n" % item)
                    else:
                        same_line_index = 0
                        same_line_status = bool(row['text_top'] == df.iloc[index + same_line_index + 1]['text_top'])
                        while same_line_status:
                            onwards_line_space = int((df.iloc[index + same_line_index + 1]['text_width']
                                                      - df.iloc[index]['text_left']
                                                      - df.iloc[index]['text_width']) / 13.5)
                            write_str += ' ' * onwards_line_space + df.iloc[index + same_line_index + 1]['text']
                            same_line_index += 1
                            same_line_status = bool(row['text_top'] == df.iloc[index + same_line_index + 1]['text_top'])
                        out_txt_file_write.write("%s\n" % write_str)
        out_txt_file_write.close()
        log_info("txt file write completed!! filename: %s" % out_translated_txt_filename, MODULE_CONTEXT)
        return out_translated_txt_filename
    except Exception as e:
        log_exception("txt file formation failed", MODULE_CONTEXT, e)
def post(self):
    body = request.json
    log_info('received request for WordSaveResource', AppContext.getContext())
    if body is None:
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getresjson(), 400
    if 'words' not in body:
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getresjson(), 400
    for word in body['words']:
        if word['locale'] != 'en':
            res = CustomResponse(Status.ERR_ENGLISH_MANDATORY_WHILE_SAVING.value, None)
            return res.getresjson(), 400
    result = wordRepo.store(body['words'])
    if result is False:
        res = CustomResponse(Status.ERR_SCHEMA_VALIDATION.value, None)
        return res.getresjson(), 400
    res = CustomResponse(Status.SUCCESS.value, None)
    return res.getres()
def run(self):
    obj = {"metadata": {"module": module_name}}
    rand_str = ''.join(random.choice(string.ascii_letters) for i in range(4))
    prefix = "TranslatorJobsCleaner(" + rand_str + ")"
    log_info(prefix + " -- AJM Deployed, TranslatorJobsCleaner running......", obj)
    translator_utils = TranslatorCronUtils()
    run = 0
    while not self.stopped.wait(eval(str(jc_cron_interval_sec))):
        try:
            records = translator_utils.find_all(False)
            deleted = 0
            for record in records:
                try:
                    job_start_time = record["transInput"]["taskStartTime"]
                    # both values are 13-digit millisecond timestamps
                    diff = eval(str(time.time()).replace('.', '')[0:13]) - job_start_time
                    if (diff / 1000) > eval(str(jc_job_delete_interval_sec)):
                        translator_utils.delete(record["jobID"])
                        translator_utils.delete_batches(record["jobID"])
                        translator_utils.delete_pages(record["recordID"])
                        deleted += 1
                except Exception as e:
                    log_exception(prefix + " -- Exception in JobsCleaner for record: " + record["recordID"],
                                  record["transInput"], e)
                    log_exception(prefix + " -- Exception - " + str(e), record["transInput"], e)
                    continue
            log_info(prefix + " -- Run: " + str(run) + " | Deleted: " + str(deleted), obj)
            run += 1
        except Exception as e:
            log_exception(prefix + " -- Run: " + str(run) + " | Exception: " + str(e), obj, e)
            run += 1
def nonwf_response(self):
    log_info("non workflow response started the response generation",
             app_context.application_context)
    input_files = self.json_data['files']
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    try:
        error_validator.inputfile_list_empty(input_files)
        output_file_response = list()
        for item in input_files:
            input_filename, in_file_type, in_locale = file_ops.accessing_files(item)
            output_json_data = DocumentStructure(None, input_filename)
            output_filename_json = file_ops.writing_json_file(input_filename, output_json_data,
                                                              self.DOWNLOAD_FOLDER)
            file_res = file_ops.one_filename_response(input_filename, output_filename_json,
                                                      in_locale, in_file_type)
            output_file_response.append(file_res)
        response_true = Status.SUCCESS.value
        response_true['output'] = output_file_response
        log_info("non workflow_response successfully generated response for rest server",
                 app_context.application_context)
        response_true = copy.deepcopy(response_true)
        return response_true
    except FileErrors as e:
        response_custom = Status.ERR_STATUS.value
        response_custom['message'] = e.message
        response = file_ops.error_handler(response_custom, e.code, False)
        log_exception("non workflow_response some error occurred while validating file",
                      app_context.application_context, e)
        response = copy.deepcopy(response)
        return response
    except ServiceError as e:
        response_custom = Status.ERR_STATUS.value
        response_custom['message'] = str(e)
        response = file_ops.error_handler(response_custom, "SERVICE_ERROR", False)
        log_exception("non workflow_response something went wrong during pdf to block conversion.",
                      app_context.application_context, e)
        response = copy.deepcopy(response)
        return response
def post(self):
    json_data = request.get_json(force=True)
    app_context.init()
    app_context.application_context = json_data
    log_info("Resource Layout_Detector_WF Layout_Detector service started",
             app_context.application_context)
    task_id = str("LD-" + str(time.time()).replace('.', '')[0:13])
    task_starttime = eval(str(time.time()).replace('.', '')[0:13])
    try:
        error_validator = ValidationResponse(DOWNLOAD_FOLDER)
        if error_validator.format_error(json_data) is True:
            response_gen = Response(json_data, DOWNLOAD_FOLDER)
            response = response_gen.workflow_response(task_id, task_starttime)
            log_info("Resource Layout_Detector_WF Layout_Detector api response completed",
                     app_context.application_context)
            return jsonify(response)
    except FormatError as e:
        log_error("Resource Layout_Detector_WF Input json format is not correct or dict_key is missing",
                  app_context.application_context, e)
        return Status.ERR_request_input_format.value
def process_info(app_context, base_dir):
    try:
        files = get_files(app_context.application_context)
        file_images = []
        output = []
        for index, file_new in enumerate(files):
            start_time = time.time()
            file = get_json(file_new['file']['name'], base_dir)[0]
            file_properties = File(file)
            ocr_level, lang = get_ocr_config(file_new, file_properties.get_pages())
            file = preprocess_file(file_properties, lang, ocr_level)
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            output.append(file)
            output[index]['status'] = {'code': 200, 'message': "tesseract ocr successful"}
            end_time = time.time()
            extraction_time = (end_time - start_time) / len(file_properties.get_pages())
            log_info('tesseract ocr per page completed in {}'.format(extraction_time),
                     app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed tesseract ocr", None)
    except Exception as e:
        log_exception("Error occurred during tesseract ocr ",
                      app_context.application_context, e)
        return None
    return app_context.application_context
def get_nmt_url_body(self, text_translate_input, text_for_nmt):
    model = text_translate_input["input"]["model"]
    text_nmt = []
    for text in text_for_nmt:
        text_nmt.append({
            "s_id": text["s_id"],
            "id": model["model_id"],
            "src": text["src"],
            "target_prefix": text["taggedPrefix"]
        })
    try:
        host = model["connection_details"]["interactive"]["host"]
        api_host = os.environ.get(host, 'NA')
        endpoint = model["connection_details"]["interactive"]["api_endpoint"]
        api_endpoint = os.environ.get(endpoint, 'NA')
        if api_host == "NA" or api_endpoint == "NA":
            log_info("Falling back to Anuvaad NMT translate URL....", text_translate_input)
            return nmt_it_url, text_nmt
        url = api_host + api_endpoint
        return url, text_nmt
    except Exception as e:
        log_exception("Exception while fetching API conn details: {}".format(str(e)),
                      text_translate_input, None)
        log_info("Falling back to Anuvaad NMT translate URL....", text_translate_input)
        return nmt_it_url, text_nmt
def encode_translate_decode(input_sentence_array_prepd, sp_encoder, translator, sp_decoder,
                            max_batch_size, batch_type, input_subwords_list,
                            output_subwords_list, score_list):
    try:
        log_info("Inside encode_translate_decode function", MODULE_CONTEXT)
        start_encoding = time.time()
        input_subwords_list = [str(sp.encode_line(sp_encoder, sent))
                               for sent in input_sentence_array_prepd]
        input_final_array = [format_converter(input_subwords)
                             for input_subwords in input_subwords_list]
        time_encoding = time.time() - start_encoding
        start_translating = time.time()
        m_out = translator.translate_batch(input_final_array, beam_size=5, num_hypotheses=1,
                                           max_batch_size=max_batch_size, batch_type=batch_type)
        time_translating = time.time() - start_translating
        translation_array = [None] * len(output_subwords_list)
        start_decoding = time.time()
        for i, _ in enumerate(output_subwords_list):
            output_subwords_list[i] = " ".join(m_out[i][0]['tokens'])
            score_list[i] = m_out[i][0]['score']
            translation_array[i] = multiple_hypothesis_decoding(m_out[i], sp_decoder)[0]
        time_decoding = time.time() - start_decoding
        return translation_array, input_subwords_list, output_subwords_list, score_list, \
            time_encoding, time_translating, time_decoding
    except ServerModelError as e:
        log_exception("ServerModelError error in encode_translate_decode: {} and {}".format(
            e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
    except Exception as e:
        log_exception("Unexpected error in encode_translate_decode: {} and {}".format(
            e, sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
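# Shape assumed for m_out above: translate_batch returning, per input sentence,
# a list of hypotheses where each hypothesis is a dict with 'tokens' and
# 'score' keys (the pre-2.x ctranslate2 return format). Roughly:
#
#   m_out[0] == [{'tokens': ['▁नम', 'स्ते'], 'score': -0.42}]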
def get_hdfs(in_dfs, header_region, footer_region):
    start_time = time.time()
    try:
        pages = len(in_dfs)
        multiple_pages = False
        if pages > 1:
            multiple_pages = True
        h_dfs = []
        document_configs = config.DOCUMENT_CONFIGS
        for page_index in range(pages):
            page_df = in_dfs[page_index]
            if multiple_pages:
                page_df = tag_heaader_footer_attrib(header_region, footer_region, page_df)
            h_df = merge_horizontal_blocks(page_df, document_configs, debug=False)
            h_dfs.append(h_df)
    except Exception as e:
        log_error('Error in creating h_dfs ' + str(e), app_context.application_context, e)
        return None
    end_time = time.time()
    elapsed_time = end_time - start_time
    log_info('Processing of get_hdfs completed in {}/{}, average per page {}'.format(
        elapsed_time, len(in_dfs), (elapsed_time / len(in_dfs))),
        app_context.application_context)
    return h_dfs