def validate_tool_response(self, tool_response, tool_details, wf_input):
    if not tool_response:
        log_error("Error from the tool: " + str(tool_details["name"]), wf_input, None)
        error = post_error("ERROR_FROM_TOOL", "Error from the tool: " + str(tool_details["name"]), None)
        client_output = self.get_wf_details_sync(wf_input, None, True, error)
        self.update_job_details(client_output, False)
        log_info("Job FAILED, jobID: " + str(wf_input["jobID"]), wf_input)
        return client_output
    else:
        fail_msg = None
        if 'error' in tool_response.keys():
            if tool_response["error"]:
                fail_msg = "Error from the tool: " + str(tool_details["name"]) + " | Cause: " + str(tool_response["error"])
        elif 'http' in tool_response.keys():
            if 'status' in tool_response["http"]:
                if tool_response["http"]["status"] != 200:
                    fail_msg = "Error from the tool: " + str(tool_details["name"]) + " | Cause: " + str(tool_response["why"])
        if fail_msg:
            log_error(fail_msg, wf_input, None)
            error = post_error("ERROR_FROM_TOOL", fail_msg, None)
            client_output = self.get_wf_details_sync(wf_input, None, True, error)
            self.update_job_details(client_output, False)
            log_info("Job FAILED, jobID: " + str(wf_input["jobID"]), wf_input)
            return client_output
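# Illustrative only: assumed tool_response shapes that exercise each branch of
# validate_tool_response above. A failing tool either sets an explicit "error"
# field, or returns a non-200 "http" status (with the reason assumed under "why").
failing_by_error = {"error": "model inference timed out"}
failing_by_http = {"http": {"status": 500}, "why": "internal server error"}
healthy_response = {"http": {"status": 200}, "output": {}}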
def page_processor(self, page, record_id, file, tmx_present, nonmt_user, tmx_file_cache, translate_wf_input):
    batches, pw_dict, bw_data = self.fetch_batches_of_sentences(file, record_id, page, tmx_present, tmx_file_cache, False, translate_wf_input)
    batches_count, sentences_count, tmx_count = 0, 0, 0
    if not batches:
        log_error("No batches obtained for page: " + str(page["page_no"]), translate_wf_input, None)
        return batches_count, sentences_count, tmx_count
    batches_count, tmx_count = len(batches), pw_dict["tmx_count"]
    # So that all batches of a page go to the same consumer
    partition = random.choice(list(range(0, total_no_of_partitions)))
    topic = self.get_nmt_in_topic(translate_wf_input, file)
    for batch_id in batches.keys():
        batch = batches[batch_id]
        record_id_enhanced = record_id + "|" + str(len(batch))
        nmt_in = {"record_id": record_id_enhanced, "id": file["model"]["model_id"], "message": batch}
        if nonmt_user:
            producer.produce(nmt_in, anu_translator_nonmt_topic, partition)
        else:
            producer.produce(nmt_in, topic, partition)
        log_info("B_ID: " + batch_id + " | SENTENCES: " + str(len(batch)) + " | COMPUTED: " + str(bw_data[batch_id]["computed"]) + " | TMX: " + str(bw_data[batch_id]["tmx_count"]), translate_wf_input)
        sentences_count += len(batch)
    return batches_count, sentences_count, tmx_count
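# Minimal sketch of the "sticky partition" choice in page_processor: Kafka
# preserves ordering only within a partition, so picking one random partition
# per page keeps all of that page's batches on the same consumer, in order.
# `num_partitions` is a stand-in for the configured total_no_of_partitions.
import random

num_partitions = 6                                   # assumption
partition = random.choice(list(range(0, num_partitions)))
# ...each producer.produce(...) call for the page then reuses `partition`.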
def tesseract_ocr(pdf_image_paths, desired_width, desired_height, dfs, lang, jobid):
    log_info("Service ocr_text_utilities", "tesseract ocr started ===>", jobid)
    try:
        start_time = time.time()
        ocr_dfs = []
        for i, df in enumerate(dfs):
            filepath = pdf_image_paths[i]
            df_updated = extract_text_from_image(filepath, desired_width, desired_height, df, lang)
            ocr_dfs.append(df_updated)
            log_info("Service ocr_text_utilities", "OCR completed on page: " + str(i), jobid)
        extraction_time = time.time() - start_time
    except Exception as e:
        log_error("Service ocr_text_utilities", "Error in tesseract ocr", jobid, e)
        return None
    log_info("Service ocr_text_utilities", "tesseract ocr successfully completed", jobid)
    return ocr_dfs
def get_pdfs(page_dfs, lang):
    start_time = time.time()
    try:
        p_dfs = []
        pages = len(page_dfs)
        block_configs = config.BLOCK_CONFIGS
        for page_index in range(pages):
            page_df = page_dfs[page_index]
            cols = page_df.columns.values.tolist()
            df = pd.DataFrame(columns=cols)
            for index, row in page_df.iterrows():
                if row['children'] is None:
                    d_tmp = page_df.iloc[index]
                    d_tmp['avg_line_height'] = int(d_tmp['text_height'])
                    df = df.append(d_tmp)
                else:
                    dfs = process_block(page_df.iloc[index], block_configs, lang)
                    df = df.append(dfs)
            p_dfs.append(df)
    except Exception as e:
        log_error('Error in creating p_dfs', app_context.application_context, e)
        return None
    elapsed_time = time.time() - start_time
    log_info('Processing of get_pdfs completed in {}/{}, average per page {}'.format(elapsed_time, len(p_dfs), (elapsed_time / len(p_dfs))), app_context.application_context)
    return p_dfs
def post(self):
    json_data = request.get_json(force=True)
    app_context.init()
    app_context.application_context = json_data
    log_info("Resource Layout_Detector_WF Layout_Detector service started", app_context.application_context)
    task_id = str("LD-" + str(time.time()).replace('.', '')[0:13])
    task_starttime = int(str(time.time()).replace('.', '')[0:13])
    try:
        error_validator = ValidationResponse(DOWNLOAD_FOLDER)
        if error_validator.format_error(json_data) is True:
            response_gen = Response(json_data, DOWNLOAD_FOLDER)
            response = response_gen.workflow_response(task_id, task_starttime)
            log_info("Resource Layout_Detector_WF Layout_Detector api response completed", app_context.application_context)
            return jsonify(response)
    except FormatError as e:
        log_error("Resource Layout_Detector_WF Input json format is not correct or dict_key is missing", app_context.application_context, e)
        return Status.ERR_request_input_format.value
def extract_pdf_metadata(filename, working_dir, base_dir):
    start_time = time.time()
    pdf_filepath = os.path.join(base_dir, filename)
    log_info('filepath {}, working_dir {}'.format(pdf_filepath, working_dir), app_context.application_context)
    try:
        pdf_image_paths = extract_image_paths_from_pdf(pdf_filepath, working_dir)
        pdf_xml_filepath = extract_xml_path_from_digital_pdf(pdf_filepath, working_dir)
    except Exception as e:
        log_error('error extracting xml information of {}'.format(pdf_filepath), app_context.application_context, e)
        return None, None, None
    log_info('Extracting xml of {}'.format(pdf_filepath), app_context.application_context)
    try:
        pdf_bg_img_filepaths = extract_html_bg_image_paths_from_digital_pdf(pdf_filepath, working_dir)
    except Exception as e:
        log_error('unable to extract background images of {}'.format(pdf_filepath), app_context.application_context, e)
        return None, None, None
    log_info('Extracting background images of {}'.format(pdf_filepath), app_context.application_context)
    end_time = time.time()
    extraction_time = end_time - start_time
    log_info('Extraction of {} completed in {}'.format(pdf_filepath, extraction_time), app_context.application_context)
    return pdf_xml_filepath, pdf_image_paths, pdf_bg_img_filepaths
def extract_pdf_metadata(filename, working_dir, base_dir, jobid):
    start_time = time.time()
    pdf_filepath = Path(os.path.join(base_dir, filename))
    try:
        pdf_image_paths = extract_image_paths_from_pdf(pdf_filepath, working_dir)
        pdf_xml_dir = extract_xml_from_digital_pdf(pdf_filepath, working_dir)
    except Exception as e:
        log_error("Service xml_utils", "Error in extracting xml", jobid, e)
    try:
        os.system('pdftohtml -c ' + str(pdf_filepath) + ' ' + str(working_dir) + '/')
    except Exception as e:
        log_error("Service get_xml", "Error in extracting html", jobid, e)
    # try:
    #     pdf_bg_image_dir = extract_html_bg_images_from_digital_pdf(pdf_filepath, working_dir)
    # except Exception as e:
    #     log_error("Service xml_utils", "Error in extracting html of bg images", jobid, e)
    end_time = time.time()
    extraction_time = end_time - start_time
    xml_files = read_directory_files(pdf_xml_dir, pattern='*.xml')
    bg_files = None  # read_directory_files(pdf_bg_image_dir, pattern='*.png')
    log_info('Service get_xml', 'Successfully extracted xml, background images of file:', jobid)
    return xml_files, bg_files, pdf_image_paths
def extract_images(app_context, base_dir):
    files = get_files(app_context.application_context)
    file_images = []
    try:
        for file in files:
            file_properties = File(file)
            file_format = file_properties.get_format()
            if file_format in ['PDF', 'pdf']:
                filename = file_properties.get_name()
                image_paths = extract_pdf_images(filename, base_dir)
                file_images.append(image_paths)
            elif file_format in ['PNG', 'JPEG', 'BMP', 'jpg', 'png', 'bmp', 'jpeg']:
                filename = file_properties.get_name()
                image_paths = [os.path.join(base_dir, filename)]
                file_images.append(image_paths)
            else:
                log_info("currently we do not support {} files.".format(file_format), app_context.application_context)
                return None
    except Exception as e:
        log_error('error extracting images of ' + str(e), app_context.application_context, e)
        return None
    return file_images
def prepocess_pdf_regions(pdf_data, flags, config=preprocess_config):
    xml_dfs = pdf_data['in_dfs']
    # if flags['doc_class'] == 'class_1':
    page_height = pdf_data['page_height']
    # else:
    #     page_height = pdf_data['pdf_image_height']
    try:
        start_time = time.time()
        header_region = find_header(xml_dfs, page_height, config)
        footer_region = find_footer(xml_dfs, page_height, config)
        end_time = time.time() - start_time
        log_info('Header Footer detection completed successfully in time {} '.format(end_time), app_context.application_context)
        log_info('Footers found {} '.format(len(footer_region)), app_context.application_context)
        log_info('Headers found {}'.format(len(header_region)), app_context.application_context)
        pdf_data['header_region'], pdf_data['footer_region'] = header_region, footer_region
        return pdf_data
    except Exception as e:
        log_error('Error in finding header/footer ' + str(e), app_context.application_context, e)
        pdf_data['header_region'], pdf_data['footer_region'] = pd.DataFrame(), pd.DataFrame()
        return pdf_data
def extract_pdf_images(filename, base_dir):
    start_time = time.time()
    working_dir, ret = create_pdf_processing_paths(filename, base_dir)
    pdf_filepath = os.path.join(base_dir, filename)
    log_info('filepath {}, working_dir {}'.format(pdf_filepath, working_dir), app_context.application_context)
    try:
        pdf_image_paths = extract_image_paths_from_pdf(pdf_filepath, working_dir)
        log_info('Extracting images of {}'.format(pdf_filepath), app_context.application_context)
    except Exception as e:
        log_error('error extracting images of {}'.format(pdf_filepath), app_context.application_context, e)
        return None
    end_time = time.time()
    extraction_time = end_time - start_time
    log_info('Extraction of {} completed in {}'.format(pdf_filepath, extraction_time), app_context.application_context)
    return pdf_image_paths
def call_api(self, uri, method, api_input, params, user_id):
    try:
        log_info("URI: " + uri, None)
        response = None
        if method == "POST":
            api_headers = {'userid': user_id, 'x-user-id': user_id, 'Content-Type': 'application/json'}
            response = requests.post(url=uri, json=api_input, headers=api_headers)
        elif method == "GET":
            api_headers = {'userid': user_id}
            response = requests.get(url=uri, params=params, headers=api_headers)
        if response is not None:
            if response.text is not None:
                log_info(response.text, None)
                return json.loads(response.text)
            else:
                log_error("API response was None, URI: " + str(uri), api_input, None)
                return None
        else:
            log_error("API call failed! URI: " + str(uri), api_input, None)
            return None
    except Exception as e:
        log_exception("Exception while making the api call: " + str(e), api_input, e)
        return None
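# Illustrative usage of call_api (hypothetical endpoint and payload; only the
# POST/GET header behavior mirrors the method above). `client` is assumed to
# be an instance of the class that defines call_api.
def translate_via_api(client):
    result = client.call_api(
        uri="http://localhost:5001/v0/translate",   # assumption: a local service
        method="POST",
        api_input={"sentences": ["Hello world"]},   # sent as the JSON body
        params=None,                                # used only for GET requests
        user_id="test-user")
    return result                                   # dict parsed from JSON, or None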
def process_tokenization_kf():
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)
    # instantiation of consumer for respective topic
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_tokenization_kf : trying to receive value from consumer ", None)
        for msg in consumer:
            data = msg.value
            log_info("process_tokenization_kf : received input json from input topic consumer ", data)
            task_id = str("TOK-" + str(time.time()).replace('.', '')[0:13])
            task_starttime = int(str(time.time()).replace('.', '')[0:13])
            input_files, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            if "errorID" not in file_value_response.keys():
                producer = Producer()
                producer.push_data_to_queue(config.output_topic, file_value_response, data, task_id)
            else:
                log_error("process_tokenization_kf : error send to error handler", data, None)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_tokenization_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_tokenization_kf : response send to topic %s" % (config.output_topic), data, e)
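# A minimal sketch of what the Consumer wrapper above is assumed to do, using
# kafka-python. The names (input_topic, bootstrap_server) mirror the config
# keys used in process_tokenization_kf; the actual wrapper may differ.
import json
from kafka import KafkaConsumer

def make_consumer(topic, bootstrap_server):
    # Deserialize each Kafka message payload from UTF-8 JSON into a dict,
    # so the processing loop can read msg.value directly.
    return KafkaConsumer(
        topic,
        bootstrap_servers=[bootstrap_server],
        value_deserializer=lambda m: json.loads(m.decode('utf-8')))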
def start_kafka():
    try:
        t1 = threading.Thread(target=process_fc_kf, name='keep_on_running')
        t1.start()
        log_info("multithread : Kafka running on multithread", None)
    except Exception as e:
        log_error("multithread : Error while running custom threads", None, e)
def doc_pre_processing(filename, base_dir, jobid):
    '''
    Preprocessing on input pdf to get:
        - xml files
        - images
        - background images
        - header and footer regions
    '''
    log_info("Service main", "document preprocessing started ===>", jobid)
    img_dfs, xml_dfs, page_width, page_height, working_dir, pdf_image_paths = get_xml.process_input_pdf(filename, base_dir, jobid)
    multiple_pages = False
    pages = len(xml_dfs)
    if pages > 1:
        multiple_pages = True
    header_region, footer_region = None, None
    try:
        header_region, footer_region = prepocess_pdf_regions(xml_dfs, page_height)
    except Exception as e:
        log_error("Service prepocess", "Error in finding footer and header region", jobid, e)
    log_info("Service main", "document preprocessing successfully completed", jobid)
    return img_dfs, xml_dfs, pages, working_dir, header_region, footer_region, multiple_pages, page_width, page_height, pdf_image_paths
def df_to_json(p_df):
    page_data = []
    try:
        p_df = p_df.where(p_df.notnull(), None)
        if len(p_df) > 0:
            p_df = get_xml.drop_cols(p_df)
            for index, row in p_df.iterrows():
                block = row.to_dict()
                for key in block.keys():
                    if key not in ['text', 'children']:
                        try:
                            block[key] = int(block[key])
                        except Exception:
                            pass
                if block['attrib'] == "TABLE":
                    pass
                else:
                    if 'children' in list(block.keys()):
                        if block['children'] is None:
                            pass
                        else:
                            block['children'] = df_to_json(pd.read_json(row['children']))
                page_data.append(block)
        return page_data
    except Exception as e:
        log_error("Service get_response", "Error in converting dataframe to json", None, e)
        return None
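# Illustrative only: a tiny DataFrame in the assumed block layout consumed by
# df_to_json. 'children' is assumed to hold nested blocks serialized as a JSON
# string, which the function re-parses with pd.read_json and recurses into.
import pandas as pd

demo_df = pd.DataFrame([{
    "attrib": "PARA",
    "text": "Hello",
    "text_top": 10,
    "text_height": 12,
    "children": '[{"attrib": "LINE", "text": "Hello", "children": null}]',
}])
# df_to_json(demo_df) would yield one block dict whose 'children' key is the
# recursively converted list of child blocks.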
def get_hdfs(in_dfs, header_region, footer_region):
    start_time = time.time()
    try:
        pages = len(in_dfs)
        multiple_pages = False
        if pages > 1:
            multiple_pages = True
        h_dfs = []
        document_configs = config.DOCUMENT_CONFIGS
        for page_index in range(pages):
            page_df = in_dfs[page_index]
            if multiple_pages:
                page_df = tag_heaader_footer_attrib(header_region, footer_region, page_df)
            h_df = merge_horizontal_blocks(page_df, document_configs, debug=False)
            h_dfs.append(h_df)
    except Exception as e:
        log_error('Error in creating h_dfs ' + str(e), app_context.application_context, e)
        return None
    elapsed_time = time.time() - start_time
    log_info('Processing of get_hdfs completed in {}/{}, average per page {}'.format(elapsed_time, len(in_dfs), (elapsed_time / len(in_dfs))), app_context.application_context)
    return h_dfs
def resize_image(image_paths):
    '''
    Google OCR will not process an image if it has more than 65M pixels
    '''
    max_res = 65_000_000
    try:
        if image_paths is not None and len(image_paths) > 0:
            for path in image_paths:
                img = cv2.imread(path)
                img_res = img.shape[0] * img.shape[1]
                if img_res >= max_res:
                    log_info("Resolution of pdf too high, scaling down to enable OCR", app_context.application_context)
                    scaling_factor = math.sqrt(max_res / img_res)
                    img = cv2.resize(img, None, fx=scaling_factor, fy=scaling_factor, interpolation=cv2.INTER_AREA)
                    cv2.imwrite(path, img)
    except Exception as e:
        log_error('error in resizing images ' + str(e), app_context.application_context, e)
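# Worked example (illustrative values) of the scaling math above: the new area
# equals img_res * scaling_factor**2 == max_res, so the result just fits the limit.
import math

max_res = 65_000_000
h, w = 10_000, 8_000                             # 80M pixels, over the limit
scaling_factor = math.sqrt(max_res / (h * w))    # ~0.9014
new_h, new_w = int(h * scaling_factor), int(w * scaling_factor)
assert new_h * new_w <= max_res                  # ~64.99M pixels after resize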
def start_kafka():
    try:
        t1 = threading.Thread(target=process_pdf_kf, name='keep_on_running')
        t1.start()
        log_info("multithread", "Kafka running on multithread", None)
    except Exception as e:
        log_error("threading", "ERROR WHILE RUNNING CUSTOM THREADS ", None, e)
def children_condition(children):
    try:
        return merge_children(children)
    except Exception as e:
        log_error("Error breaking regions type1 " + str(e), app_context.application_context, e)
        return None
def fetch_batches_of_blocks(self, record_id, page_no, text_blocks, file, tmx_present, tmx_file_cache, third_party, translate_wf_input):
    batch_id, tmx_count, computed = str(uuid.uuid4()), 0, 0
    sentences_for_trans, batch_wise_tmx, bw_tmx_count, bw_computed_count = {}, {}, 0, 0
    for block in text_blocks:
        block_id = block["block_id"]
        if 'tokenized_sentences' in block.keys():
            for sentence in block["tokenized_sentences"]:
                tmx_phrases = []
                if tmx_present:
                    tmx_phrases, res_dict = self.fetch_tmx(sentence["src"], file, tmx_present, tmx_file_cache, translate_wf_input)
                    bw_tmx_count += len(tmx_phrases)
                    bw_computed_count += res_dict["computed"]
                node_id = str(record_id) + "|" + str(page_no) + "|" + str(block_id)
                if not third_party:
                    sent_nmt_in = {"src": sentence["src"], "s_id": sentence["s_id"], "n_id": node_id, "batch_id": batch_id, "tmx_phrases": tmx_phrases}
                else:
                    s_id = node_id + "xxx" + batch_id + "xxx" + sentence["s_id"]
                    sent_nmt_in = {"src": sentence["src"], "s_id": s_id, "tmx_phrases": tmx_phrases}
                if batch_id in sentences_for_trans.keys():
                    sentence_list = sentences_for_trans[batch_id]
                    sentence_list.append(sent_nmt_in)
                    sentences_for_trans[batch_id] = sentence_list
                else:
                    sentence_list = [sent_nmt_in]
                    sentences_for_trans[batch_id] = sentence_list
                batch_wise_tmx[batch_id] = {"tmx_count": bw_tmx_count, "computed": bw_computed_count}
                if len(sentences_for_trans[batch_id]) == nmt_max_batch_size:
                    batch_id, bw_tmx_count, bw_computed_count = str(uuid.uuid4()), 0, 0
        else:
            log_error("There are no tokenised sentences in block: " + str(block_id), translate_wf_input, None)
            continue
    for batch in batch_wise_tmx.keys():
        tmx_count += batch_wise_tmx[batch]["tmx_count"]
    return sentences_for_trans, {"tmx_count": tmx_count}, batch_wise_tmx
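# Illustrative only (made-up values): how fetch_batches_of_blocks composes ids.
# node_id is record_id|page_no|block_id; for third-party NMT the sentence id
# additionally embeds the batch id as node_id + "xxx" + batch_id + "xxx" + s_id.
record_id, page_no, block_id = "REC-1", 3, 7
node_id = str(record_id) + "|" + str(page_no) + "|" + str(block_id)  # "REC-1|3|7"
s_id = node_id + "xxx" + "b7e2-batch" + "xxx" + "42"                 # third-party form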
def start_kafka():
    try:
        t1 = threading.Thread(target=kf_service.process_kf_request_payload, name='sentence-grader-kafka-worker-thread')
        t1.start()
        log_info("starting kafka consumer thread", LOG_WITHOUT_CONTEXT)
    except Exception as e:
        log_error("threading ERROR WHILE RUNNING CUSTOM THREADS ", LOG_WITHOUT_CONTEXT, e)
def process_bg_image(bg_img):
    bg_image_data = []
    try:
        with open(bg_img, "rb") as img_file:
            img_base64 = base64.b64encode(img_file.read())
        img_base64 = img_base64.decode('ascii')
        bg_image_data.append(img_base64)
        return bg_image_data
    except Exception as e:
        log_error("Service get_response", "Error in processing bg_image", None, e)
        return None
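# Round-trip check (illustrative): the ascii base64 string produced above can
# be decoded back to the original image bytes with the standard library.
import base64

def decode_bg_image(img_base64, out_path):
    # Write the decoded bytes back to disk; the inverse of process_bg_image.
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(img_base64))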
def break_block(v_block):
    try:
        block_configs = config.BLOCK_CONFIGS
        if v_block['children'] is not None and len(v_block['children']) < 2:
            return [v_block]
        else:
            return break_paragraph(v_block, block_configs)
    except Exception as e:
        log_error('Error in breaking blocks ' + str(e), app_context.application_context, e)
        return None
def get_vdfs(pages, h_dfs, document_configs, debug=False):
    v_dfs = []
    try:
        for page_index in range(pages):
            h_df = h_dfs[page_index]
            v_df = merge_vertical_blocks(h_df, document_configs, debug=debug)
            v_dfs.append(v_df)
    except Exception as e:
        log_error("Service get_xml", "Error in creating v_dfs", None, e)
    return v_dfs
def mask_image(image, regions, image_width, image_height, input_json, margin=0, fill=255):
    for table in regions:
        try:
            row_top, row_bottom, row_left, row_right = end_point_correction(table, 2, image_height, image_width)
            if len(image.shape) == 2:
                image[row_top - margin: row_bottom + margin, row_left - margin: row_right + margin] = fill
            if len(image.shape) == 3:
                image[row_top - margin: row_bottom + margin, row_left - margin: row_right + margin, :] = fill
        except Exception as e:
            log_error("Service TableExtractor Error in masking bg image" + str(e), input_json, e)
            return image
    return image
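# Illustrative only: masking a region by numpy slice-assignment, as mask_image
# does. A grayscale (2-D) image takes one slice; a color (3-D) image fills all
# channels via the trailing ":".
import numpy as np

img = np.zeros((100, 100), dtype=np.uint8)
img[10:20, 30:50] = 255   # fills rows 10..19, cols 30..49 with white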
def block_translate():
    service = BlockTranslationService()
    validator = TranslatorValidator()
    data = request.get_json()
    error = validator.validate_block_translate(data)
    if error is not None:
        log_error("Error in Block Translate: " + str(error), data, None)
        log_error("Input: " + str(data), data, None)
        data["state"], data["status"], data["error"] = "TRANSLATED", "FAILED", error
        return data, 400
    response = service.block_translate(data)
    return jsonify(response), 200
def post(self): log_info("Resource Pdf2HtmlConversion", "pdf2html service started", None) json_data = request.get_json(force=True) try: error_validator = ValidationResponse(DOWNLOAD_FOLDER) if error_validator.format_error(json_data) is True: response_gen = Response(json_data, DOWNLOAD_FOLDER) response = response_gen.nonwf_response() log_info("Resource Pdf2HtmlConversion", "pdf2html api response completed", None) return jsonify(response) except FormatError as e: log_error("Resource Pdf2HtmlConversion", "Input json format is not correct or dict_key is missing", None, e) return Status.ERR_request_input_format.value
def create_pdf_processing_paths(filepath, base_dir):
    filename = os.path.basename(filepath)
    working_dir = os.path.join(base_dir, os.path.splitext(filename)[0] + '_' + str(uuid.uuid1()))
    ret = create_directory(working_dir)
    if not ret:
        log_error('unable to create working directory {}'.format(working_dir), app_context.application_context, None)
        return None, False
    log_info('created processing directories successfully {}'.format(working_dir), app_context.application_context)
    return working_dir, True
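# Illustrative usage (hypothetical paths): each input PDF gets its own working
# directory named <pdf-stem>_<uuid1> under base_dir, so concurrent jobs on the
# same file name cannot collide.
working_dir, ok = create_pdf_processing_paths('sample.pdf', '/tmp/uploads')
if ok:
    print(working_dir)   # e.g. /tmp/uploads/sample_<uuid1>, suffix varies per run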
def start_kafka():
    try:
        t1 = threading.Thread(target=process_vision_ocr_kf, name='vision_ocr-consumer-thread')
        t1.start()
        log_info("multithread Kafka running on multithread", LOG_WITHOUT_CONTEXT)
        t2 = threading.Thread(target=vision_ocr_request_worker, name='vision_ocr-worker-thread')
        t2.start()
        log_info("Starting vision_ocr_request_worker", LOG_WITHOUT_CONTEXT)
    except Exception as e:
        log_error("threading ERROR WHILE RUNNING CUSTOM THREADS ", LOG_WITHOUT_CONTEXT, e)
def start_kafka():
    try:
        t1 = threading.Thread(target=process_block_merger_kf, name='BM-consumer-thread')
        t1.start()
        log_info("multithread Kafka running on multithread", None)
        t2 = threading.Thread(target=block_merger_request_worker, name='BM-worker-thread')
        t2.start()
        log_info("Starting block_merger_request_worker", None)
    except Exception as e:
        log_error("threading ERROR WHILE RUNNING CUSTOM THREADS ", None, e)