def consume():
    try:
        topics = [anu_translator_input_topic, anu_translator_nonmt_topic]
        consumer = instantiate(topics)
        service = TranslatorService()
        validator = TranslatorValidator()
        rand_str = ''.join(random.choice(string.ascii_letters) for _ in range(4))
        prefix = "Translator-Core-(" + rand_str + ")"
        log_info(prefix + " Running..........", None)
        while True:
            for msg in consumer:
                data = {}
                try:
                    data = msg.value
                    if data:
                        if msg.topic == anu_translator_nonmt_topic:
                            service.process_no_nmt_jobs(data)
                        else:
                            log_info(prefix + " | Received on Topic: " + msg.topic +
                                     " | Partition: " + str(msg.partition), data)
                            error = validator.validate_wf(data, False)
                            if error is not None:
                                log_error(prefix + " | Error: " + str(error), data, error)
                                log_info(prefix + " | Input: " + str(data), data)
                                post_error_wf(error["code"], error["message"], data, None)
                                break
                            service.start_file_translation(data)
                    else:
                        break
                except Exception as e:
                    log_exception(prefix + " Exception in translator while consuming: " + str(e),
                                  data, e)
                    post_error("TRANSLATOR_CONSUMER_ERROR",
                               "Exception in translator while consuming: " + str(e), None)
    except Exception as e:
        log_exception("Exception while starting the translator consumer: " + str(e), None, e)
        post_error("TRANSLATOR_CONSUMER_EXC",
                   "Exception while starting translator consumer: " + str(e), None)
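# A minimal sketch of what the instantiate() helper used by consume() might
# look like, assuming the kafka-python client and JSON-encoded payloads; the
# bootstrap server and group id below are placeholders, not the service's
# real configuration.
import json
from kafka import KafkaConsumer

def instantiate_sketch(topics):
    # Subscribe to all translator topics and decode each payload from JSON,
    # so msg.value is already a dict inside the consume() loop.
    return KafkaConsumer(
        *topics,
        bootstrap_servers=["localhost:9092"],    # placeholder
        group_id="anuvaad-translator",           # placeholder
        value_deserializer=lambda v: json.loads(v.decode("utf-8")),
    )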
def error_handler(self, code, message, object_in, iswf):
    if iswf:
        object_in["state"] = "SENTENCES-ALIGNED"
        object_in["status"] = "FAILED"
        error = post_error_wf(code, message, object_in, None)
    else:
        error = post_error(code, message, None)
    return error
def find_footer(xml_dfs, page_height, preprocess_config):
    pdf_level = []
    try:
        page_df = xml_dfs[0]
    except Exception as e:
        post_error_wf(400, "invalid xml_df passed for preprocessing",
                      app_context.application_context, e)
        return None
    sub_df = cut_page(page_df, page_height,
                      cut_at=preprocess_config['footer_cut'], direction='below')
    sub_df = add_box_coordinates(sub_df)
    margin = preprocess_config['margin']
    for page2_df in xml_dfs:
        s_df = cut_page(page2_df, page_height,
                        cut_at=preprocess_config['footer_cut'], direction='below')
        s_df = add_box_coordinates(s_df)
        page_level = []
        for index1, row1 in sub_df.iterrows():
            iou = 0
            sub_s_df = s_df[(s_df['text_top'] > row1['text_top'] - margin) &
                            (s_df['text_bottom'] < row1['text_bottom'] + margin)]
            # Guard on the filtered frame; the original checked len(sub_df),
            # which is always non-empty inside this loop.
            if len(sub_s_df) > 0:
                for index2, row2 in sub_s_df.iterrows():
                    iou += bb_intersection_over_union(row1, row2)
            page_level.append(iou)
        pdf_level.append(page_level)
    iou_df = pd.DataFrame(pdf_level, columns=sub_df['text'].values)
    check_repetition = iou_df.sum() / len(iou_df)
    regions_to_remove = sub_df[list(check_repetition > preprocess_config['repeat_threshold'])]
    return regions_to_remove
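# A minimal usage sketch for find_footer, assuming xml_dfs is a list of
# per-page DataFrames with text/text_top/text_bottom columns. The config
# values below are illustrative guesses, not the pipeline's defaults.
sample_preprocess_config = {
    "footer_cut": 0.9,          # where cut_page slices the page, direction='below'
    "margin": 5,                # vertical tolerance when matching boxes across pages
    "repeat_threshold": 0.6,    # minimum mean IOU for a region to count as a footer
}
footer_df = find_footer(xml_dfs, page_height=1188,
                        preprocess_config=sample_preprocess_config)
if footer_df is not None and len(footer_df) > 0:
    print(footer_df['text'].values)   # texts of boxes that repeat across pages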
def error_handler(self, object_in, code, iswf):
    if iswf:
        object_in['status'] = "FAILED"
        object_in['state'] = "SENTENCE-TOKENISED"
        return post_error_wf(code, object_in['message'], object_in, None)
    else:
        # Non-workflow errors carry no message here, so an empty one is posted.
        return post_error(code, "", None)
def error_handler(self, object_in, code, iswf):
    if iswf:
        object_in['status'] = "FAILED"
        object_in['state'] = config.TASK_STAT
        return post_error_wf(code, object_in['message'], object_in, None)
    else:
        # Non-workflow errors carry no message here, so an empty one is posted.
        return post_error(code, "", None)
def initiate_wf(self, wf_input):
    try:
        order_of_execution = wfmutils.get_order_of_exc(wf_input["workflowCode"])
        first_step_details = order_of_execution[0]
        first_tool = first_step_details["tool"][0]
        input_topic = first_tool["kafka-input"][0]["topic"]
        first_tool_input = wfmutils.get_tool_input_async(first_tool["name"], None, None, wf_input)
        if first_tool_input is None:
            error = validator.get_error("INCOMPATIBLE_TOOL_SEQUENCE",
                                        "The workflow contains incompatible steps.")
            client_output = self.get_wf_details_async(wf_input, None, True, error)
            self.update_job_details(client_output, False)
            log_error("The workflow contains incompatible steps.", wf_input, None)
            return None
        producer.push_to_queue(first_tool_input, input_topic)
        client_output = self.get_wf_details_async(wf_input, None, False, None)
        self.update_job_details(client_output, False)
        wf_input["metadata"]["module"] = module_wfm_name  # For logging only.
        log_info("Workflow: " + wf_input["workflowCode"] +
                 " initiated for the job: " + wf_input["jobID"], wf_input)
        log_info(first_tool["name"] + log_msg_start + " jobID: " + wf_input["jobID"], wf_input)
    except Exception as e:
        log_exception("Exception while initiating ASYNC workflow: " + str(e), wf_input, e)
        post_error_wf("WFLOW_INITIATE_ERROR",
                      "Exception while initiating workflow: " + str(e), wf_input, e)
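# Shape of the workflow configuration walked by initiate_wf, reconstructed
# from the key accesses above (order_of_execution[0]["tool"][0]
# ["kafka-input"][0]["topic"]); the tool name and topic are hypothetical.
sample_order_of_execution = [
    {"tool": [
        {"name": "TOKENISER",                                     # hypothetical
         "kafka-input": [{"topic": "anuvaad-tokeniser-input"}]},  # hypothetical
    ]},
]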
def process_incoming_request(app_context, request_params, jobId, workflowId):
    try:
        log_info('request received for jobId %s, workflowId %s' % (jobId, workflowId),
                 app_context.application_context)
        log_info(request_params, app_context.application_context)
        validity = Datautils.validate_annotation_input(
            request_params['sourceLanguage'], request_params['targetLanguage'],
            jobId, request_params['annotationType'], request_params['users'],
            request_params['fileInfo'])
        if validity is not None:
            LOG_WITHOUT_CONTEXT['jobID'] = jobId
            log_info('Missing params in annotation task creation | requestparams: {}'
                     .format(str(request_params)), LOG_WITHOUT_CONTEXT)
            post_error_wf('TASK_CREATION_FAILED',
                          'Annotation task creation failed due to missing params',
                          LOG_WITHOUT_CONTEXT, None)
            return None
        create_task = parallelSentenceAnnotationRepo.store(
            request_params['sourceLanguage'], request_params['targetLanguage'],
            jobId, request_params['annotationType'], request_params['users'],
            request_params['fileInfo'], request_params['description'])
        if create_task is False:
            LOG_WITHOUT_CONTEXT['jobID'] = jobId
            log_info('Annotation task creation failed due to file error', LOG_WITHOUT_CONTEXT)
            post_error_wf('TASK_CREATION_FAILED',
                          'Annotation task creation failed due to file error',
                          LOG_WITHOUT_CONTEXT, None)
            return None
    except Exception as e:
        log_exception("Exception : ", app_context.application_context, e)
        return None
    return app_context.application_context
def error_handler(self, object_in, code, iswf):
    if iswf:
        # The identifiers below are read but unused; they double as a sanity
        # check that the payload carries the expected workflow fields.
        job_id = object_in["jobID"]
        task_id = object_in["taskID"]
        state = object_in['state']
        status = object_in['status']
        message = object_in['message']
        return post_error_wf(code, message, object_in, None)
    else:
        code = object_in['error']['code']
        message = object_in['error']['message']
        return post_error(code, message, None)
def post(self):
    body = request.get_json()
    required = ('annotationType', 'sourceLanguage', 'targetLanguage',
                'fileInfo', 'users', 'description')
    if any(key not in body for key in required):
        # 'jobId' itself is not in the required set, so use .get() to avoid a KeyError.
        LOG_WITHOUT_CONTEXT['jobID'] = body.get('jobId')
        log_info('Missing params in ParallelSentenceTaskCreateResource {}'.format(body),
                 LOG_WITHOUT_CONTEXT)
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        post_error_wf("TASK_CREATION_FAILED",
                      "Annotation task creation failed due to missing params",
                      LOG_WITHOUT_CONTEXT, None)
        return res.getresjson(), 400
    validity = Datautils.validate_annotation_input(
        body['sourceLanguage'], body['targetLanguage'], body['jobId'],
        body['annotationType'], body['users'], body['fileInfo'])
    if validity is not None:
        LOG_WITHOUT_CONTEXT['jobID'] = body['jobId']
        log_info('Missing params in ParallelSentenceTaskCreateResource {}'.format(body),
                 LOG_WITHOUT_CONTEXT)
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        post_error_wf("TASK_CREATION_FAILED",
                      "Annotation task creation failed due to missing params",
                      LOG_WITHOUT_CONTEXT, None)
        return res.getresjson(), 400
    log_info('Received annotation task creation request | '
             'ParallelSentenceTaskCreateResource: {}'.format(body), LOG_WITHOUT_CONTEXT)
    try:
        result = parallelSentenceAnnotationRepo.store(
            body['sourceLanguage'], body['targetLanguage'], body['jobId'],
            body['annotationType'], body['users'], body['fileInfo'], body['description'])
        if result is False:
            LOG_WITHOUT_CONTEXT['jobID'] = body['jobId']
            log_info('Annotation task creation failed due to file error {}'.format(body),
                     LOG_WITHOUT_CONTEXT)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
            post_error_wf("TASK_CREATION_FAILED",
                          "Annotation task creation failed due to file error",
                          LOG_WITHOUT_CONTEXT, None)
            return res.getresjson(), 400
        else:
            res = CustomResponse(Status.SUCCESS.value, None)
            return res.getres()
    except Exception as e:
        log_exception("Exception at ParallelSentenceTaskCreateResource ",
                      LOG_WITHOUT_CONTEXT, e)
        post_error_wf("TASK_CREATION_FAILED",
                      "Annotation task creation failed due to missing params",
                      LOG_WITHOUT_CONTEXT, None)
        res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
        return res.getresjson(), 400
def workflow_response(self, task_id, task_starttime, debug_flush=False):
    app_context.init()
    app_context.application_context = {}
    input_files, workflow_id, jobid, tool_name, step_order = \
        file_ops.json_input_format(self.json_data)
    log_info("workflow_response started the response generation",
             app_context.application_context)
    error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
    try:
        error_validator.wf_keyerror(jobid, workflow_id, tool_name, step_order)
        error_validator.inputfile_list_error(input_files)
        output_file_response = list()
        for i, item in enumerate(input_files):
            input_filename, in_file_type, in_locale = file_ops.accessing_files(item)
            self.json_data['taskID'] = task_id
            app_context.application_context = self.json_data
            if not debug_flush:
                bm_response = DocumentStructure(app_context=app_context,
                                                file_name=input_filename,
                                                lang=in_locale)
                if bm_response['code'] == 200:
                    output_filename_json = file_ops.writing_json_file(
                        i, bm_response['rsp'], self.DOWNLOAD_FOLDER)
                    file_res = file_ops.one_filename_response(
                        input_filename, output_filename_json, in_locale, in_file_type)
                    output_file_response.append(file_res)
                    task_endtime = str(time.time()).replace('.', '')
                    response_true = CustomResponse(Status.SUCCESS.value, jobid, task_id)
                    response_success = response_true.success_response(
                        workflow_id, task_starttime, task_endtime, tool_name,
                        step_order, output_file_response)
                    response = copy.deepcopy(response_success)
                    log_info("successfully generated response for workflow",
                             app_context.application_context)
                    return response
                else:
                    # bm_response is a dict; the original mixed in attribute access here.
                    post_error_wf(bm_response['code'], bm_response['message'],
                                  app_context.application_context, None)
                    return None
            else:
                log_info('flushing queue data, not handling file {}'.format(input_files),
                         app_context.application_context)
                post_error_wf(400,
                              'flushing queue data, not handling file {}'.format(input_files),
                              app_context.application_context, None)
                return None
    except WorkflowkeyError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        response_custom.status_code['message'] = str(e)
        response = file_ops.error_handler(response_custom.status_code,
                                          "WORKFLOWKEY-ERROR", True)
        log_exception("workflow_response workflow key error: key value missing",
                      app_context.application_context, e)
        return copy.deepcopy(response)
    except FileErrors as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        response_custom.status_code['message'] = e.message
        response = file_ops.error_handler(response_custom.status_code, e.code, True)
        log_exception("workflow_response some error occurred while validating file",
                      app_context.application_context, e)
        return copy.deepcopy(response)
    except ServiceError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, jobid, task_id)
        response_custom.status_code['message'] = str(e)
        response = file_ops.error_handler(response_custom.status_code,
                                          "SERVICE_ERROR", True)
        log_exception("workflow_response something went wrong during pdf to block conversion",
                      app_context.application_context, e)
        return copy.deepcopy(response)
def process_api_translations(self, api_response, tmx_phrase_dict, no_nmt, translate_wf_input):
    api_res_translations, record_id, batch_id, skip_count, trans_count = [], None, None, 0, 0
    try:
        for translation in api_response["data"]:
            # The original compared type(translation) to the string "str",
            # which is never true; isinstance is the intended check.
            if isinstance(translation, str):
                translation = json.loads(translation)
            if "s_id" not in translation.keys():
                log_error("S_ID missing for SRC: {}".format(translation["src"]),
                          translate_wf_input, None)
                continue
            translation_obj = {
                "src": translation["src"],
                "n_id": translation["s_id"].split("xxx")[0],
                "batch_id": translation["s_id"].split("xxx")[1],
                "s_id": translation["s_id"].split("xxx")[2]
            }
            if not no_nmt:
                if 'tgt' not in translation.keys():
                    log_error("TGT missing for SRC: {}".format(translation["src"]),
                              translate_wf_input, None)
                else:
                    translation_obj["tgt"] = translation["tgt"]
            if 'tmx_phrases' not in translation.keys():
                translation_obj["tmx_phrases"] = []
                if tmx_phrase_dict:
                    tmx = tmx_phrase_dict[translation["s_id"]]
                    translation_obj["tmx_phrases"] = tmx if tmx else []
            else:
                translation_obj["tmx_phrases"] = translation["tmx_phrases"]
            record_id = (translation_obj["n_id"].split("|")[0] + "|" +
                         translation_obj["n_id"].split("|")[1])
            batch_id = translation_obj["batch_id"]
            api_res_translations.append(translation_obj)
        file = self.get_content_from_db(record_id, None, None, translate_wf_input)
        if not file:
            log_error("There is no data for this recordID: " + str(record_id),
                      translate_wf_input, None)
            post_error_wf("TRANSLATION_FAILED",
                          "There is no data for this recordID: " + str(record_id),
                          translate_wf_input, None)
            return False
        try:
            if no_nmt:
                self.process_no_nmt_jobs({"recordID": record_id,
                                          "message": api_res_translations})
                trans_count += len(api_res_translations)
            else:
                self.update_sentences(record_id, api_res_translations, translate_wf_input)
                trans_count += len(api_res_translations)
        except Exception as e:
            log_exception("Exception while saving translations to DB: " + str(e),
                          translate_wf_input, e)
            skip_count += len(api_res_translations)
        self.update_translation_status(batch_id, trans_count, skip_count, translate_wf_input)
        return True
    except Exception as e:
        log_exception("Exception while processing the API output -- {}".format(e),
                      translate_wf_input, e)
        post_error_wf("TRANSLATION_ERROR",
                      "Exception while processing the API output -- {}".format(e),
                      translate_wf_input, e)
        return False
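# The "xxx"-delimited s_id unpacked in process_api_translations encodes three
# identifiers. A small illustration of the convention with made-up values,
# showing how the recordID is derived from the node id:
s_id_example = "JOB123|FILE1|25xxxBATCH-0xxxSENT-42"
n_id, batch_id, sentence_id = s_id_example.split("xxx")
record_id_example = n_id.split("|")[0] + "|" + n_id.split("|")[1]   # "JOB123|FILE1"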
def process_translation(self, nmt_output):
    try:
        record_id = nmt_output["record_id"]
        recordid_split = str(record_id).split("|")
        # The batch size is a plain integer; int() avoids eval() on queue data.
        job_id, file_id, batch_size = (recordid_split[0], recordid_split[1],
                                       int(recordid_split[2]))
        record_id = str(job_id) + "|" + str(file_id)
        translate_wf_input = {"jobID": job_id,
                              "metadata": {"module": tool_translator}}
        file = self.get_content_from_db(record_id, None, None, translate_wf_input)
        if not file:
            log_error("There is no data for this recordID: " + str(record_id),
                      translate_wf_input, nmt_output["status"])
            post_error_wf("TRANSLATION_FAILED",
                          "There is no data for this recordID: " + str(record_id),
                          translate_wf_input, None)
            return
        file, skip_count, trans_count, batch_id = file[0], 0, 0, None
        translate_wf_input = file["transInput"]
        translate_wf_input["recordID"] = record_id
        if 'status' in nmt_output.keys():
            if nmt_output["status"]["statusCode"] != 200:
                skip_count += batch_size
                log_error("Error from NMT: " + str(nmt_output["status"]["message"]),
                          translate_wf_input, nmt_output["status"])
        if 'data' in nmt_output.keys():
            if not nmt_output["data"]:
                log_error("NMT returned empty data[]!", translate_wf_input, None)
                skip_count += batch_size
            sentences_of_the_batch = []
            for response in nmt_output["data"]:
                if "n_id" not in response.keys():
                    log_error("Node ID missing! s_id: {}, b_id: {}".format(
                        response["s_id"], batch_id), translate_wf_input, None)
                    skip_count += 1
                    continue
                batch_id = response["batch_id"]
                if 'tgt' not in response.keys():
                    log_info("TGT missing! s_id: {}, b_id: {}".format(
                        response["s_id"], batch_id), translate_wf_input)
                sentences_of_the_batch.append(response)
            if len(sentences_of_the_batch) == 0:
                skip_count += batch_size
                log_error("NMT returned empty response_body!", translate_wf_input, None)
            else:
                try:
                    self.update_sentences(record_id, sentences_of_the_batch,
                                          translate_wf_input)
                    trans_count += len(sentences_of_the_batch)
                except Exception as e:
                    log_exception("Exception while saving translations to DB: " + str(e),
                                  translate_wf_input, e)
                    skip_count += len(sentences_of_the_batch)
        self.update_translation_status(batch_id, trans_count, skip_count,
                                       translate_wf_input)
        return
    except Exception as e:
        log_exception("Exception while processing NMT output: " + str(e), None, e)
        return
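# Shape of the NMT callback consumed by process_translation, reconstructed
# from the field accesses above; every value here is illustrative only.
sample_nmt_output = {
    "record_id": "JOB123|FILE1|25",        # jobID|fileID|batchSize
    "status": {"statusCode": 200, "message": "ok"},
    "data": [
        {"n_id": "JOB123|FILE1|25", "batch_id": "BATCH-0",
         "s_id": "SENT-42", "src": "source text", "tgt": "translated text"},
    ],
}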
def page_processor_via_api(self, page, record_id, file, tmx_present, nonmt_user,
                           tmx_file_cache, translate_wf_input):
    batches, pw_dict, bw_data = self.fetch_batches_of_sentences(
        file, record_id, page, tmx_present, tmx_file_cache, True, translate_wf_input)
    batches_count, sentences_count, tmx_count = 0, 0, 0
    if not batches:
        log_error("No batches obtained for page: " + str(page["page_no"]),
                  translate_wf_input, None)
        return batches_count, sentences_count, tmx_count
    batches_count, tmx_count = len(batches), pw_dict["tmx_count"]
    for batch_id in batches.keys():
        tmx_phrase_dict = {}
        batch = batches[batch_id]
        for sentence in batch:
            tmx_phrase_dict[sentence["s_id"]] = sentence["tmx_phrases"]
        try:
            nmt_in = {
                "src_list": batch,
                "source_language_code": file["model"]["source_language_code"],
                "target_language_code": file["model"]["target_language_code"]
            }
            if nonmt_user:
                nmt_in = {"data": batch}
                processed = self.process_api_translations(nmt_in, None, nonmt_user,
                                                          translate_wf_input)
                if not processed:
                    return None
            else:
                api_host = os.environ.get(
                    file["model"]["connection_details"]["translation"]["host"], "NA")
                api_ep = os.environ.get(
                    file["model"]["connection_details"]["translation"]["api_endpoint"], "NA")
                if api_host == "NA" or api_ep == "NA":
                    log_error("No API URL found!", translate_wf_input, None)
                    post_error_wf("API_ERROR", "No API URL found!", translate_wf_input, None)
                    break
                url = str(api_host) + str(api_ep)
                response = utils.call_api(url, "POST", nmt_in, None, "userID")
                if response["data"]:
                    log_info("B_ID: " + batch_id + " | SENTENCES: " + str(len(batch)) +
                             " | COMPUTED: " + str(bw_data[batch_id]["computed"]) +
                             " | TMX: " + str(bw_data[batch_id]["tmx_count"]),
                             translate_wf_input)
                    processed = self.process_api_translations(
                        response, tmx_phrase_dict, nonmt_user, translate_wf_input)
                    if not processed:
                        return None
                else:
                    log_error("Empty response from API -- {}".format(url),
                              translate_wf_input, None)
                    post_error_wf("API_ERROR",
                                  "Empty response from API -- {}".format(url),
                                  translate_wf_input, None)
                    break
        except Exception as e:
            log_exception("Exception while translating via API -- {}".format(e),
                          translate_wf_input, e)
            post_error_wf("TRANSLATION_ERROR",
                          "Exception while translating via API -- {}".format(e),
                          translate_wf_input, e)
            break
        sentences_count += len(batch)
    return batches_count, sentences_count, tmx_count
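# page_processor_via_api resolves the API URL indirectly: the model config
# stores environment-variable *names*, and os.environ supplies the values.
# A self-contained illustration of that lookup; the variable names and URL
# are made up, not the deployment's real configuration.
import os

os.environ["THIRD_PARTY_NMT_HOST"] = "https://nmt.example.org"  # placeholder
os.environ["THIRD_PARTY_NMT_EP"] = "/v1/translate"              # placeholder
model = {"connection_details": {"translation": {"host": "THIRD_PARTY_NMT_HOST",
                                                "api_endpoint": "THIRD_PARTY_NMT_EP"}}}
api_host = os.environ.get(model["connection_details"]["translation"]["host"], "NA")
api_ep = os.environ.get(model["connection_details"]["translation"]["api_endpoint"], "NA")
url = api_host + api_ep  # "NA" in either part is treated upstream as a missing URL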
def push_sentences_to_nmt(self, file, translate_wf_input):
    try:
        log_info("File translation started... " + str(translate_wf_input["jobID"]),
                 translate_wf_input)
        record_id = str(translate_wf_input["jobID"]) + "|" + str(file["path"])
        content_from_db = self.get_content_from_db(record_id, None, None, translate_wf_input)
        if not content_from_db:
            log_exception("File content from DB couldn't be fetched, jobID: " +
                          str(translate_wf_input["jobID"]), translate_wf_input, None)
            post_error_wf("TRANSLATION_FAILED",
                          "File content from DB couldn't be fetched, jobID: " +
                          str(translate_wf_input["jobID"]), translate_wf_input, None)
            return
        pages = repo.fetch_pages({"record_id": record_id})
        total_sentences, total_tmx, total_batches, tmx_file_cache = 0, 0, 0, {}
        tmx_present, nonmt_user = \
            self.get_rbac_tmx_utm(translate_wf_input["metadata"]["roles"],
                                  translate_wf_input, True)[0], False
        if tmx_present:
            tmx_present = self.is_tmx_present(file, translate_wf_input)
        if translate_wf_input["metadata"]["orgID"] in list(str(orgs_nmt_disable).split(",")):
            log_info("Job belongs to NONMT type!", translate_wf_input)
            tmx_present, nonmt_user = False, True
        pool = multiprocessing.Pool(no_of_process)
        connection_details = file["model"]["connection_details"]
        if connection_details["kafka"]:
            log_info("Translating via Kafka....", translate_wf_input)
            func = partial(self.page_processor, record_id=record_id, file=file,
                           tmx_present=tmx_present, nonmt_user=nonmt_user,
                           tmx_file_cache=tmx_file_cache,
                           translate_wf_input=translate_wf_input)
            page_processors = pool.map_async(func, pages).get()
            for page_result in page_processors:
                total_batches += page_result[0]
                total_sentences += page_result[1]
                total_tmx += page_result[2]
        else:
            log_info("Translating via third-party API....", translate_wf_input)
            for page in pages:
                page_result = self.page_processor_via_api(
                    page, record_id, file, tmx_present, nonmt_user,
                    tmx_file_cache, translate_wf_input)
                if not page_result:
                    break
                total_batches += page_result[0]
                total_sentences += page_result[1]
                total_tmx += page_result[2]
        if total_sentences > 0:
            repo.update({"totalSentences": total_sentences, "batches": total_batches},
                        {"recordID": record_id})
            log_info("recordID: " + record_id + " | PAGES: " + str(len(pages)) +
                     " | BATCHES: " + str(total_batches) +
                     " | SENTENCES: " + str(total_sentences) +
                     " | TMX: " + str(total_tmx), translate_wf_input)
        else:
            repo.update({"totalSentences": 0, "batches": 0}, {"recordID": record_id})
            log_exception("No sentences sent to NMT, recordID: " + record_id,
                          translate_wf_input, None)
    except Exception as e:
        log_exception("Exception while pushing sentences to NMT: " + str(e),
                      translate_wf_input, e)
        post_error_wf("TRANSLATION_FAILED",
                      "Exception while pushing sentences to NMT: " + str(e),
                      translate_wf_input, e)
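# The Kafka branch above fans pages out over a process pool, freezing every
# argument except the page with functools.partial so map_async only has to
# vary one parameter. A minimal, self-contained sketch of the same pattern
# with a stand-in worker; the page contents are made up.
import multiprocessing
from functools import partial

def process_page(page, record_id):
    # Stand-in for page_processor: report how many sentences a page holds.
    return len(page)

if __name__ == "__main__":
    pages = [["s1", "s2"], ["s3"], ["s4", "s5", "s6"]]
    func = partial(process_page, record_id="JOB123|FILE1")
    with multiprocessing.Pool(2) as pool:
        results = pool.map_async(func, pages).get()
    print(results)  # [2, 1, 3]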
def manage_wf(self, task_output):
    try:
        job_id = task_output["jobID"]
        job_details = wfmutils.get_job_details(job_id)
        if not job_details:
            log_error("This job is not found in the system, jobID: " + job_id,
                      task_output, None)
            return None
        log_info(task_output["tool"] + log_msg_end + " jobID: " + task_output["jobID"],
                 task_output)
        job_details = job_details[0]
        if job_details["status"] in ("FAILED", "COMPLETED", "INTERRUPTED"):
            log_error("The job is already completed/failed/interrupted, jobID: " + job_id,
                      task_output, None)
            return None
        if task_output["status"] != "FAILED":
            next_step_details = self.get_next_step_details(task_output)
            if next_step_details is not None:
                if next_step_details == "EXC":
                    log_error("Job FAILED: " + task_output["jobID"], task_output, None)
                    post_error_wf("NEXT_STEP_EXCEPTION",
                                  "There was an error while fetching the next step for this wf",
                                  task_output, None)
                    return None
                client_output = self.get_wf_details_async(None, task_output, False, None)
                self.update_job_details(client_output, False)
                next_step_input = next_step_details[0]
                if next_step_input is None:
                    log_error("The workflow contains incompatible steps in sequence. "
                              "Please check the wf config.", task_output, None)
                    post_error_wf("INCOMPATIBLE_TOOL_SEQUENCE",
                                  "The wf contains incompatible steps in sequence. "
                                  "Please check the wf config.", task_output, None)
                    return None
                next_tool = next_step_details[1]
                step_completed = task_output["stepOrder"]
                next_step_input["stepOrder"] = step_completed + 1
                producer.push_to_queue(next_step_input, next_tool["kafka-input"][0]["topic"])
                log_info(next_tool["name"] + log_msg_start + " jobID: " +
                         task_output["jobID"], task_output)
            else:
                client_output = self.get_wf_details_async(None, task_output, True, None)
                self.update_job_details(client_output, False)
                log_info("Job COMPLETED: " + task_output["jobID"], task_output)
        else:
            # Safety net, in case a module fails to push data to the error topic.
            log_error("Job FAILED: " + task_output["jobID"], task_output, None)
            client_output = self.get_wf_details_async(None, task_output, False,
                                                      task_output["error"])
            self.update_job_details(client_output, False)
            # self.push_to_notifier(task_output)
    except Exception as e:
        log_exception("Exception while managing the ASYNC workflow: " + str(e),
                      task_output, e)
        post_error_wf("WFLOW_MANAGE_ERROR",
                      "Exception while managing workflow: " + str(e), task_output, e)
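# Envelope of the per-step output consumed by manage_wf, reconstructed from
# the key accesses above; values are illustrative. A non-"FAILED" status
# advances the workflow, and stepOrder is incremented before the next push.
sample_task_output = {
    "jobID": "JOB123",
    "tool": "TOKENISER",        # hypothetical tool name
    "status": "SUCCESS",
    "stepOrder": 0,
    "error": None,              # read only on the FAILED path
}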