Example #1
0
def get_pdfs(page_dfs, lang):
    start_time          = time.time()
    try:
        p_dfs    = []
        pages    = len(page_dfs)
        block_configs = config.BLOCK_CONFIGS
        for page_index in range(pages):
            page_df     = page_dfs[page_index]
            cols        = page_df.columns.values.tolist()
            df          = pd.DataFrame(columns=cols)
            for index, row in page_df.iterrows():
                if row['children'] is None:
                    # leaf block: carry the row over with its own height as the average
                    d_tmp = page_df.iloc[index].copy()
                    d_tmp['avg_line_height'] = int(d_tmp['text_height'])
                    df = pd.concat([df, d_tmp.to_frame().T])
                else:
                    # composite block: expand children into rows via the block configs
                    dfs = process_block(page_df.iloc[index], block_configs, lang)
                    df = pd.concat([df, dfs])
            p_dfs.append(df)

    except Exception as e :
        log_error('Error in creating p_dfs', app_context.application_context, e)
        return None

    end_time         = time.time()
    elapsed_time     = end_time - start_time
    log_info('Processing of get_pdfs completed in {}s for {} pages, average {}s per page'.format(elapsed_time, len(p_dfs), (elapsed_time/len(p_dfs))), app_context.application_context)
    return p_dfs
Example #2
0
    def post(self):
        body = request.get_json()

        if 'annotationId' not in body or 'score' not in body or \
            'saved' not in body:
            log_info(
                'Missing params in AnnotationTaskSaveAnnotationResource {}'.
                format(body), LOG_WITHOUT_CONTEXT)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        try:
            result = parallelSentenceAnnotationRepo.save_annotation(body)
            if result is None:
                res = CustomResponse(
                    Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
                return res.getresjson(), 400
            else:
                res = CustomResponse(Status.SUCCESS.value, result)
                return res.getres()
        except Exception as e:
            log_exception("Exception at AnnotationTaskSaveAnnotationResource ",
                          LOG_WITHOUT_CONTEXT, e)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400
Example #3
0
def extract_pdf_metadata(filename, working_dir, base_dir, jobid):
    start_time = time.time()
    pdf_filepath = Path(os.path.join(base_dir, filename))
    try:
        pdf_image_paths = extract_image_paths_from_pdf(pdf_filepath,
                                                       working_dir)
        pdf_xml_dir = extract_xml_from_digital_pdf(pdf_filepath, working_dir)
    except Exception as e:
        log_error("Service xml_utils", "Error in extracting xml", jobid, e)
        raise
    try:
        # pdftohtml (poppler-utils) renders per-page HTML next to the XML;
        # subprocess.run with check=True actually raises on failure, unlike os.system
        subprocess.run(['pdftohtml', '-c', str(pdf_filepath),
                        str(working_dir) + '/'], check=True)
    except Exception as e:
        log_error("Service get_xml", "Error in extracting html", jobid, e)

    # try:
    #     pdf_bg_image_dir    = extract_html_bg_images_from_digital_pdf(pdf_filepath, working_dir)
    # except Exception as e :
    #     log_error("Service xml_utils", "Error in extracting html of bg images", jobid, e)
    #
    end_time = time.time()
    extraction_time = end_time - start_time

    xml_files = read_directory_files(pdf_xml_dir, pattern='*.xml')
    bg_files = None  #read_directory_files(pdf_bg_image_dir, pattern='*.png')

    log_info('Service get_xml',
             'Successfully extracted xml and background images of file: ' + str(filename), jobid)

    return xml_files, bg_files, pdf_image_paths
Example #4
0
def replace_tags_with_original(text, date_original, url_dict, num_array, num_map):
  '''
  Replaces dates, URLs and numbers in the text with the original values
  in place of the tags.
  '''
  try:
    resultant_str = list()

    if len(text) == 0:
      return ""
    for word in text.split():
      if 'UuRrLl' in word:
        word = url_dict[word]
      resultant_str.append(word)
    # join the collected words once after the loop instead of rebuilding per word
    res = " ".join(resultant_str)

    log_info("response after url and date replacement:{}".format(res), MODULE_CONTEXT)
    
    if len(num_map) == 0:
      ''' handling the case when the model outputs a tag which is not in tagged_src (src is without any number) '''
      for char in reversed(hindi_numbers):  
        res = re.sub(r'NnUuMm'+char,"",res)
    num_map.reverse()
    for item in num_map:
      res = res.replace(item['tag'],str(item['no.']),1)
   
    res = remove_extra_tags(res)     
    log_info("response after tags replacement:{}".format(res),MODULE_CONTEXT)
    return res    
  except Exception as e:
    log_exception("Error in parent except block of replace_tags_with_original function, returning tagged output: {}".format(e), MODULE_CONTEXT, e)
    return text
Example #5
0
def start_kafka():
    try:
        t1 = threading.Thread(target=process_fc_kf, name='keep_on_running')
        t1.start()
        log_info("multithread : Kafka running on multithread", None)
    except Exception as e:
        log_error("multithread : Error while running custom threads", None, e)
Example #6
0
 def consume(self):
     topics = [anu_dp_wf_aligner_in_topic]
     consumer = self.instantiate(topics)
     service = AlignmentService()
     util = AlignmentUtils()
     rand_str = ''.join(
         random.choice(string.ascii_letters) for i in range(4))
     prefix = "Align-WFM-Consumer(" + rand_str + ")"
     log_info(prefix + " running.......", None)
     while True:
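         # pull messages one at a time; the break hands control back to the outer loop after each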
         for msg in consumer:
             data = {}
             try:
                 data = msg.value
                 if data:
                     log_info(
                         prefix + " | Received on Topic: " + msg.topic +
                         " | Partition: " + str(msg.partition), data)
                     service.wf_process(data)
                 break
             except Exception as e:
                 log_exception("Exception while consuming: " + str(e), data,
                               e)
                 util.error_handler("ALIGNER_CONSUMER_ERROR",
                                    "Exception while consuming", data, True)
                 break
Example #7
0
 def push_data_to_queue(self, topic_name, push_data):
     producer = self.producer_fn()
     producer.send(topic_name, value=push_data)
     producer.flush()
     log_info(
         "push_data_to_queue : successfully pushed data to output queue",
         None)
Example #8
0
 def get_nmt_url_body(self, block_translate_input, nmt_txt):
     model = block_translate_input["input"]["model"]
     nmt_in = {
         "src_list": nmt_txt,
         "source_language_code": model["source_language_code"],
         "target_language_code": model["target_language_code"],
         "model_id": model["model_id"]
     }
     try:
         host = model["connection_details"]["translation"]["host"]
         api_host = os.environ.get(host, 'NA')
         endpoint = model["connection_details"]["translation"][
             "api_endpoint"]
         api_endpoint = os.environ.get(endpoint, 'NA')
         if api_host == "NA" or api_endpoint == "NA":
             log_info("Falling back to Anuvaad NMT translate URL....",
                      block_translate_input)
             return nmt_translate_url, nmt_in
         url = api_host + api_endpoint
         return url, nmt_in
     except Exception as e:
         log_exception(
             "Exception while fetching API conn details: {}".format(str(e)),
             block_translate_input, e)
     log_info("Falling back to Anuvaad NMT translate URL....",
              block_translate_input)
     return nmt_translate_url, nmt_in
Example #9
0
def process_tokenization_kf():
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.file_download(config.download_folder)
    # instantiation of consumer for the respective topic
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_tokenization_kf : trying to receive value from consumer ", None)
        for msg in consumer:
            data = msg.value
            log_info("process_tokenization_kf : received input json from input topic consumer ", data)
            task_id = str("TOK-" + str(time.time()).replace('.', '')[0:13])
            task_starttime = int(str(time.time()).replace('.', '')[0:13])
            input_files, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            if "errorID" not in file_value_response.keys():
                producer = Producer()
                producer.push_data_to_queue(config.output_topic, file_value_response, data, task_id)
            else:
                log_error("process_tokenization_kf : error send to error handler", data, None)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_tokenization_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message      
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_tokenization_kf : response send to topic %s"%(config.output_topic), data, e)
Example #10
0
    def post(self):
        inputs = request.get_json(force=True)
        response_list = list()
        if len(inputs) > 0:
            log_info("Making labse-aligner(Resource) API call", MODULE_CONTEXT)
            log_info("Complete request input: {}".format(inputs),
                     MODULE_CONTEXT)
            try:
                for i in inputs:
                    if all(v in i for v in ["src_phrases", "tgt"]):
                        log_info("Making labse-aligner service call",
                                 MODULE_CONTEXT)
                        res = LabseAlignerService.phrase_aligner(i)
                        response_list.append(res)
                        out = CustomResponse(Status.SUCCESS.value,
                                             response_list)
                    else:
                        log_info(
                            "Missing mandatory Parameters for labse-aligner:src_phrases or tgt",
                            MODULE_CONTEXT)
                        out = CustomResponse(
                            Status.MANDATORY_PARAM_MISSING.value, [])
                        return out.getres()
            except Exception as e:
                status = Status.SYSTEM_ERR.value
                status['why'] = str(e)
                out = CustomResponse(status, [])

            return out.getres()
        else:
            log_info("null inputs in request in labse-aligner API",
                     MODULE_CONTEXT)
            out = CustomResponse(Status.INVALID_API_REQUEST.value, None)
            return out.getres()
Example #11
0
def vision_ocr_request_worker():
    file_ops            = FileOperation()
    DOWNLOAD_FOLDER     = file_ops.create_file_download_dir(config.download_folder)
    producer_tok        = Producer(config.bootstrap_server)
    log_info("vision_ocr_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)

    while True:
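        # block until a new request arrives on the internal process queue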
        data            = processQueue.get(block=True)
        task_id         = str("vision_ocr" + str(time.time()).replace('.', ''))
        task_starttime  = str(time.time()).replace('.', '')
        input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(data)
        
        log_info("vision_ocr_request_worker processing -- received message "+str(jobid), data)

        try:
            response_gen    = Response(data, DOWNLOAD_FOLDER)

            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)
            if file_value_response is not None:
                if "errorID" not in file_value_response:
                    push_output(producer_tok, config.output_topic, file_value_response, jobid, task_id,data)
                    log_info("vision_ocr_request_worker : response send to topic %s"%(config.output_topic), LOG_WITHOUT_CONTEXT)
                else:
                    log_info("vision_ocr_request_worker : error send to error handler", data)

            log_info('vision_ocr_request_worker - requests in internal queue {}'.format(processQueue.qsize()), data)

            processQueue.task_done()
        except Exception as e:
            log_exception("vision_ocr_request_worker ",  LOG_WITHOUT_CONTEXT, e)

        controlQueue.put(1)
Example #12
0
 def consumer_instantiate(self):
     try:
         consumer = KafkaConsumer(self.topic_name,
                                  bootstrap_servers=list(self.server_address.split(",")),
                                  auto_offset_reset='latest',
                                  group_id=config.CONSUMER_GROUP,
                                  enable_auto_commit=True)
         log_info("consumer_instantiate : Consumer returned for topic: %s"%(self.topic_name), None)
         return consumer
     except Exception as e:
         log_exception("consumer_instantiate : error occured for consumer topic: %s"%(self.topic_name), None, e)
Example #13
0
def handle_sentences_wo_stop(language, sentence_array):
    '''
    Handles sentences in the array which do not have a sentence-
    ending punctuation mark by adding one. Used in batch translation.
    '''
    try:
        if language is None:
            return sentence_array, []
        else:
            log_info("Inside handle_sentences_wo_stop", MODULE_CONTEXT)
            stop_puncs = misc.get_language_stop_puncs(language)
            full_stop_or_purnviram = stop_puncs[0]
            sent_indices_wo_stop = []
            for i, sentence in enumerate(sentence_array):
                if misc.is_sentence_wo_stop(sentence, stop_puncs):
                    sent_indices_wo_stop.append(i)
                    sentence_array[i] = misc.add_stop_punc(
                        sentence_array[i], full_stop_or_purnviram)

            return sentence_array, sent_indices_wo_stop

    except Exception as e:
        log_exception("Error in handle_sentences_wo_stop: {}".format(e),
                      MODULE_CONTEXT, e)
        return sentence_array, []
Example #14
0
def get_segmented_regions(app_context, base_dir):
    try:
        files       = get_files(app_context.application_context)
        output      = []
        for index,file in enumerate(files):
            file   = get_json(base_dir, file['file']['name'])
            file_properties = File(file)
            pages = file_properties.get_pages()
            page_counts = len(pages)
            start_time = time.time()
            for page_index in range(page_counts):
                log_info('processing page: {}'.format(page_index), app_context.application_context)
                # page_lines   =  file_properties.get_lines(page_index)
                # page_regions =  file_properties.get_regions(page_index)
                # page_words   =  file_properties.get_words(page_index)
                #font_meta    = font_properties(file_properties.get_page(page_index))
                font_meta  = []
                #page_regions =  region_unifier.region_unifier(page_lines,page_regions)
                #file_properties.set_regions(page_index, segment_regions(page_words,page_lines,page_regions))
                file_properties.set_font_properties(page_index,font_meta)

            output.append(file_properties.get_file())
            output[index]['status']= {'message':"block-segmenter successful"}
            end_time            = time.time()
            extraction_time     = (end_time - start_time)/page_counts
            log_info('block segmentation per page completed in {}'.format(extraction_time), app_context.application_context)
        app_context.application_context["outputs"] =output
        log_info("successfully completed block segmentation", None)
    except Exception as e:
        log_exception("Error occured during block segmentation ",  app_context.application_context, e)
        return None

    return app_context.application_context
Example #15
0
 def get_stored_hypothesis_ch(self, text_list, text_translate_input):
     sent_map, ch_res, text_for_nmt, ch_response = {}, {}, [], []
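     # map s_id -> sentence payload so CH results can be matched back to the inputs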
     for text in text_list:
         sent_map[text["s_id"]] = text
     api_input = {"sentences": list(sent_map.keys())}
     api_res = utils.call_api(sentence_fetch_url, "POST", api_input, None,
                              text_translate_input["metadata"]["userID"])
     if api_res:
         if api_res["data"]:
             ch_response = api_res["data"]
     if ch_response:
         for translation in ch_response:
             if translation["s_id"] in sent_map.keys():
                 tgt_list = []
                 if str(translation["tgt"]).startswith(
                         str(sent_map[translation["s_id"]]
                             ["taggedPrefix"])):
                     tgt_list.append(translation["tgt"])
                 if str(translation["s0_tgt"]).startswith(
                         str(sent_map[translation["s_id"]]
                             ["taggedPrefix"])):
                     tgt_list.append(translation["s0_tgt"])
                 if tgt_list:
                     translation["tgt"] = tgt_list
                     ch_res[translation["s_id"]] = translation
     for s_id in sent_map:
         if s_id not in ch_res:
             text_for_nmt.append(sent_map[s_id])
     log_info(
         "Translation fetched from CH! Count: " + str(len(ch_res.keys())),
         text_translate_input)
     return text_for_nmt, list(ch_res.values())
Example #16
0
def save_page_res(res, file_name):
    try:
        tmp_file = copy.deepcopy(res['rsp'])
        del tmp_file['input']
        tmp_file['files'] = res['rsp']['outputs']
        del tmp_file['outputs']
        json_file_name = file_name['output'][0]['outputFile']
        for file in [tmp_file]:
            recordID = file['jobID'] + '|' + json_file_name
            page_idx = 0
            total_pages = len(file['files'][0]['pages'])
            file['files'][0]['config'] = copy.deepcopy(
                file['files'][0]['config']['OCR'])
            save_file = copy.deepcopy(file)
            save_file['recordID'] = recordID
            while page_idx < total_pages:
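                # save in chunks of SAVE_NO_PAGE pages so each request payload stays small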
                pages = file['files'][0]['pages'][page_idx:page_idx +
                                                  SAVE_NO_PAGE]
                save_file['files'][0]['pages'] = pages
                page_idx = page_idx + SAVE_NO_PAGE
                log_info(
                    "started saving data to database with record id: " +
                    str(recordID), app_context.application_context)
                rsp = requests.post(SAVE_URL, json=save_file)
                # surface HTTP errors instead of unconditionally logging success
                rsp.raise_for_status()
                log_info(
                    "successfully saved data to database with record id: " +
                    str(recordID), app_context.application_context)
    except Exception as e:
        log_exception("Error occured during saving page response",
                      app_context.application_context, e)
Example #17
0
 def return_loaded_models(self, model_paths, ids):
     loaded_models = {}
     for i, path in enumerate(model_paths):
         translator = ctranslate2.Translator(path, device="auto")
         loaded_models[ids[i]] = translator
         log_info("Model Loaded: {}".format(ids[i]), MODULE_CONTEXT)
     return loaded_models
Example #18
0
def process_input(app_context, base_dir):
    try:
        files = get_files(app_context.application_context)
        output_files = []
        langs = []
        for index, file in enumerate(files):
            file_output = {"status": {}}
            file = get_json(base_dir, file['file']['name'])[0]

            file_properties = File(file)
            if "page_info" in file.keys():
                page_paths = file_properties.get_pages()
            else:
                page_paths = doc_pre_processing(file['file']['name'],
                                                config.BASE_DIR)

            page_res = text_extraction(file_properties, page_paths, file)
            output_files.append(page_res)
            langs.append(file_properties.get_language())

        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)

    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None

    return app_context.application_context, langs
Example #19
0
 def call_api(self, uri, method, api_input, params, user_id):
     try:
         log_info("URI: " + uri, None)
         response = None
         if method == "POST":
             api_headers = {
                 'userid': user_id,
                 'x-user-id': user_id,
                 'Content-Type': 'application/json'
             }
             response = requests.post(url=uri,
                                      json=api_input,
                                      headers=api_headers)
         elif method == "GET":
             api_headers = {'userid': user_id}
             response = requests.get(url=uri,
                                     params=params,
                                     headers=api_headers)
         if response is not None:
             if response.text is not None:
                 log_info(response.text, None)
                 return json.loads(response.text)
             else:
                 log_error("API response was None, URI: " + str(uri),
                           api_input, None)
                 return None
         else:
             log_error("API call failed! URI: " + str(uri), api_input, None)
             return None
     except Exception as e:
         log_exception("Exception while making the api call: " + str(e),
                       api_input, e)
         return None
Example #20
0
def extract_images(app_context, base_dir):

    files = get_files(app_context.application_context)
    file_images = []
    try:
        for file in files:
            file_properties = File(file)
            file_format = file_properties.get_format()

            if file_format in ['PDF', 'pdf']:
                filename = file_properties.get_name()
                image_paths = extract_pdf_images(filename, base_dir)
                file_images.append(image_paths)
            elif file_format in [
                    'PNG', 'JPEG', 'BMP', 'jpg', 'png', 'bmp', 'jpeg'
            ]:
                filename = file_properties.get_name()
                image_paths = [os.path.join(base_dir, filename)]
                file_images.append(image_paths)
            else:
                log_info(
                    "currently we do not support {} files.".format(
                        file_format), app_context.application_context)
                return None
    except Exception as e:
        log_error('error extracting images: ' + str(e),
                  app_context.application_context, e)
        return None

    return file_images
Example #21
0
    def post(self):
        body = request.get_json()

        if "keys" not in body or not body["keys"]:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        keys = body["keys"]

        log_info("Fetching sentences from redis store",
                 AppContext.getContext())

        try:
            result = sentenceRepo.get_sentences_from_store(keys)
            if result is None:
                res = CustomResponse(
                    Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
                return res.getresjson(), 400

            res = CustomResponse(Status.SUCCESS.value, result)
            return res.getres()
        except Exception as e:
            log_exception(
                "Exception while fetching sentences from redis store ",
                AppContext.getContext(), e)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400
Example #22
0
 def create_translated_txt_file(self, record_id, dataframes, page_layout):
     try:
         out_translated_txt_filename = os.path.splitext(os.path.basename(record_id.split('|')[0]))[0] + str(uuid.uuid4()) + '_translated.txt'
         output_filepath_txt = os.path.join(self.DOWNLOAD_FOLDER , out_translated_txt_filename)
         out_txt_file_write = open(output_filepath_txt, 'w')
         page_width = page_layout['page_width']
         max_chars_in_line = int(page_width/13)
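         # heuristic: page_width/13 approximates characters per line, presumably assuming ~13px per glyph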
         for df in dataframes:
             for index, row in df.iterrows():
                 if row['text'] is not None and index+1 < df.shape[0]:
                     extra_spaces = int(row['text_left']/13.5)
                     write_str = ' '*extra_spaces + row['text']
                     if row['text_top'] != df.iloc[index+1]['text_top']:
                         if len(write_str) < max_chars_in_line:
                             out_txt_file_write.write("%s\n"%write_str)
                         else:
                             sub_string_list = self.break_large_sentence(write_str, max_chars_in_line)
                             for item in sub_string_list:
                                 out_txt_file_write.write("%s\n"%item)
                     else:
                         # stitch fragments that share the same text_top (same visual line),
                         # guarding against indexing past the end of the dataframe
                         same_line_index = 0
                         same_line_status = index+same_line_index+1 < df.shape[0] and \
                             row['text_top'] == df.iloc[index+same_line_index+1]['text_top']
                         while same_line_status:
                             onwards_line_space = int((df.iloc[index+same_line_index+1]['text_width'] - df.iloc[index]['text_left'] - df.iloc[index]['text_width'])/13.5)
                             write_str += ' '*onwards_line_space + df.iloc[index+same_line_index+1]['text']
                             same_line_index += 1
                             same_line_status = index+same_line_index+1 < df.shape[0] and \
                                 row['text_top'] == df.iloc[index+same_line_index+1]['text_top']
                         out_txt_file_write.write("%s\n"%write_str)
         out_txt_file_write.close()
         log_info("txt file write completed!! filename: %s"%out_translated_txt_filename, MODULE_CONTEXT)
         return out_translated_txt_filename
     except Exception as e:
         log_exception("txt file formation failed", MODULE_CONTEXT, e)
Example #23
0
    def post(self):
        body = request.json

        log_info('received request for WordSaveResource',
                 AppContext.getContext())
        if body is None:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        if 'words' not in body:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        for word in body['words']:
            if word['locale'] != 'en':
                res = CustomResponse(
                    Status.ERR_ENGLISH_MANDATORY_WHILE_SAVING.value, None)
                return res.getresjson(), 400

        result = wordRepo.store(body['words'])

        if result is False:
            res = CustomResponse(Status.ERR_SCHEMA_VALIDATION.value, None)
            return res.getresjson(), 400

        res = CustomResponse(Status.SUCCESS.value, None)
        return res.getres()
Example #24
0
 def run(self):
     obj = {"metadata": {"module": module_name}}
     rand_str = ''.join(random.choice(string.ascii_letters) for i in range(4))
     prefix = "TranslatorJobsCleaner(" + rand_str + ")"
     log_info(prefix + " -- AJM Deployed, TranslatorJobsCleaner running......", obj)
     translator_utils = TranslatorCronUtils()
     run = 0
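     # wake every jc_cron_interval_sec seconds until the thread's stop event is set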
     while not self.stopped.wait(float(jc_cron_interval_sec)):
         try:
             records = translator_utils.find_all(False)
             deleted = 0
             for record in records:
                 try:
                     job_start_time = record["transInput"]["taskStartTime"]
                     diff = int(str(time.time()).replace('.', '')[0:13]) - job_start_time
                     if (diff / 1000) > float(jc_job_delete_interval_sec):
                         translator_utils.delete(record["jobID"])
                         translator_utils.delete_batches(record["jobID"])
                         translator_utils.delete_pages(record["recordID"])
                         deleted += 1
                 except Exception as e:
                     log_exception(prefix + " -- Exception in JobsCleaner for record: " + record["recordID"], record["transInput"], e)
                     log_exception(prefix + " -- Exception - " + str(e), record["transInput"], e)
                     continue
             log_info(prefix + " -- Run: " + str(run) + " | Deleted: " + str(deleted), obj)
             run += 1
         except Exception as e:
             log_exception(prefix + " -- Run: " + str(run) + " | Exception: " + str(e), obj, e)
             run += 1
Example #25
0
 def nonwf_response(self):
     log_info("non workflow response started the response generation", app_context.application_context)
     input_files = self.json_data['files']
     error_validator = ValidationResponse(self.DOWNLOAD_FOLDER)
     try:
         error_validator.inputfile_list_empty(input_files)
         output_file_response = list()
         for item in input_files:
             input_filename, in_file_type, in_locale = file_ops.accessing_files(item)
             output_json_data = DocumentStructure(None, input_filename)
             output_filename_json = file_ops.writing_json_file(input_filename, output_json_data, self.DOWNLOAD_FOLDER)
             file_res = file_ops.one_filename_response(input_filename, output_filename_json, in_locale, in_file_type)
             output_file_response.append(file_res)
         response_true = Status.SUCCESS.value
         response_true['output'] = output_file_response
         log_info("non workflow_response successfully generated response for rest server", app_context.application_context)
         response_true = copy.deepcopy(response_true)
         return response_true
     except FileErrors as e:
         response_custom = Status.ERR_STATUS.value
         response_custom['message'] = e.message
         response = file_ops.error_handler(response_custom, e.code, False)
         log_exception("non workflow_response some error occured while validating file", app_context.application_context, e)
         response = copy.deepcopy(response)
         return response
     except ServiceError as e:
         response_custom = Status.ERR_STATUS.value
         response_custom['message'] = str(e)
         response = file_ops.error_handler(response_custom, "SERVICE_ERROR", False)
         log_exception("non workflow_response Something went wrong during pdf to block conversion.", app_context.application_context, e)
         response = copy.deepcopy(response)
         return response
Example #26
0
 def post(self):
     json_data = request.get_json(force=True)
     app_context.init()
     app_context.application_context = json_data
     log_info(
         "Resource Layout_Detector_WF  Layout_Detector service started",
         app_context.application_context)
     task_id = str("LD-" + str(time.time()).replace('.', '')[0:13])
     task_starttime = eval(str(time.time()).replace('.', '')[0:13])
     #json_data = request.get_json(force = True)
     try:
         error_validator = ValidationResponse(DOWNLOAD_FOLDER)
         if error_validator.format_error(json_data) is True:
             response_gen = Response(json_data, DOWNLOAD_FOLDER)
             response = response_gen.workflow_response(
                 task_id, task_starttime)
             log_info(
                 "Resource Layout_Detector_WF Layout_Detector api response completed",
                 app_context.application_context)
             return jsonify(response)
     except FormatError as e:
         log_error(
             "Resource Layout_Detector_WF Input json format is not correct or dict_key is missing",
             app_context.application_context, e)
         return Status.ERR_request_input_format.value
Example #27
0
def process_info(app_context, base_dir):
    try:
        files = get_files(app_context.application_context)
        file_images = []
        output = []
        for index, file_new in enumerate(files):
            start_time = time.time()
            file = get_json(file_new['file']['name'], base_dir)[0]
            file_properties = File(file)
            ocr_level, lang = get_ocr_config(file_new,
                                             file_properties.get_pages())
            file = preprocess_file(file_properties, lang, ocr_level)
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            output.append(file)
            output[index]['status'] = {
                'code': 200,
                'message': "tesseract ocr successful"
            }
            end_time = time.time()
            extraction_time = (end_time - start_time) / len(
                file_properties.get_pages())
            log_info(
                'tesseract ocr per page completed in {}'.format(
                    extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed tesseract ocr", None)
    except Exception as e:
        log_exception("Error occured during tesseract ocr ",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Example #28
0
 def get_nmt_url_body(self, text_translate_input, text_for_nmt):
     model = text_translate_input["input"]["model"]
     text_nmt = []
     for text in text_for_nmt:
         text_nmt.append({
             "s_id": text["s_id"],
             "id": model["model_id"],
             "src": text["src"],
             "target_prefix": text["taggedPrefix"]
         })
     try:
         host = model["connection_details"]["interactive"]["host"]
         api_host = os.environ.get(host, 'NA')
         endpoint = model["connection_details"]["interactive"][
             "api_endpoint"]
         api_endpoint = os.environ.get(endpoint, 'NA')
         if api_host == "NA" or api_endpoint == "NA":
             log_info("Falling back to Anuvaad NMT translate URL....",
                      text_translate_input)
             return nmt_it_url, text_nmt
         url = api_host + api_endpoint
         return url, text_nmt
     except Exception as e:
         log_exception(
             "Exception while fetching API conn details: {}".format(str(e)),
             text_translate_input, e)
     log_info("Falling back to Anuvaad NMT translate URL....",
              text_translate_input)
     return nmt_it_url, text_nmt
Example #29
0
def encode_translate_decode(input_sentence_array_prepd, sp_encoder, translator, sp_decoder, max_batch_size, batch_type, input_subwords_list, output_subwords_list, score_list):
    try:
        log_info("Inside encode_translate_decode function",MODULE_CONTEXT)
        start_encoding = time.time()
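        # encode each input sentence into subword tokens with the sentencepiece encoder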
        input_subwords_list = [str(sp.encode_line(sp_encoder,sent)) for sent in input_sentence_array_prepd]
        input_final_array = [format_converter(input_subwords) for input_subwords in input_subwords_list]
        time_encoding = time.time() - start_encoding
        start_translating = time.time()
        m_out = translator.translate_batch(input_final_array,beam_size = 5,num_hypotheses=1,max_batch_size=max_batch_size,batch_type=batch_type)
        time_translating = time.time() - start_translating
        translation_array = [None] * len(output_subwords_list)
        start_decoding = time.time()
        for i, _ in enumerate(output_subwords_list):
            output_subwords_list[i] = " ".join(m_out[i][0]['tokens'])
            score_list[i] = m_out[i][0]['score']
            translation_array[i] = multiple_hypothesis_decoding(m_out[i], sp_decoder)[0]
        time_decoding = time.time() - start_decoding

        return translation_array, input_subwords_list, output_subwords_list, score_list, time_encoding, time_translating, \
            time_decoding

    except ServerModelError as e:
        log_exception("ServerModelError error in encode_translate_decode: {} and {}".format(e,sys.exc_info()[0]),MODULE_CONTEXT,e)
        raise
        
    except Exception as e:
        log_exception("Unexpexcted error in encode_translate_decode: {} and {}".format(e,sys.exc_info()[0]),MODULE_CONTEXT,e)
        raise
Example #30
0
def get_hdfs(in_dfs, header_region, footer_region):

    start_time          = time.time()
    try:
        pages = len(in_dfs)
        multiple_pages = pages > 1
        h_dfs = []
        document_configs = config.DOCUMENT_CONFIGS
        for page_index in range(pages):
            page_df   = in_dfs[page_index]
            if multiple_pages:
                page_df = tag_heaader_footer_attrib(header_region, footer_region, page_df)

            h_df    = merge_horizontal_blocks(page_df, document_configs, debug=False)
            h_dfs.append(h_df)
    except Exception as e :
        log_error('Error in creating h_dfs: ' + str(e), app_context.application_context, e)
        return None

    end_time         = time.time()
    elapsed_time     = end_time - start_time
    log_info('Processing of get_hdfs completed in {}s for {} pages, average {}s per page'.format(elapsed_time, len(in_dfs), (elapsed_time/len(in_dfs))), app_context.application_context)

    return h_dfs