Example #1
0
    def post(self):
        """Fetch sentences from the redis store for the given keys.

        Expects a JSON body with a non-empty "keys" list. Returns the
        fetched sentences on success, or a 400 error response when the
        parameters are missing or the lookup fails.
        """
        body = request.get_json()

        # Validate that a non-empty "keys" list was supplied.
        if "keys" not in body or not body["keys"]:
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        keys = body["keys"]

        log_info("Fetching sentences from redis store",
                 AppContext.getContext())

        try:
            result = sentenceRepo.get_sentences_from_store(keys)
            # Identity comparison with None per PEP 8 (was `== None`).
            if result is None:
                res = CustomResponse(
                    Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
                return res.getresjson(), 400

            res = CustomResponse(Status.SUCCESS.value, result)
            return res.getres()
        except Exception as e:
            log_exception(
                "Exception while fetching sentences from redis store ",
                AppContext.getContext(), e)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400
Example #2
0
    def get_sentence_by_s_id(self, user_id, s_id):
        """Return the tokenized sentence matching `s_id`, or None.

        Ensures 's0_tgt'/'s0_src' keys exist on the returned sentence,
        defaulting them to the current 'tgt'/'src' values.
        NOTE(review): `user_id` is accepted but not used in the query —
        confirm whether ownership filtering was intended.
        """
        try:
            collections = get_db()[DB_SCHEMA_NAME]
            # Match documents containing the sentence, then project only the
            # array elements whose s_id matches.
            docs = collections.aggregate([{
                '$match': {
                    'data.tokenized_sentences.s_id': s_id
                }
            }, {
                '$project': {
                    'tokenized_sentences': {
                        '$filter': {
                            'input': '$data.tokenized_sentences',
                            'as': 'ts',
                            'cond': {
                                '$eq': ['$$ts.s_id', s_id]
                            }
                        }
                    }
                }
            }])

            # Return the first (and only expected) match.
            for doc in docs:
                sentence = doc['tokenized_sentences'][0]
                # Membership test directly on the dict instead of
                # materialising the key list (was `in list(d.keys())`).
                if 's0_tgt' not in sentence:
                    sentence['s0_tgt'] = sentence['tgt']
                if 's0_src' not in sentence:
                    sentence['s0_src'] = sentence['src']
                return sentence

            return None
        except Exception as e:
            log_exception("db connection exception ", AppContext.getContext(),
                          e)
            return None
Example #3
0
def GoogleVisionOCR(app_context, base_dir=config.BASE_DIR):
    """Run the google vision OCR pipeline over the application context.

    Returns a dict with 'code', 'message', 'rsp' and 'langs' (detected
    languages on success, None on failure).
    """

    log_debug(
        'google vision ocr process starting {}'.format(
            app_context.application_context), app_context.application_context)
    try:
        response, langs = process_input(app_context, base_dir)
        # Identity comparison with None per PEP 8 (was `!= None`).
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response,
                'langs': langs
            }
        # Error responses now also carry a 'langs' key so callers can
        # read it unconditionally (it was missing before).
        return {
            'code': 400,
            'message': 'Error occured during google vision ocr',
            'rsp': None,
            'langs': None
        }
    except Exception as e:
        log_exception("Error occured during google vision ocr  ",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during google vision ocr ',
            'rsp': None,
            'langs': None
        }
Example #4
0
def TextDetection(app_context, base_dir=config.BASE_DIR):
    """Detect words/lines/images in the input files and build the response.

    Returns a dict with 'code', 'message' and 'rsp' (the assembled
    response on success, None on failure).
    """

    log_debug(
        'Block merger starting processing {}'.format(
            app_context.application_context), app_context.application_context)

    try:

        words, lines, images = get_text(app_context, base_dir)
        response = get_response(app_context, words, lines, images)

        # Identity comparison with None per PEP 8 (was `!= None`).
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        else:
            return {
                'code': 400,
                'message': 'Error occured during pdf to blocks conversion',
                'rsp': None
            }

    except Exception as e:
        log_exception(
            "Error occured during word detection conversion" + str(e),
            app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during pdf to blocks conversion',
            'rsp': None
        }
Example #5
0
    def post(self):
        """Save file content (pages) for a record.

        Requires 'pages', 'record_id', 'src_lang' and 'tgt_lang' in the
        JSON body and a user id header ('userid' or 'x-user-id');
        responds 400 when anything is missing or the store fails.
        """
        body = request.get_json()
        user_id = request.headers.get('userid')
        if user_id is None:
            user_id = request.headers.get('x-user-id')

        # BUGFIX: use .get() so a missing 'pages' key falls through to the
        # validation below instead of raising KeyError here (the original
        # indexed body['pages'] before checking "'pages' not in body").
        pages = body.get('pages')

        file_locale = ''
        if 'file_locale' in body:
            file_locale = body['file_locale']

        job_id = ''
        if 'job_id' in body:
            job_id = body['job_id']

        record_id = None
        if 'record_id' in body:
            record_id = body['record_id']

        src_lang = None
        if 'src_lang' in body:
            src_lang = body['src_lang']
        tgt_lang = None
        if 'tgt_lang' in body:
            tgt_lang = body['tgt_lang']

        if pages is None or user_id is None or record_id is None or src_lang is None or tgt_lang is None:
            AppContext.addRecordID(record_id)
            log_info(
                'Missing params in FileContentSaveResource {}, user_id:{}'.
                format(body, user_id), AppContext.getContext())
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400

        AppContext.addRecordID(record_id)
        log_info(
            "FileContentSaveResource record_id ({}) for user ({})".format(
                record_id, user_id), AppContext.getContext())

        try:
            # store() reports failure by returning False rather than raising.
            if fileContentRepo.store(user_id, file_locale, record_id, pages,
                                     src_lang, tgt_lang) is False:
                res = CustomResponse(
                    Status.ERR_GLOBAL_MISSING_PARAMETERS.value, None)
                return res.getresjson(), 400
            AppContext.addRecordID(record_id)
            log_info(
                "FileContentSaveResource record_id ({}) for user ({}) saved".
                format(record_id, user_id), AppContext.getContext())
            res = CustomResponse(Status.SUCCESS.value, None)
            return res.getres()
        except Exception as e:
            AppContext.addRecordID(record_id)
            log_exception("FileContentSaveResource ", AppContext.getContext(),
                          e)
            res = CustomResponse(Status.ERR_GLOBAL_MISSING_PARAMETERS.value,
                                 None)
            return res.getresjson(), 400
Example #6
0
    def register_job(self, object_in):
        """Register a new alignment job, push it to the queue, and return
        the job response dict (None when registration itself raises)."""
        job_id = util.generate_job_id()

        try:
            response = {
                "input": object_in,
                "jobID": job_id,
                "status": "STARTED"
            }
            self.update_job_details(response, True)
            prod_res = producer.push_to_queue(response, jsonalign_job_topic)

            # A falsy producer result means the push succeeded.
            if not prod_res:
                return response

            # The producer reported a failure: record it and return the
            # FAILED response carrying the error details.
            self.update_job_status("FAILED", object_in,
                                   prod_res["message"])
            return {
                "input": object_in,
                "jobID": job_id,
                "status": "FAILED",
                "error": prod_res
            }
        except Exception as e:
            log_exception(
                "Exception while registering the alignment job: " + str(e),
                object_in, e)
            return None
Example #7
0
def process_info(app_context, base_dir):
    """Run tesseract OCR preprocessing over every input file.

    Reads each file listed in the application context, preprocesses it
    according to its OCR config, attaches the per-file results (with a
    200 status) to application_context["outputs"], and returns the
    updated application_context. Returns None when any step raises.
    """
    try:
        files = get_files(app_context.application_context)
        # NOTE(review): file_images is never used below.
        file_images = []
        output = []
        for index, file_new in enumerate(files):
            start_time = time.time()
            # Load the parsed JSON representation of this file.
            file = get_json(file_new['file']['name'], base_dir)[0]
            file_properties = File(file)
            ocr_level, lang = get_ocr_config(file_new,
                                             file_properties.get_pages())
            file = preprocess_file(file_properties, lang, ocr_level)
            # Carry the original file/config metadata into the result.
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            output.append(file)
            output[index]['status'] = {
                'code': 200,
                'message': "tesseract ocr successful"
            }
            end_time = time.time()
            # Average extraction time per page for this file.
            extraction_time = (end_time - start_time) / len(
                file_properties.get_pages())
            log_info(
                'tesseract ocr per page completed in {}'.format(
                    extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed tesseract ocr", None)
    except Exception as e:
        log_exception("Error occured during tesseract ocr ",
                      app_context.application_context, e)
        return None

    return app_context.application_context
Example #8
0
def replace_tags_with_original(text, date_original, url_dict, num_array,
                               num_map):
    '''
    Replaces dates, urls and numbers in the text with the original values
    in place of the tags.

    Returns the restored text; on error, returns whatever had been
    restored so far (the tagged text at worst).
    NOTE(review): date_original and num_array are currently unused here —
    confirm against callers.
    '''
    try:
        res = text

        if len(text) == 0:
            return ""

        # BUGFIX: accumulate replacements on `res` instead of restarting
        # from `text` each iteration — the original kept only the last
        # URL tag's replacement.
        for url_tag, url in url_dict.items():
            res = res.replace(url_tag, url)

        log_info("response after url and date replacemnt:{}".format(res),
                 MODULE_CONTEXT)

        if len(num_map) == 0:
            # Handles the case when the model outputs a tag which is not in
            # tagged_src (src is without any number).
            for char in reversed(hindi_numbers):
                res = re.sub(r'NnUuMm' + char, "", res)
        num_map.reverse()
        for item in num_map:
            res = res.replace(item['tag'], str(item['no.']), 1)

        res = remove_extra_tags(res)
        log_info("response after tags replacement:{}".format(res),
                 MODULE_CONTEXT)
        return res
    except Exception as e:
        log_exception(
            "Error in parent except block of replace_tags_with_original_1 function, returning tagged output:{}"
            .format(e), MODULE_CONTEXT, e)
        return res
Example #9
0
    def update_sentence_by_s_id(self, record_id, user_id, sentence):
        """Update one tokenized sentence (matched by s_id) in the record
        owned by `user_id`.

        Returns True on success, False when the write reports an error or
        the DB call raises.
        """
        try:
            collections = get_db()[DB_SCHEMA_NAME]

            # The positional `$` operator updates exactly the array element
            # matched by the $elemMatch condition on s_id.
            # NOTE(review): pymongo's update() is deprecated in favor of
            # update_one(); switching would change the return type checked
            # below, so it is kept as-is here.
            results = collections.update(
                {'$and': [
                    {'record_id': record_id},
                    {'created_by': user_id},
                    {'data.tokenized_sentences': {
                        '$elemMatch': {'s_id': {'$eq': sentence['s_id']}}}}]},
                {'$set': {
                    "data.tokenized_sentences.$.n_id": sentence['n_id'],
                    "data.tokenized_sentences.$.src": sentence['src'],
                    "data.tokenized_sentences.$.tgt": sentence['tgt'],
                    "data.tokenized_sentences.$.save": sentence['save'],
                    "data.tokenized_sentences.$.bleu_score": sentence['bleu_score'],
                    "data.tokenized_sentences.$.time_spent_ms": sentence['time_spent_ms'],
                    "data.tokenized_sentences.$.rating_score": sentence['rating_score']
                }},
                upsert=False)

            # Membership test works directly on the result dict (the
            # original built an intermediate key list; an unused
            # SENTENCE_KEYS constant was also removed).
            if 'writeError' in results:
                return False
            return True
        except Exception as e:
            log_exception("db connection exception ", AppContext.getContext(), e)
            return False
Example #10
0
    def predict_primanet(self, image, craft_coords):
        """Detect layout regions with the prima model, refine them against
        craft coordinates, and merge/remove overlapping regions.

        Returns the final layouts, or None when detection fails.
        (Dead commented-out debug code from the original was removed.)
        """
        try:
            image = cv2.imread(image)
            height, width, channels = image.shape
            layout = model_primalaynet.detect(image)
            bbox, tag, score = self.prima_region(layout)
            # Refine prima regions using the craft word coordinates.
            bbox, tag, score = self.prima_craft_refinement(
                bbox, craft_coords, tag, score)
            layouts = self.update_box_format(bbox, tag, score)
            # Keep merging until no overlapping regions remain.
            flag = True
            while flag:
                layouts, flag = self.merge_remove_overlap(
                    layouts, height, width)
            layouts = cell_layout(layouts, image)

            return layouts
        except Exception as e:
            # NOTE(review): `app_context` is not defined in this scope —
            # presumably a module-level global; verify it exists, otherwise
            # this handler itself raises NameError.
            log_exception("Error occured during prima layout detection ",
                          app_context.application_context, e)
            return None
Example #11
0
def update_num_arr(num_array, zero_prefix_num, i_zero, num_array_orignal):
    '''
  This function handles zero prefix numbers like 09 or 000 which were
  converted to 9 or 0 during processing; it restores them to their
  original zero-prefixed form (in place, returning num_array).

  zero_prefix_num: the numbers that must be transformed back with a zero
  prefix (from 9 to 09, or 0 to 000 originally)
  i_zero: indices of numbers with zero prefix in num_array_orignal

  On any error the untouched copy of the incoming num_array is returned.
  '''
    try:
        # Snapshot for rollback if anything below raises (the original
        # also pointlessly initialised this to None first).
        num_array_backup = num_array[:]

        # Indices in num_array whose value matches any zero-prefix number;
        # np.unique sorts and de-duplicates, matching the original order.
        indices = []
        for value in np.unique(np.array(zero_prefix_num)):
            indices.extend(j for j, m in enumerate(num_array) if m == value)

        # Replace each matched entry with its original zero-prefixed form.
        for k, idx in enumerate(indices):
            num_array[idx] = num_array_orignal[i_zero[k]]
        return num_array
    except Exception as e:
        log_exception(
            "Error in handle_date_url:update_num_arr,returning incoming num_array:{}"
            .format(e), MODULE_CONTEXT, e)
        return num_array_backup
Example #12
0
 def wrapper(*args, **kwargs):
     """Invoke the wrapped `method`, returning None on any exception.

     NOTE(review): `method` and `app_context` are free variables from the
     enclosing decorator scope, which is not visible in this chunk.
     """
     try:
         output = method(*args, **kwargs)
         return output
     except Exception as e:
         log_exception('Error in response generation {}'.format(e), app_context.application_context, e)
         return None
def encode_itranslate_decode(i, src_lang, tgt_lang):
    """Translate one request dict `i` under a target-prefix constraint.

    Expects `i` to carry 'src', 'target_prefix' and a model 'id'. Applies
    preprocessing, BPE encoding and language tags before constrained
    translation, then postprocesses the result. Logs and re-raises on
    failure.
    """
    try:
        # The downstream pipeline operates on lists, so wrap the single
        # input sentence and prefix.
        i["src"] = [i["src"]]
        i["target_prefix"] = [i["target_prefix"]]

        translator = load_models.loaded_models[i["id"]]
        source_bpe = load_models.bpes[i["id"]][0]
        target_bpe = load_models.bpes[i["id"]][1]
        i["src"] = sentence_processor.preprocess(i["src"], src_lang)
        i["src"] = apply_bpe(i["src"], source_bpe)
        # apply bpe to constraints with target bpe
        prefix = apply_bpe(i["target_prefix"], target_bpe)
        i_final = sentence_processor.apply_lang_tags(i["src"], src_lang, tgt_lang)
        translation = translator.translate(i_final, constraints=prefix)
        translation = sentence_processor.postprocess(translation, tgt_lang)
        return translation

    except Exception as e:
        log_exception(
            "Unexpexcted error in encode_itranslate_decode: {} and {}".format(
                e, sys.exc_info()[0]
            ),
            MODULE_CONTEXT,
            e,
        )
        raise
Example #14
0
 def predict_primanet(self, image, craft_coords):
     """Detect layout regions with the prima model and emit them as
     vertex-style bounding boxes.

     Returns a list of dicts with 'identifier', 'boundingBox' (four
     vertices) and 'class'; None on failure.
     """
     try:
         image = cv2.imread(image)
         # Reverse the channel axis (BGR -> RGB) for the detector.
         image = image[..., ::-1]
         layout = model_primalaynet.detect(image)
         boxes, coords, layout_class = self.prima_region(
             layout, craft_coords)
         final_coord = []
         for idx, coord in enumerate(coords):
             temp_dict = {}
             vert = []
             temp_dict['identifier'] = str(uuid.uuid4())
             # coord appears to be (x1, y1, x2, y2); emit the four corners.
             vert.append({'x': coord[0], 'y': coord[1]})
             vert.append({'x': coord[2], 'y': coord[1]})
             vert.append({'x': coord[2], 'y': coord[3]})
             vert.append({'x': coord[0], 'y': coord[3]})
             temp_dict['boundingBox'] = {}
             temp_dict['boundingBox']["vertices"] = vert
             temp_dict['class'] = self.class_mapping(layout_class[idx])
             #temp_dict['text_left']  = coord[0]; temp_dict['text_top'] = coord[1]
             #temp_dict['text_width'] = abs((coord[2]-coord[0])); temp_dict['text_height'] = abs((coord[3]-coord[1]))
             final_coord.append(temp_dict)
         return final_coord
     except Exception as e:
         # NOTE(review): `app_context` is not defined in this scope —
         # presumably a module-level global; verify it exists.
         log_exception("Error occured during prima layout detection ",
                       app_context.application_context, e)
         return None
Example #15
0
    def get_document_total_page_count(self, record_id):
        """Return the highest page_no stored for `record_id`.

        Returns 0 when the record has no pages or the DB call fails.
        """
        # Group all pages of the record and take the maximum page number.
        pipeline = [
            {'$match': {'record_id': record_id}},
            {'$group': {'_id': '$record_id', 'page_count': {'$max': "$page_no"}}},
        ]
        try:
            collection = get_db()[DB_SCHEMA_NAME]
            cursor = collection.aggregate(pipeline)
            # Take the first grouped result, defaulting to 0 when empty.
            first = next(iter(cursor), None)
            return first['page_count'] if first is not None else 0
        except Exception as e:
            log_exception("db connection exception ", AppContext.getContext(),
                          e)
            return 0
Example #16
0
def encode_translate_decode_v2(i):
    """SentencePiece-encode, translate and decode one request dict `i`.

    Expects `i` to carry 'src' and a model 'id'. Returns a tuple of
    (translation, scores, subworded_input, subworded_output). Logs and
    re-raises on failure (ServerModelError is logged separately).
    """
    try:
        log_info("Inside encode_translate_decode_v2 function", MODULE_CONTEXT)
        model_path, sp_encoder, sp_decoder = get_model_path(i['id'])
        translator = load_models.loaded_models[i['id']]
        # Subword-encode the source with the model's SentencePiece encoder.
        i['src'] = sp.encode_line_v2(sp_encoder, i['src'])
        log_info("SP encoded sent: %s" % str(i['src']), MODULE_CONTEXT)
        input_sw = str(i['src'])
        m_out = translator.translate_batch([i['src']],
                                           beam_size=5,
                                           num_hypotheses=1)
        output_sw = " ".join(m_out[0][0]['tokens'])
        log_info("output from model: {}".format(output_sw), MODULE_CONTEXT)
        scores = m_out[0][0]['score']
        # Decode the best hypothesis back to plain text.
        translation = multiple_hypothesis_decoding_v2(m_out[0], sp_decoder)[0]
        log_info("SP decoded sent: %s" % str(translation), MODULE_CONTEXT)
        return translation, scores, input_sw, output_sw
    except ServerModelError as e:
        log_exception(
            "ServerModelError error in encode_translate_decode_v2: {} and {}".
            format(e,
                   sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
    except Exception as e:
        log_exception(
            "Unexpexcted error in encode_translate_decode_v2: {} and {}".
            format(e,
                   sys.exc_info()[0]), MODULE_CONTEXT, e)
        raise
Example #17
0
 def get_nmt_url_body(self, block_translate_input, nmt_txt):
     """Build the NMT request body and resolve the translate URL from the
     model's connection details.

     Falls back to the default Anuvaad translate URL when the
     env-configured host/endpoint are unavailable or lookup fails.
     Returns (url, request_body).
     """
     model = block_translate_input["input"]["model"]
     nmt_in = {
         "src_list": nmt_txt,
         "source_language_code": model["source_language_code"],
         "target_language_code": model["target_language_code"],
         "model_id": model["model_id"]
     }
     try:
         # Connection details name env vars holding the actual host/endpoint.
         host = model["connection_details"]["translation"]["host"]
         api_host = os.environ.get(host, 'NA')
         endpoint = model["connection_details"]["translation"][
             "api_endpoint"]
         api_endpoint = os.environ.get(endpoint, 'NA')
         if api_host == "NA" or api_endpoint == "NA":
             log_info("Falling back to Anuvaad NMT translate URL....",
                      block_translate_input)
             return nmt_translate_url, nmt_in
         url = api_host + api_endpoint
         return url, nmt_in
     except Exception as e:
         # Pass the caught exception to the logger (was None), consistent
         # with error logging elsewhere in this codebase.
         log_exception(
             "Exception while fetching API conn details: {}".format(str(e)),
             block_translate_input, e)
     log_info("Falling back to Anuvaad NMT translate URL....",
              block_translate_input)
     return nmt_translate_url, nmt_in
Example #18
0
def get_segmented_regions(app_context, base_dir):
    """Attach per-page font metadata for each input file to the
    application context.

    Region segmentation itself is currently disabled; only (empty) font
    metadata is set per page. Returns the updated application_context,
    or None on failure. (Dead commented-out code from the original was
    removed.)
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for index, file in enumerate(files):
            file = get_json(base_dir, file['file']['name'])
            file_properties = File(file)
            pages = file_properties.get_pages()
            page_counts = len(pages)
            start_time = time.time()
            for page_index in range(page_counts):
                # Use structured logging instead of the original debug print.
                log_info('processing for page   :  {}'.format(page_index),
                         app_context.application_context)
                font_meta = []
                file_properties.set_font_properties(page_index, font_meta)

            output.append(file_properties.get_file())
            output[index]['status'] = {'message': "block-segmenter successful"}
            end_time = time.time()
            # Guard against a zero-page file (the original divided by
            # page_counts and would have fallen into the except clause).
            extraction_time = (end_time - start_time) / max(page_counts, 1)
            log_info('block segmentation per page completed in {}'.format(extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed block segmentation", None)
    except Exception as e:
        log_exception("Error occured during block segmentation ", app_context.application_context, e)
        return None

    return app_context.application_context
Example #19
0
def process_tokenization_kf():
    """Kafka worker loop for the tokenization service.

    Consumes messages from the configured input topic, runs the workflow
    response generation for each, and pushes successful results to the
    output topic. Consumer/producer failures are reported to the error
    handler and logged.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER =file_ops.file_download(config.download_folder)
    # instatiation of consumer for respective topic
    try:
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_tokenization_kf : trying to receive value from consumer ", None)
        for msg in consumer:
            data = msg.value
            log_info("process_tokenization_kf : received input json from input topic consumer ", data)
            # Task id/start time derived from a millisecond timestamp.
            task_id = str("TOK-" + str(time.time()).replace('.', '')[0:13])
            task_starttime = eval(str(time.time()).replace('.', '')[0:13])
            input_files, workflow_id, jobid, tool_name, step_order, user_id = file_ops.json_input_format(data)
            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime)
            # Only push downstream when the workflow produced no error.
            if "errorID" not in file_value_response.keys():
                producer = Producer()
                producer.push_data_to_queue(config.output_topic, file_value_response, data, task_id)
            else:
                log_error("process_tokenization_kf : error send to error handler", data, None)
    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_tokenization_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message      
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_tokenization_kf : response send to topic %s"%(config.output_topic), data, e)
Example #20
0
def handle_sentences_wo_stop(language, sentence_array):
    '''
    Handles sentences in the array which do not have a sentence
    ending puncuation by adding it. Used in batch translation.
    Returns the (possibly modified, in place) array together with the
    indices of the sentences that were changed.
    '''
    try:
        # Without a language we cannot know the stop punctuation.
        if language is None:
            return sentence_array, []

        log_info("Inside handle_sentences_wo_stop", MODULE_CONTEXT)
        stop_puncs = misc.get_language_stop_puncs(language)
        # By convention the first entry is the full stop / purnviram.
        full_stop = stop_puncs[0]

        modified_indices = []
        for idx in range(len(sentence_array)):
            if misc.is_sentence_wo_stop(sentence_array[idx], stop_puncs):
                modified_indices.append(idx)
                sentence_array[idx] = misc.add_stop_punc(
                    sentence_array[idx], full_stop)

        return sentence_array, modified_indices

    except Exception as e:
        log_exception("Error in handle_sentences_wo_stop: {}".format(e),
                      MODULE_CONTEXT, e)
        return sentence_array, []
Example #21
0
def TesseractOCR(app_context, base_dir=config.BASE_DIR):
    """Run the tesseract OCR pipeline over the application context.

    Returns a dict with 'code', 'message' and 'rsp' (the processed
    context on success, None on failure).
    """

    log_debug(
        'tesseract ocr process starting {}'.format(
            app_context.application_context), app_context.application_context)
    try:
        response = process_info(app_context, base_dir)
        # Identity comparison with None per PEP 8 (was `!= None`).
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        else:
            return {
                'code': 400,
                'message': 'Error occured during tesseract ocr',
                'rsp': None
            }
    except Exception as e:
        log_exception("Error occured during tesseract ocr  ",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during tesseract ocr ',
            'rsp': None
        }
Example #22
0
 def consumer_instantiate(self):
     """Create a KafkaConsumer for this instance's topic.

     Returns the consumer, or None (implicitly) when instantiation fails.
     """
     try:
         consumer = KafkaConsumer(
             self.topic_name,
             bootstrap_servers=list((self.server_address).split(",")),
             auto_offset_reset='latest',
             group_id=config.CONSUMER_GROUP,
             enable_auto_commit=True)
         log_info("consumer_instantiate : Consumer returned for topic: %s"%(self.topic_name), None)
         return consumer
     except Exception as e:
         log_exception("consumer_instantiate : error occured for consumer topic: %s"%(self.topic_name), None, e)
Example #23
0
def save_page_res(res, file_name):
    """Persist an OCR response to the database in page-sized chunks.

    Reshapes res['rsp'] (renaming 'outputs' to 'files'), builds a record
    id from the job id and output file name, and POSTs SAVE_NO_PAGE pages
    at a time to SAVE_URL. Errors are logged, not raised.
    """
    try:
        # Work on a deep copy so the caller's response is not mutated.
        tmp_file = copy.deepcopy(res['rsp'])
        del tmp_file['input']
        tmp_file['files'] = res['rsp']['outputs']
        del tmp_file['outputs']
        json_file_name = file_name['output'][0]['outputFile']
        for file in [tmp_file]:
            recordID = file['jobID'] + '|' + json_file_name
            page_idx = 0
            total_pages = len(file['files'][0]['pages'])
            # Keep only the OCR section of the config.
            file['files'][0]['config'] = copy.deepcopy(
                file['files'][0]['config']['OCR'])
            save_file = copy.deepcopy(file)
            save_file['recordID'] = recordID
            # POST the pages in chunks of SAVE_NO_PAGE.
            while page_idx < total_pages:
                pages = file['files'][0]['pages'][page_idx:page_idx +
                                                  SAVE_NO_PAGE]
                save_file['files'][0]['pages'] = pages
                page_idx = page_idx + SAVE_NO_PAGE
                log_info(
                    "started saving data to database with record id: " +
                    str(recordID), app_context.application_context)
                # NOTE(review): the HTTP response is never checked — a
                # failed save is only detectable via an exception.
                rsp = requests.post(SAVE_URL, json=save_file)
                log_info(
                    "successfully saved data to database with record id: " +
                    str(recordID), app_context.application_context)
    except Exception as e:
        log_exception("Error occured during saving page response",
                      app_context.application_context, e)
Example #24
0
def LayoutDetection(app_context, base_dir=config.BASE_DIR):
    """Run layout detection over the application context.

    Returns a dict with 'code', 'message' and 'rsp' (the detected layout
    response on success, None on failure).
    """

    log_debug(
        'layout detection process starting {}'.format(
            app_context.application_context), app_context.application_context)
    try:

        response = get_layout(app_context)

        # Identity comparison with None per PEP 8 (was `!= None`).
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        else:
            return {
                'code': 400,
                'message': 'Error occured during layout detection',
                'rsp': None
            }
    except Exception as e:
        log_exception("Error occured during layout detection ",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during layout detection ',
            'rsp': None
        }
Example #25
0
def get_response(app_context, words, lines, images):
    """Assemble per-file page outputs from detected words/lines/images and
    attach them to the application context, which is returned.

    A per-file status flag records whether its pages were built cleanly.
    """
    output = []
    files = get_files(app_context.application_context)

    for file_index, file in enumerate(files):
        file_out = FileOutput(file)
        try:
            for page_index, page in enumerate(images[file_index]):
                # Empty detection lists mean no words/lines for any file.
                page_words = words[file_index][page_index] if len(words) != 0 else []
                page_lines = lines[file_index][page_index] if len(lines) != 0 else []
                page_properties = Page(page_words, page_lines, page)
                file_out.set_page(page_properties.get_page())
                file_out.set_page_info(page)
            file_out.set_staus(True)
        except Exception as e:
            file_out.set_staus(False)
            log_exception("Error occured during response generation" + str(e), app_context.application_context, e)

        output.append(file_out.get_file())

    app_context.application_context['outputs'] = output

    return app_context.application_context
Example #26
0
def get_text(path, page_dict, font_info):
    """Run Google Vision document text detection on the image at `path`.

    Optionally whitens light pixels first (config.CLEAN_BACKGROUND),
    presumably to suppress watermarks — the masked copy is written next
    to the original. Returns (page_dict, page_lines) from
    get_document_bounds, or (None, None) on failure.
    """
    try:

        if config.CLEAN_BACKGROUND:
            img = cv2.imread(path)
            # Push all pixel values above 175 to pure white.
            img[175 < img] = 255
            masked_path = path.split('.jpg')[0] + "_watermarks.jpg"
            cv2.imwrite(masked_path, img)
        else:
            masked_path = path

        with io.open(masked_path, 'rb') as image_file:
            content = image_file.read()
        image = vision.types.Image(content=content)
        response = client.document_text_detection(image=image)
        page_dict, page_lines = get_document_bounds(
            response.full_text_annotation, page_dict, font_info)
        return page_dict, page_lines

    except Exception as e:
        # NOTE(review): `app_context` is not defined in this scope —
        # presumably a module-level global; verify it exists.
        log_exception("Error occured during text_extraction  {}".format(e),
                      app_context.application_context, e)
        return None, None
Example #27
0
def process_input(app_context, base_dir):
    """Run Google Vision OCR over every file referenced in the app context.

    For each input file: load its JSON descriptor, obtain its page images
    (reusing pre-processed pages when the descriptor already has
    "page_info", otherwise rasterizing via doc_pre_processing), extract
    text, and collect per-file results and languages.

    Args:
        app_context: request context; its application_context gains an
            "outputs" list on success.
        base_dir: directory holding the per-file JSON descriptors.

    Returns:
        (application_context, langs) on success, (None, None) on failure.
    """
    try:
        files = get_files(app_context.application_context)
        output_files = []
        langs = []
        for file in files:
            file = get_json(base_dir, file['file']['name'])[0]

            file_properties = File(file)
            # Reuse already-extracted pages when present; otherwise
            # rasterize the document into page images first.
            if "page_info" in file:
                page_paths = file_properties.get_pages()
            else:
                page_paths = doc_pre_processing(file['file']['name'],
                                                config.BASE_DIR)

            page_res = text_extraction(file_properties, page_paths, file)
            output_files.append(page_res)
            langs.append(file_properties.get_language())

        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)

    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None

    return app_context.application_context, langs
Example #28
0
def vision_ocr_request_worker():
    """Worker loop: consume OCR jobs from processQueue and publish results.

    Blocks on processQueue, builds a workflow response for each job, and
    pushes successful responses to the configured Kafka output topic.
    Errors are logged; the loop never exits. A token is put on
    controlQueue after every job so the dispatcher can throttle.
    """
    file_ops            = FileOperation()
    DOWNLOAD_FOLDER     = file_ops.create_file_download_dir(config.download_folder)
    producer_tok        = Producer(config.bootstrap_server)
    log_info("vision_ocr_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)

    while True:
        data            = processQueue.get(block=True)
        # Task id derived from the current timestamp with the '.' stripped.
        task_id         = str("vision_ocr" + str(time.time()).replace('.', ''))
        task_starttime  = str(time.time()).replace('.', '')
        input_files, workflow_id, jobid, tool_name, step_order = file_ops.json_input_format(data)

        log_info("vision_ocr_request_worker processing -- received message "+str(jobid), data)

        try:
            response_gen    = Response(data, DOWNLOAD_FOLDER)

            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)
            if file_value_response is not None:
                if "errorID" not in file_value_response.keys():
                    push_output(producer_tok, config.output_topic, file_value_response, jobid, task_id, data)
                    log_info("vision_ocr_request_worker : response send to topic %s"%(config.output_topic), LOG_WITHOUT_CONTEXT)
                else:
                    log_info("vision_ocr_request_worker : error send to error handler", data)

            # BUG FIX: was Queue.qsize() (unbound call on the class, raising
            # TypeError that the except below silently swallowed); query the
            # queue instance instead.
            log_info('vision_ocr_request_worker - request in internal queue {}'.format(processQueue.qsize()), data)
        except Exception as e:
            log_exception("vision_ocr_request_worker ",  LOG_WITHOUT_CONTEXT, e)
        finally:
            # Mark the item done even on failure so processQueue.join()
            # cannot deadlock waiting for a crashed task.
            processQueue.task_done()

        controlQueue.put(1)
Example #29
0
def core_consume():
    """Consume the WFM core Kafka topic and kick off workflows.

    Every message received is handed to WFMService.initiate_wf. A failure
    while handling one message is logged and reported, then consumption
    continues; a failure to start the consumer itself is reported once and
    the function returns.
    """
    try:
        service = WFMService()
        topics = [anu_etl_wfm_core_topic]
        consumer = instantiate(topics)
        # Short random tag so log lines from concurrent consumers can be
        # told apart.
        tag = ''.join(random.choice(string.ascii_letters) for _ in range(4))
        prefix = "WFM-Core-" + "(" + tag + ")"
        log_info(prefix + " | Running..........", None)
        log_info(prefix + " | Topics: " + str(topics), None)
        while True:
            for msg in consumer:
                data = {}
                try:
                    if not msg:
                        continue
                    data = msg.value
                    log_info(
                        prefix + " | Received on Topic: " + msg.topic +
                        " | Partition: " + str(msg.partition), data)
                    service.initiate_wf(data)
                except Exception as e:
                    log_exception(
                        prefix + " | Exception while consuming: " + str(e),
                        data, e)
                    post_error("WFM_CORE_CONSUMER_ERROR",
                               "Exception while consuming: " + str(e), None)
    except Exception as e:
        log_exception(
            "Exception while starting the wfm core consumer: " + str(e), None,
            e)
        post_error("WFM_CONSUMER_ERROR",
                   "Exception while starting wfm core consumer: " + str(e),
                   None)
Example #30
0
def purnaviram_applier(src, tgt):
    '''
    For english to hindi translation: ensure the Hindi target ends with a
    purnaviram (Devanagari danda, U+0964) whenever the English source ends
    with a full stop.

    Returns tgt unchanged when no correction is needed, and the original
    tgt on any error.
    '''
    # BUG FIX: the literal here was mojibake ('ред') of the single
    # character U+0964; that also made the one-character comparison
    # tgt[-1] != <3-char string> below vacuously true.
    purnaviram = '\u0964'
    try:
        # Nothing to do for empty targets; very short sources are often
        # fragments, so leave them untouched.
        if tgt is None or len(tgt.split()) == 0:
            return tgt
        if len(src.split()) < 5:
            return tgt
        if src.endswith('.') and tgt.endswith(purnaviram):
            return tgt
        elif src.endswith('.') and tgt[-1] != purnaviram:
            if tgt.endswith('.'):
                log_info("Replacing '.' with purnaviram", MODULE_CONTEXT)
                tgt = tgt[:-1] + purnaviram
            else:
                log_info("Adding the missing purnaviram", MODULE_CONTEXT)
                tgt = tgt + purnaviram
            return tgt
        else:
            return tgt

    except Exception as e:
        log_exception("Error in purnaviram applier, returning original tgt: {}".format(e), MODULE_CONTEXT, e)
        return tgt