def vision_ocr_request_worker():
    """Worker loop that runs the vision OCR workflow for queued requests.

    Blocks on ``processQueue`` for the next request, runs
    ``Response.workflow_response`` on it and, when the result is not ``None``
    and carries no ``"errorID"`` key, pushes it to ``config.output_topic``.

    After every request -- successful or not -- the queue item is marked done
    and a token is put on ``controlQueue`` (presumably to let the Kafka
    consumer fetch the next message; verify against the consumer side).

    Intended as a thread target: it loops forever and never returns.
    Exceptions raised while handling a request are logged and swallowed so
    the worker keeps running.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)
    log_info("vision_ocr_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)

    while True:
        data = processQueue.get(block=True)
        try:
            task_id = "vision_ocr" + str(time.time()).replace('.', '')
            task_starttime = str(time.time()).replace('.', '')
            # Parsed inside the try so a malformed request is logged instead
            # of killing the worker thread.
            input_files, workflow_id, jobid, tool_name, step_order = \
                file_ops.json_input_format(data)
            log_info("vision_ocr_request_worker processing -- received message " + str(jobid), data)

            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)

            if file_value_response is not None:
                if "errorID" not in file_value_response.keys():
                    push_output(producer_tok, config.output_topic, file_value_response,
                                jobid, task_id, data)
                    log_info("vision_ocr_request_worker : response send to topic %s" % (config.output_topic),
                             LOG_WITHOUT_CONTEXT)
                else:
                    log_info("vision_ocr_request_worker : error send to error handler", data)

            # Report the size of the queue this worker actually drains.
            log_info('vision_ocr_request_worker - request in internal queue {}'.format(processQueue.qsize()),
                     data)
        except Exception as e:
            log_exception("vision_ocr_request_worker ", LOG_WITHOUT_CONTEXT, e)
        finally:
            # Always balance the get() above, even on failure, so that
            # processQueue.join() cannot hang on a failed request.
            processQueue.task_done()
            # Signal that this request is finished, whatever the outcome.
            controlQueue.put(1)
def process_block_segmenter_kf():
    """Consume Kafka messages and hand valid ones to the internal ``Queue``.

    Obtains a consumer from ``consumer_validator()`` and iterates over it.
    Each message value is decoded once with ``Consumer.get_json_data``;
    messages that decode to ``None`` are logged and skipped, all others are
    put on ``Queue``.

    ``KafkaConsumerError`` and ``KafkaProducerError`` raised inside the
    consume loop are reported through ``file_ops.error_handler`` and logged;
    other exceptions propagate to the caller.
    """
    file_ops = FileOperation()
    # The next two values are not used below; the calls are kept in case they
    # have side effects (e.g. creating the download directory) -- TODO confirm.
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)

    try:
        # Instantiate the consumer for the configured topic.
        consumer = consumer_validator()
        log_info("process_document_segmenter_kf : trying to receive value from consumer ",
                 LOG_WITHOUT_CONTEXT)

        for msg in consumer:
            # Decode once and reuse the result for both validation and use.
            data = Consumer.get_json_data(msg.value)
            if data is None:
                log_info('process_document_segmenter_kf - received invalid data {}'.format(msg.value),
                         None)
                continue

            jobid = data['jobID']
            log_info('process_document_segmenter_kf - received message from kafka, dumping into internal queue',
                     data)
            input_files, workflow_id, jobid, tool_name, step_order = \
                file_ops.json_input_format(data)

            Queue.put(data)
            log_info('process_document_segmenter_kf - request in internal queue {}'.format(Queue.qsize()),
                     data)
            # TODO: reject Kafka requests when the internal queue grows too large.

    except KafkaConsumerError as e:
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_layout_detector_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = {}
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_layout_detector_kf : response send to topic %s" % (config.output_topic),
                      None, e)
def process_vision_ocr_kf():
    """Feed Kafka messages to ``processQueue`` one at a time.

    Loops forever: waits for a token on ``controlQueue``, then reads from the
    Kafka consumer until one message decodes to valid JSON, commits the
    consumer offset, puts the decoded payload on ``processQueue`` and goes
    back to waiting for the next token. Messages that decode to ``None`` are
    logged and skipped without consuming another token.

    ``KafkaConsumerError`` and ``KafkaProducerError`` are reported through
    ``file_ops.error_handler`` and logged, which ends the loop; other
    exceptions propagate to the caller.
    """
    file_ops = FileOperation()
    # The next two values are not used below; the calls are kept in case they
    # have side effects (e.g. creating the download directory) -- TODO confirm.
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)

    try:
        # Instantiate the consumer for the configured topic.
        consumer = consumer_validator()
        log_info("process_vision_ocr_kf : trying to receive value from consumer ", LOG_WITHOUT_CONTEXT)

        while True:
            # Block until a token is available; the token value is not used.
            controlQueue.get(block=True)

            for msg in consumer:
                # Decode once and reuse the result for both validation and use.
                data = Consumer.get_json_data(msg.value)
                if data is None:
                    log_info('process_vision_ocr_kf - received invalid data {}'.format(msg.value), None)
                    continue

                # Commit right away so this message is not delivered again.
                consumer.commit()
                # Optional sanity check that the commit went through.
                print('New Kafka offset: %s' % consumer.committed(
                    TopicPartition(config.input_topic, msg.partition)))

                jobid = data['jobID']
                log_info('process_vision_ocr_kf - received message from kafka, dumping into internal queue',
                         data)
                input_files, workflow_id, jobid, tool_name, step_order = \
                    file_ops.json_input_format(data)

                processQueue.put(data)
                # Report the size of the queue the message was put on.
                log_info('process_vision_ocr_kf - request in internal queue {}'.format(processQueue.qsize()),
                         data)
                # Hand over exactly one message per control token.
                break
                # TODO: reject Kafka requests when the internal queue grows too large.

    except KafkaConsumerError as e:
        response_custom = {}
        response_custom['message'] = str(e)
        file_ops.error_handler(response_custom, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_vision_ocr_kf : Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = {}
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_vision_ocr_kf : response send to topic %s" % (config.output_topic), None, e)
def process_merger_kf():
    """Consume Kafka requests and run the block merger on a thread per message.

    Instantiates a consumer for ``config.input_topic`` and, for every message,
    builds a ``Response`` and starts a thread running
    ``Response.multi_thred_block_merger(task_id, task_starttime, jobid)``.
    Threads are named ``BM-thread-<n>`` where ``n`` counts the threads
    started so far.

    A failure while handling a single message is logged and the loop moves on
    to the next message. ``KafkaConsumerError`` and ``KafkaProducerError``
    are reported through ``file_ops.error_handler`` and logged.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)

    try:
        # Instantiate the consumer for the configured topic.
        consumer_class = Consumer(config.input_topic, config.bootstrap_server)
        consumer = consumer_class.consumer_instantiate()
        log_info("process_merger_kf", "trying to receive value from consumer", None)

        thread_instance = 0
        for msg in consumer:
            # Bound before the try so the except handler below can always
            # reference it, even when parsing the request is what failed.
            jobid = None
            try:
                data = msg.value
                task_id = "BM-" + str(time.time()).replace('.', '')
                task_starttime = str(time.time()).replace('.', '')
                input_files, workflow_id, jobid, tool_name, step_order = \
                    file_ops.json_input_format(data)
                log_info("process_merger_kf", "kafka request arrived ", jobid)

                response_gen = Response(data, DOWNLOAD_FOLDER)
                t1 = threading.Thread(
                    target=response_gen.multi_thred_block_merger,
                    args=(task_id, task_starttime, jobid),
                    name='BM-thread-' + str(thread_instance),
                )
                t1.start()
                thread_instance += 1
                log_info("multithread", "block-merger running on multithread", None)
            except Exception as e:
                log_exception("process_pdf_kf", "exception while consuming the records", jobid, e)

    except KafkaConsumerError as e:
        response_custom = CustomResponse(Status.ERR_STATUS.value, None, None)
        response_custom.status_code['message'] = str(e)
        file_ops.error_handler(response_custom.status_code, "KAFKA_CONSUMER_ERROR", True)
        log_exception("process_pdf_kf", "Consumer didn't instantiate", None, e)
    except KafkaProducerError as e:
        response_custom = e.code
        response_custom['message'] = e.message
        file_ops.error_handler(response_custom, "KAFKA_PRODUCER_ERROR", True)
        log_exception("process_pdf_kf", "response send to topic %s" % (config.output_topic),
                      response_custom['jobID'], e)
def word_detector_request_worker():
    """Worker loop that runs the word-detector workflow for queued requests.

    Blocks on ``Queue`` for the next request and runs
    ``Response.workflow_response`` on it. A non-``None`` result is pushed to
    ``config.output_topic``; a ``None`` result produces a 400 error object
    that is pushed to ``config.KAFKA_ANUVAAD_ETL_WF_ERROR_TOPIC``.

    Intended as a thread target: it loops forever and never returns.
    Exceptions raised while handling a request are logged and swallowed so
    the worker keeps running, and the queue item is always marked done.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)
    log_info("word_detector_request_worker : starting thread ", LOG_WITHOUT_CONTEXT)

    while True:
        data = Queue.get(block=True)
        try:
            task_id = "word_detector" + str(time.time()).replace('.', '')
            task_starttime = str(time.time()).replace('.', '')
            # Parsed inside the try so a malformed request is logged instead
            # of killing the worker thread.
            input_files, workflow_id, jobid, tool_name, step_order = \
                file_ops.json_input_format(data)
            log_info("word_detector_request_worker processing -- received message " + str(jobid), data)

            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)

            if file_value_response is not None:
                push_output(producer_tok, config.output_topic, file_value_response,
                            jobid, task_id, data)
                log_info("word_detector_request_worker : response send to topic %s" % (config.output_topic),
                         LOG_WITHOUT_CONTEXT)
            else:
                erro_obj = {
                    'code': 400,
                    'jobID': jobid,
                    'message': "Word detector failed",
                }
                producer_tok.push_data_to_queue(config.KAFKA_ANUVAAD_ETL_WF_ERROR_TOPIC, erro_obj)
                log_info("word_detector_request_worker : error send to error handler", data)

            log_info('word_detector_request_worker - request in internal queue {}'.format(Queue.qsize()),
                     data)
        except Exception as e:
            log_exception("word_detector_request_worker ", LOG_WITHOUT_CONTEXT, e)
        finally:
            # Always balance the get() above, even on failure, so that
            # Queue.join() cannot hang on a failed request.
            Queue.task_done()
def block_merger_request_worker():
    """Worker loop that runs the block-merger workflow for queued requests.

    Blocks on ``blockMergerQueue`` for the next request and runs
    ``Response.workflow_response`` on it. When the result is not ``None`` and
    carries no ``"errorID"`` key it is pushed to ``config.output_topic``.

    Intended as a thread target: it loops forever and never returns normally.
    Exceptions raised while handling a request are not caught here and
    propagate out of the loop; the queue item is still marked done first.
    """
    file_ops = FileOperation()
    DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder)
    producer_tok = Producer(config.bootstrap_server)

    while True:
        data = blockMergerQueue.get(block=True)
        try:
            task_id = "BM-" + str(time.time()).replace('.', '')
            task_starttime = str(time.time()).replace('.', '')
            input_files, workflow_id, jobid, tool_name, step_order = \
                file_ops.json_input_format(data)
            log_info("block_merger_request_worker processing -- received message " + str(jobid), data)

            response_gen = Response(data, DOWNLOAD_FOLDER)
            file_value_response = response_gen.workflow_response(task_id, task_starttime, False)

            if file_value_response is not None:
                if "errorID" not in file_value_response.keys():
                    push_output(producer_tok, config.output_topic, file_value_response,
                                jobid, task_id)
                    log_info("process_block_merger_kf : response send to topic %s" % (config.output_topic),
                             None)
                else:
                    log_info("process_block_merger_kf : error send to error handler", jobid)

            log_info('block_merger_request_worker - request in internal queue {}'.format(
                blockMergerQueue.qsize()), jobid)
        finally:
            # Always balance the get() above, even when processing raised, so
            # that blockMergerQueue.join() cannot hang on a failed request.
            blockMergerQueue.task_done()
from flask.json import jsonify from flask import request from src.utilities.utils import FileOperation from src.resources.response_gen import Response from src.errors.error_validator import ValidationResponse from src.errors.errors_exception import FormatError from src.utilities.model_response import Status from anuvaad_auditor.loghandler import log_info from anuvaad_auditor.loghandler import log_error import config import time import src.utilities.app_context as app_context # sentence block merging file_ops = FileOperation() DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder) # rest request for block merging workflow service class Tesseract_OCR_WF(Resource): # reading json request and reurnung final response def post(self): json_data = request.get_json(force=True) app_context.init() app_context.application_context = json_data log_info("Resource Tesseract_OCR_WF Tesseract_OCR service started", app_context.application_context) task_id = str("TESSOCR-" + str(time.time()).replace('.', '')[0:13]) task_starttime = eval(str(time.time()).replace('.', '')[0:13]) #json_data = request.get_json(force = True)
def process_vision_ocr_kf(): file_ops = FileOperation() DOWNLOAD_FOLDER = file_ops.create_file_download_dir(config.download_folder) producer_tok = Producer(config.bootstrap_server)