def GoogleVisionOCR(app_context, base_dir=config.BASE_DIR):
    """Run the Google Vision OCR step and wrap its outcome in a status dict.

    Args:
        app_context: Request context. Its ``application_context`` attribute
            is handed to the logging helpers, and the object itself is
            forwarded to ``process_input``.
        base_dir: Directory forwarded to ``process_input``. Defaults to
            ``config.BASE_DIR``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...,
        'langs': ...}`` when ``process_input`` yields a non-``None``
        response. Otherwise (``None`` response or any exception) a dict with
        ``'code': 400`` and ``'rsp': None``; the failure dicts carry no
        ``'langs'`` key.
    """
    log_debug(
        'google vision ocr process starting {}'.format(
            app_context.application_context),
        app_context.application_context)
    try:
        response, langs = process_input(app_context, base_dir)
        # Identity test on purpose: `!= None` would dispatch to the result's
        # own __ne__, which is not guaranteed to return a plain bool.
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response,
                'langs': langs
            }
        return {
            'code': 400,
            'message': 'Error occured during google vision ocr',
            'rsp': None
        }
    except Exception as e:
        # Boundary handler: every failure is logged and reported as a 400
        # payload instead of propagating to the caller.
        log_exception("Error occured during google vision ocr ",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during google vision ocr ',
            'rsp': None
        }
def BlockSegmenter(app_context, base_dir=config.BASE_DIR):
    """Run block segmentation and wrap its outcome in a status dict.

    Args:
        app_context: Request context. Its ``application_context`` attribute
            is handed to the logging helpers, and the object itself is
            forwarded to ``get_segmented_regions``.
        base_dir: Directory forwarded to ``get_segmented_regions``. Defaults
            to ``config.BASE_DIR``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        when ``get_segmented_regions`` yields a non-``None`` result;
        otherwise (``None`` result or any exception) a dict with
        ``'code': 400`` and ``'rsp': None``.
    """
    log_debug(
        'block segmentation process starting {}'.format(
            app_context.application_context),
        app_context.application_context)
    try:
        response = get_segmented_regions(app_context, base_dir)
        # Identity test on purpose: `!= None` would dispatch to the result's
        # own __ne__, which is not guaranteed to return a plain bool.
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        return {
            'code': 400,
            'message': 'Error occured during block segmentation',
            'rsp': None
        }
    except Exception as e:
        log_exception("Error occured during block segmentation ",
                      app_context.application_context, e)
        # The returned message previously named "layout detection", which
        # contradicted the log line above and the rest of this function.
        return {
            'code': 400,
            'message': 'Error occured during block segmentation ',
            'rsp': None
        }
def TextDetection(app_context, base_dir=config.BASE_DIR):
    """Run text detection and wrap the assembled response in a status dict.

    ``get_text`` supplies ``words``, ``lines`` and ``images``; these are
    passed, together with ``app_context``, to ``get_response`` to build the
    payload.

    Args:
        app_context: Request context. Its ``application_context`` attribute
            is handed to the logging helpers, and the object itself is
            forwarded to ``get_text`` and ``get_response``.
        base_dir: Directory forwarded to ``get_text``. Defaults to
            ``config.BASE_DIR``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        when ``get_response`` yields a non-``None`` result; otherwise
        (``None`` result or any exception) a dict with ``'code': 400`` and
        ``'rsp': None``.
    """
    # NOTE(review): the "Block merger" / "pdf to blocks conversion" wording
    # below looks copy-pasted from another step; left unchanged in case
    # clients or log searches depend on the exact text - confirm.
    log_debug(
        'Block merger starting processing {}'.format(
            app_context.application_context),
        app_context.application_context)
    try:
        words, lines, images = get_text(app_context, base_dir)
        response = get_response(app_context, words, lines, images)
        # Identity test on purpose: `!= None` would dispatch to the result's
        # own __ne__, which is not guaranteed to return a plain bool.
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        return {
            'code': 400,
            'message': 'Error occured during pdf to blocks conversion',
            'rsp': None
        }
    except Exception as e:
        log_exception(
            "Error occured during word detection conversion" + str(e),
            app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during pdf to blocks conversion',
            'rsp': None
        }
def TesseractOCR(app_context, base_dir=config.BASE_DIR):
    """Run the Tesseract OCR step and wrap its outcome in a status dict.

    Args:
        app_context: Request context. Its ``application_context`` attribute
            is handed to the logging helpers, and the object itself is
            forwarded to ``process_info``.
        base_dir: Directory forwarded to ``process_info``. Defaults to
            ``config.BASE_DIR``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        when ``process_info`` yields a non-``None`` result; otherwise
        (``None`` result or any exception) a dict with ``'code': 400`` and
        ``'rsp': None``.
    """
    log_debug(
        'tesseract ocr process starting {}'.format(
            app_context.application_context),
        app_context.application_context)
    try:
        response = process_info(app_context, base_dir)
        # Identity test on purpose: `!= None` would dispatch to the result's
        # own __ne__, which is not guaranteed to return a plain bool.
        if response is not None:
            return {
                'code': 200,
                'message': 'request completed',
                'rsp': response
            }
        return {
            'code': 400,
            'message': 'Error occured during tesseract ocr',
            'rsp': None
        }
    except Exception as e:
        # Boundary handler: every failure is logged and reported as a 400
        # payload instead of propagating to the caller.
        log_exception("Error occured during tesseract ocr ",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during tesseract ocr ',
            'rsp': None
        }
def DocumentStructure(app_context, file_name, lang='en', base_dir=config.BASE_DIR):
    """Pre-process a document, analyse its structure and return a status dict.

    Steps, in order: ``doc_pre_processing`` -> ``check_text`` ->
    ``doc_structure_analysis`` -> ``doc_structure_response``.

    NOTE(review): a second ``DocumentStructure`` with an extra ``page_layout``
    parameter is defined later in this source; if both end up in the same
    module the later definition replaces this one - confirm which is intended.

    Args:
        app_context: Request context; only its ``application_context``
            attribute is used here (for logging).
        file_name: Passed to ``doc_pre_processing``.
        lang: Language code passed to ``doc_pre_processing`` and
            ``doc_structure_analysis``. Defaults to ``'en'``.
        base_dir: Directory passed to ``doc_pre_processing``. Defaults to
            ``config.BASE_DIR``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        on success. A dict with ``'code': 400`` and ``'rsp': None`` when
        pre-processing yields ``xml_dfs`` of ``None``, when ``check_text``
        reports zero text blocks, or when the analysis/response stage raises.

    Raises:
        Exception: anything raised by ``doc_pre_processing``,
        ``extract_word_bbox`` or ``check_text`` propagates, because those
        calls run outside the ``try`` block.
    """
    log_debug(
        'Block merger starting processing {}'.format(
            app_context.application_context),
        app_context.application_context)

    (img_dfs, xml_dfs, working_dir, page_width, page_height,
     pdf_bg_img_filepaths, pdf_image_paths) = doc_pre_processing(
         file_name, base_dir, lang)

    # Identity test: `== None` would dispatch to the object's own __eq__,
    # which is not guaranteed to return a plain bool.
    if xml_dfs is None:
        return {
            'code': 400,
            'message': 'Document pre-processing failed, check your installation',
            'rsp': None
        }

    # NOTE(review): the next two lines look like leftover debugging - the
    # result is only printed to stdout and never used, and indexing
    # pdf_image_paths[0] happens outside the try block. Kept as-is because
    # extract_word_bbox may have side effects not visible here; confirm and
    # remove if it is indeed debug residue.
    df = extract_word_bbox(pdf_image_paths[0])
    print(df)

    text_blocks_count = check_text(xml_dfs)
    if text_blocks_count == 0:
        log_info(
            "DocumentStructure : looks like the file is either empty or scanned type, currently we support Class-1 document.",
            app_context.application_context)
        return {
            'code': 400,
            'message': 'looks like the file is of scanned type, currently we support Class-1 document.',
            'rsp': None
        }

    try:
        text_block_dfs, table_dfs, line_dfs, bg_dfs = doc_structure_analysis(
            xml_dfs, img_dfs, working_dir, lang, page_width, page_height,
            pdf_bg_img_filepaths, pdf_image_paths)
        response = doc_structure_response(bg_dfs, text_block_dfs, table_dfs,
                                          line_dfs, page_width, page_height)
        log_info(
            "DocumentStructure : successfully received blocks in json response",
            app_context.application_context)
        return {'code': 200, 'message': 'request completed', 'rsp': response}
    except Exception as e:
        log_exception("Error occured during pdf to blocks conversion",
                      app_context.application_context, e)
        return {
            'code': 400,
            'message': 'Error occured during pdf to blocks conversion',
            'rsp': None
        }
def LayoutDetection(app_context):
    """Run layout detection and report the outcome as a status dict.

    No ``None`` check is applied to the result: whatever ``get_layout``
    returns is reported with code 200.

    Args:
        app_context: Request context. Its ``application_context`` attribute
            is handed to the logging helpers, and the object itself is
            forwarded to ``get_layout``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        when ``get_layout`` returns; a dict with ``'code': 400`` and
        ``'rsp': None`` when it raises.
    """
    start_message = 'layout detection process starting {}'.format(
        app_context.application_context)
    log_debug(start_message, app_context.application_context)

    try:
        layout = get_layout(app_context)
    except Exception as err:
        # Failures are logged and converted into a 400 payload.
        log_exception("Error occured during layout detection ",
                      app_context.application_context, err)
        failure = {
            'code': 400,
            'message': 'Error occured during layout detection ',
            'rsp': None
        }
        return failure
    else:
        success = {
            'code': 200,
            'message': 'request completed',
            'rsp': layout
        }
        return success
def DocumentStructure(app_context, file_name, lang='en', base_dir=config.BASE_DIR, page_layout='single_column'):
    """Build the document-structure response through a composed pipeline.

    The stage functions are combined with ``compose`` and the resulting
    callable is invoked with ``(file_name, base_dir, lang, page_layout)``.

    NOTE(review): an earlier ``DocumentStructure`` with a different signature
    also appears in this source; if both live in one module, this later
    definition is the one that stays bound to the name - confirm.

    Args:
        app_context: Request context; only its ``application_context``
            attribute is used here (for logging).
        file_name: First argument passed to the composed pipeline.
        lang: Language code passed to the pipeline. Defaults to ``'en'``.
        base_dir: Directory passed to the pipeline. Defaults to
            ``config.BASE_DIR``.
        page_layout: Layout hint passed to the pipeline. Defaults to
            ``'single_column'``.

    Returns:
        dict: ``{'code': 200, 'message': 'request completed', 'rsp': ...}``
        when the pipeline returns; a dict with ``'code': 400`` and
        ``'rsp': None`` when building or running it raises.
    """
    context = app_context.application_context
    log_debug('Block merger starting processing {}'.format(context), context)

    try:
        # Stage order is exactly the order handed to compose originally.
        stages = (
            generate_response,
            break_blocks,
            merge_vertically,
            merge_horizontally,
            extract_images_and_text_regions,
        )
        pipeline = compose(*stages)
        blocks = pipeline(file_name, base_dir, lang, page_layout)
    except Exception as err:
        log_exception("Error occured during pdf to blocks conversion",
                      app_context.application_context, err)
        return {
            'code': 400,
            'message': 'Error occured during pdf to blocks conversion',
            'rsp': None
        }
    else:
        return {'code': 200, 'message': 'request completed', 'rsp': blocks}