def process_input(app_context, base_dir):
    """Run text extraction over every input file of a request.

    For each entry returned by ``get_files`` the matching JSON document is
    loaded through ``get_json`` (only its first element is used), wrapped in
    ``File`` and handed to ``text_extraction``. The per-file results are
    stored under ``app_context.application_context["outputs"]``.

    Args:
        app_context: Object exposing a mutable ``application_context`` mapping.
        base_dir: Directory passed to ``get_json`` to locate each file's JSON.

    Returns:
        ``(application_context, langs)`` on success, where ``langs`` holds one
        ``get_language()`` value per processed file. ``(None, None)`` when any
        exception is raised; the exception is logged through ``log_exception``
        and not re-raised.
    """
    try:
        entries = get_files(app_context.application_context)
        output_files = []
        langs = []
        for entry in entries:
            # The request entry only supplies the file name; everything else
            # is read from the JSON document loaded from base_dir.
            file_json = get_json(base_dir, entry['file']['name'])[0]
            file_properties = File(file_json)

            if "page_info" in file_json:
                page_paths = file_properties.get_pages()
            else:
                # NOTE(review): this branch uses config.BASE_DIR while the JSON
                # was loaded from base_dir - confirm both are meant to differ.
                page_paths = doc_pre_processing(file_json['file']['name'],
                                                config.BASE_DIR)

            page_res = text_extraction(file_properties, page_paths, file_json)
            output_files.append(page_res)
            langs.append(file_properties.get_language())

        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller
        # with a (None, None) result instead of propagating.
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None
    return app_context.application_context, langs
def process_input(app_context):
    """Run text extraction over every input file of a request.

    Each entry returned by ``get_files`` is wrapped in ``File``; its page
    paths come from ``get_pages()`` when the entry has a ``"page_info"`` key
    and from ``doc_pre_processing`` otherwise. One result record per file
    (``status``, ``page_info``, ``file``, ``pages``) is collected under
    ``app_context.application_context["outputs"]``.

    Args:
        app_context: Object exposing a mutable ``application_context`` mapping.

    Returns:
        The updated ``application_context`` on success, or ``None`` when any
        exception is raised (it is logged through ``log_exception``).
    """
    try:
        collected = []
        for file in get_files(app_context.application_context):
            file_properties = File(file)
            page_paths = (
                file_properties.get_pages()
                if "page_info" in file.keys()
                else doc_pre_processing(file['file']['name'], config.BASE_DIR)
            )
            page_res = text_extraction(page_paths)
            # Key order matches the record layout built by the original code.
            collected.append({
                "status": {"message": "google-vision ocr run successfully"},
                "page_info": page_paths,
                "file": file,
                "pages": page_res,
            })
        app_context.application_context["outputs"] = collected
        log_info("successfully completed google vision ocr", None)
    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None
    else:
        return app_context.application_context
def preprocess_file(file_properties, lang, ocr_level):
    """OCR every page of a document at word or line granularity.

    The granularity is ``config.OCR_LEVEL[ocr_level]``:

    * ``'words'`` - for each TEXT/TABLE region, the word boxes of every line
      are sent to ``text_extraction`` and stored as that line's ``children``;
      each region's ``children`` are then passed through ``merge_text`` (with
      ``merge_tess_confidence=True``) and finally the page's ``regions`` are
      passed through ``merge_text``.
    * ``'lines'`` - for each TEXT/TABLE region, its line boxes are sent to
      ``text_extraction`` and stored as the region's ``children``; the page's
      ``regions`` are then passed through ``merge_text``.

    Regions of any other class are replaced by a deep copy of themselves.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm the placement of the ``merge_text`` calls.

    Args:
        file_properties: Wrapper giving access to the file, its pages,
            regions, lines and words.
        lang: Language argument forwarded to ``text_extraction``.
        ocr_level: Key into ``config.OCR_LEVEL``.

    Returns:
        The object returned by ``file_properties.get_file()``, updated in place.
    """
    file = file_properties.get_file()
    page_paths = file_properties.get_pages()
    width, height = file_properties.get_pageinfo(0)

    def run_ocr(image_path, boxes, line_height):
        # With config.IS_DYNAMIC set, the boxes first go through
        # coord_adjustment; the boxes object itself is still supplied as the
        # fourth argument either way.
        if config.IS_DYNAMIC:
            adjusted = coord_adjustment(image_path, boxes)
            return text_extraction(lang, image_path, adjusted, boxes,
                                   width, height, line_height)
        return text_extraction(lang, image_path, boxes, boxes,
                               width, height, line_height)

    for page_index, page_path in enumerate(page_paths):
        page_regions = file_properties.get_regions(page_index)
        mode_height = frequent_height(file_properties.get_lines(page_index))

        if config.OCR_LEVEL[ocr_level] == 'words':
            for idx, region in enumerate(page_regions):
                if region['class'] not in ["TEXT", "TABLE"]:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(region)
                else:
                    region_lines = file_properties.get_region_lines(page_index, idx)
                    for line_index, _ in enumerate(region_lines):
                        words = file_properties.get_region_words(
                            page_index, idx, line_index)
                        ocr_words = run_ocr(page_path, words, mode_height)
                        # Resolve the destination only after OCR has run.
                        line_slot = file['pages'][page_index]['regions'][idx][
                            'children'][line_index]
                        line_slot['children'] = ocr_words

                # Merge the region's children into region-level text.
                region_slot = file['pages'][page_index]['regions'][idx]
                region_slot['children'] = merge_text(
                    region_slot['children'], merge_tess_confidence=True)

            page_slot = file['pages'][page_index]
            page_slot['regions'] = merge_text(page_slot['regions'])

        if config.OCR_LEVEL[ocr_level] == 'lines':
            for idx, region in enumerate(page_regions):
                if region['class'] not in ["TEXT", "TABLE"]:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(region)
                else:
                    lines = file_properties.get_region_lines(page_index, idx)
                    ocr_lines = run_ocr(page_path, lines, mode_height)
                    file['pages'][page_index]['regions'][idx]['children'] = ocr_lines

            page_slot = file['pages'][page_index]
            page_slot['regions'] = merge_text(page_slot['regions'])

    return file
def preprocess_file(file_properties, lang, ocr_level):
    """Mask and OCR every page of a document at word or line granularity.

    For each page the result of ``mask_image`` is handed to ``set_bg_image``
    (whose return value replaces ``file``), then regions are OCR'd according
    to ``config.OCR_LEVEL[ocr_level]``:

    * ``'words'`` - for each region whose class is in ``config.ocr_class``,
      the word boxes of every line are sent to ``text_extraction`` and stored
      under that line's ``'regions'`` key; each region's ``'regions'`` are
      then passed through ``merge_text`` (``merge_tess_confidence=True``).
    * ``'lines'`` - for each such region, its line boxes are sent to
      ``text_extraction`` and stored under the region's ``'children'`` key;
      the page's ``'regions'`` are then passed through ``merge_text``.

    Regions of any other class are replaced by a deep copy of themselves.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source - confirm the placement of the ``merge_text`` calls.

    Args:
        file_properties: Wrapper giving access to the file, its pages,
            regions, lines and words.
        lang: Language argument forwarded to ``text_extraction``.
        ocr_level: Key into ``config.OCR_LEVEL``.

    Returns:
        The file object after the last ``set_bg_image`` call, with OCR
        results written into its ``'pages'`` entries.
    """
    file = file_properties.get_file()
    page_paths = file_properties.get_pages()
    width, height = file_properties.get_pageinfo(0)

    def run_ocr(image_path, boxes, region, line_height):
        # Dynamic adjustment applies only to classes in config.DYNAMIC_CLASS.
        # coord_adjustment receives a deep copy, so the boxes passed as the
        # fourth argument are a separate object from the adjusted ones.
        if config.IS_DYNAMIC and region['class'] in config.DYNAMIC_CLASS:
            adjusted = coord_adjustment(image_path, copy.deepcopy(boxes))
            return text_extraction(lang, image_path, adjusted, boxes,
                                   width, height, line_height)
        return text_extraction(lang, image_path, boxes, boxes,
                               width, height, line_height)

    for page_index, page_path in enumerate(page_paths):
        # Fixed: the format string previously had no placeholder, so the page
        # index was never part of the printed message.
        print('processing for page : {}'.format(page_index))
        page_regions = file_properties.get_regions(page_index)

        # Original author's note: masking out images based on word coordinates.
        save_path = mask_image(page_path, page_regions, page_index,
                               file_properties, width, height)
        file = set_bg_image(file, save_path, page_index)
        mode_height = frequent_height(file_properties.get_lines(page_index))

        if config.OCR_LEVEL[ocr_level] == 'words':
            for idx, region in enumerate(page_regions):
                if region['class'] in config.ocr_class:
                    region_lines = file_properties.get_region_lines(page_index, idx)
                    for line_index, _ in enumerate(region_lines):
                        region_words = file_properties.get_region_words(
                            page_index, idx, line_index)
                        region_ocr = run_ocr(page_path, region_words,
                                             region, mode_height)
                        file['pages'][page_index]['regions'][idx]['regions'][
                            line_index]['regions'] = region_ocr
                else:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(region)

                # Merge the region's nested entries into region-level text.
                region_slot = file['pages'][page_index]['regions'][idx]
                region_slot['regions'] = merge_text(
                    region_slot['regions'], merge_tess_confidence=True)

        if config.OCR_LEVEL[ocr_level] == 'lines':
            for idx, region in enumerate(page_regions):
                if region['class'] in config.ocr_class:
                    region_lines = file_properties.get_region_lines(page_index, idx)
                    region_ocr = run_ocr(page_path, region_lines,
                                         region, mode_height)
                    # NOTE(review): the words branch writes under 'regions'
                    # but this branch writes under 'children' - confirm which
                    # key downstream consumers expect.
                    file['pages'][page_index]['regions'][idx]['children'] = region_ocr
                else:
                    file['pages'][page_index]['regions'][idx] = copy.deepcopy(region)

            file['pages'][page_index]['regions'] = merge_text(
                file['pages'][page_index]['regions'])

        # app_context is not a parameter here; it is resolved from the
        # enclosing module scope.
        log_info("successfully completed ocr for page {}".format(page_index),
                 app_context.application_context)

    return file