def main():
    """Merge the layouts of identically named pages from several directories.

    The pages to process are the ``.xml`` files found in the first input
    directory. For each of them the Page XML and the matching ``.logits``
    file are loaded from every input directory, the loaded layouts are passed
    to ``merge_layouts`` and the first layout in the list is then written to
    the output directory together with its logits.

    Directories from which a page cannot be loaded are reported and left out
    of the merge. A page that cannot be loaded from any directory is reported
    and skipped instead of aborting the whole run.
    """
    args = parse_arguments()
    create_dir_if_not_exists(args.output_path)
    input_paths = args.input_paths
    files_to_process = [
        f for f in os.listdir(input_paths[0])
        if os.path.splitext(f)[1].lower() == '.xml'
    ]
    print('input_paths', input_paths)
    arabic_helper = ArabicHelper()

    for xml_file_name in files_to_process:
        print(xml_file_name)
        logits_file_name = os.path.splitext(xml_file_name)[0] + '.logits'

        input_layouts = []
        for input_path in input_paths:
            try:
                page_layout = PageLayout(
                    file=os.path.join(input_path, xml_file_name))
                page_layout.load_logits(
                    os.path.join(input_path, logits_file_name))
                input_layouts.append(page_layout)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception as e:
                print(
                    f'ERROR: Failed to load Page XML or .logit file "{xml_file_name}" from "{input_path}".'
                )
                print(e)
                traceback.print_exc()

        # Without this guard an unreadable page would raise IndexError below
        # and stop the processing of all remaining pages.
        if not input_layouts:
            print(
                f'ERROR: No layout could be loaded for "{xml_file_name}". Skipping file.'
            )
            continue

        # The return value is not used; the first layout is taken as the
        # merged result afterwards.
        merge_layouts(input_layouts)
        merged_layout = input_layouts[0]

        if args.min_confidence > 0:
            # Lines without a confidence value are dropped as well.
            for region in merged_layout.regions:
                region.lines = [
                    line for line in region.lines
                    if line.transcription_confidence
                    and line.transcription_confidence > args.min_confidence
                ]

        if args.fix_arabic_order:
            for line in merged_layout.lines_iterator():
                if arabic_helper.is_arabic_line(line.transcription):
                    line.transcription = arabic_helper.label_form_to_string(
                        line.transcription)

        merged_layout.to_pagexml(os.path.join(args.output_path, xml_file_name))
        merged_layout.save_logits(
            os.path.join(args.output_path, logits_file_name))
def process_page(self, img: np.ndarray, page_layout: PageLayout):
    """Replace the regions of ``page_layout`` with regions detected in ``img``.

    Every polygon returned by ``SimpleThresholdRegion._compute_layout``
    becomes one ``RegionLayout`` whose id is ``r-<index>``. The updated
    ``page_layout`` is returned.
    """
    detected_regions = []
    for number, outline in enumerate(SimpleThresholdRegion._compute_layout(img)):
        # The second axis is reversed -- presumably to swap the coordinate
        # order of each point; confirm against _compute_layout.
        flipped_outline = outline[:, ::-1]
        detected_regions.append(RegionLayout(f'r-{number}', flipped_outline))
    page_layout.regions = detected_regions
    return page_layout
def read_page_xml(path):
    """Load the Page XML file at ``path`` into a ``PageLayout``.

    Returns the loaded layout, or ``None`` after printing a warning when the
    file cannot be loaded. ``KeyboardInterrupt`` and ``SystemExit`` are not
    intercepted, so the user can still abort a long-running batch.
    """
    try:
        return PageLayout(file=path)
    except Exception:
        print(f'Warning: unable to load page xml "{path}"')
        return None
def process_page(self, image, page_layout: PageLayout):
    """Reorder the regions of ``page_layout`` and return the layout.

    Layouts with fewer than two regions are returned untouched. Otherwise the
    rotation is taken from the lines of the region that has the most lines,
    the layout is rotated by the negative of that rotation, the regions are
    ordered through ``CoupledRegions`` and the layout is rotated back.
    ``image`` is not used.
    """
    if len(page_layout.regions) < 2:
        return page_layout

    # The region with the most lines provides the rotation estimate.
    largest_region = max(page_layout.regions, key=lambda reg: len(reg.lines))
    rotation = SmartRegionSorter.get_rotation(largest_region.lines)
    page_layout = SmartRegionSorter.rotate_page_layout(page_layout, -rotation)

    wrapped = [Region(layout_region) for layout_region in page_layout.regions]
    coupled = CoupledRegions(wrapped, intersect_param=self.intersect_param)
    coupled.divide_and_order()

    # Map every ordered region id back to the first region carrying that id.
    reordered = []
    for wanted_id in coupled.get_ordered_ids():
        position = next(
            pos for pos, candidate in enumerate(page_layout.regions)
            if candidate.id == wanted_id)
        reordered.append(page_layout.regions[position])
    page_layout.regions = reordered

    # Undo the rotation applied before sorting.
    return SmartRegionSorter.rotate_page_layout(page_layout, rotation)
def process_page(self, image, page_layout: PageLayout):
    """Reorder the regions of ``page_layout`` with ``NaiveRegionSorter``.

    The tolerance passed to the sorter is the image width (``image.shape[1]``)
    integer-divided by ``self.width_denom``. The updated layout is returned.
    """
    wrapped = [Region(layout_region) for layout_region in page_layout.regions]
    tolerance = image.shape[1] // self.width_denom
    new_order = NaiveRegionSorter.sort_regions(wrapped, tolerance)

    reordered = []
    for position in new_order:
        reordered.append(page_layout.regions[position])
    page_layout.regions = reordered
    return page_layout
def __call__(self, page_layout: PageLayout, file_id):
    """Store the crops of all transcribed lines of a page in the database.

    Lines are visited in the order of their ids inside one write transaction
    of ``self.env_out``. For every line with a non-empty transcription the
    crop is JPEG-encoded (quality 95) and stored under the key
    ``<file_id>-<line id>.jpg``, the key and transcription are printed to
    ``self.file`` and ``self.data_size`` grows by the encoded size.
    """
    with self.env_out.begin(write=True) as txn_out:
        cursor = txn_out.cursor()
        ordered_lines = sorted(page_layout.lines_iterator(),
                               key=lambda text_line: text_line.id)
        for text_line in ordered_lines:
            if not text_line.transcription:
                continue
            record_key = f'{file_id}-{text_line.id}.jpg'
            encode_result = cv2.imencode(
                '.jpg', text_line.crop.astype(np.uint8),
                [int(cv2.IMWRITE_JPEG_QUALITY), 95])
            jpeg_bytes = encode_result[1].tobytes()
            print(record_key, text_line.transcription, file=self.file)
            self.data_size += len(jpeg_bytes)
            cursor.put(record_key.encode(), jpeg_bytes)
def __call__(self, page_layout: PageLayout, file_id):
    """Store the crops of all transcribed lines of a page in the database.

    All crops are JPEG-encoded (quality 95) first, in the order of the line
    ids, and only then written in a single write transaction of
    ``self.env_out``. Keys have the form ``<file_id>-<line id>.jpg``; lines
    with an empty transcription are left out.
    """
    ordered_lines = sorted(page_layout.lines_iterator(),
                           key=lambda text_line: text_line.id)

    encoded_crops = {}
    for text_line in ordered_lines:
        if not text_line.transcription:
            continue
        encode_result = cv2.imencode(
            '.jpg', text_line.crop.astype(np.uint8),
            [int(cv2.IMWRITE_JPEG_QUALITY), 95])
        encoded_crops[f'{file_id}-{text_line.id}.jpg'] = encode_result[1].tobytes()

    # The transaction is opened only after all encoding work is done.
    with self.env_out.begin(write=True) as txn_out:
        cursor = txn_out.cursor()
        for record_key, jpeg_bytes in encoded_crops.items():
            cursor.put(record_key.encode(), jpeg_bytes)
def _report_failed_processing(session, config, page_id, engine_version,
                              failure_type, extra_headers=None):
    """Post the traceback of the exception being handled to the server.

    Must be called from inside an ``except`` block. The traceback is always
    sent UTF-8 encoded. The headers are built in a fresh dictionary so that
    they never leak into other requests.
    """
    failure_headers = {
        'api-key': config['SETTINGS']['api_key'],
        'type': failure_type,
        'engine-version': engine_version
    }
    if extra_headers:
        failure_headers.update(extra_headers)
    session.post(join_url(config['SERVER']['base_url'],
                          config['SERVER']['post_failed_processing'],
                          page_id),
                 data=traceback.format_exc().encode('utf-8'),
                 headers=failure_headers)


def main():
    """Run the processing worker: fetch pages from the server and upload OCR.

    The configuration is read from ``args.config`` (``config.ini`` when not
    given); the API key and preferred engine may be overridden from the
    command line. The worker then repeatedly asks the server for a processing
    request, downloads and decodes the page image, runs the page parser and
    either uploads the ALTO, Page XML and plain-text results or, in test
    mode, writes them to ``args.test_path``. Download, decoding and
    processing failures are reported to the server and the worker goes on
    with the next request.

    The loop ends when ``args.time_limit`` (hours, if positive) has elapsed,
    or when no request was obtained and ``args.exit_on_done`` is set;
    otherwise the worker sleeps 10 seconds before asking again.
    """
    args = get_args()
    start_time = time.time()

    config = configparser.ConfigParser()
    if args.config is not None:
        config.read(args.config)
    else:
        config.read('config.ini')

    Path(config['SETTINGS']['engines_path']).mkdir(parents=True, exist_ok=True)

    if args.api_key is not None:
        config["SETTINGS"]['api_key'] = args.api_key
    if args.engine is not None:
        # The value is read from the same attribute that is tested; config
        # values have to be strings.
        config["SETTINGS"]['preferred_engine'] = str(args.engine)

    arabic_helper = ArabicHelper()

    with requests.Session() as session:
        # Used for every request to the server. It is never reassigned, so
        # headers of failure reports and uploads cannot leak into later
        # requests.
        headers = {'api-key': config['SETTINGS']['api_key']}
        page_parser, engine_name, engine_version = get_engine(
            config, headers, config["SETTINGS"]['preferred_engine'])

        while True:
            if args.time_limit > 0 and args.time_limit * 3600 < time.time() - start_time:
                break

            try:
                r = session.get(join_url(
                    config['SERVER']['base_url'],
                    config['SERVER']['get_processing_request'],
                    config['SETTINGS']['preferred_engine']),
                    headers=headers)
            except requests.exceptions.ConnectionError:
                status = 'failed'
            else:
                if r.status_code == 200:
                    request = r.json()
                    status = request['status']
                else:
                    status = 'failed'

            if status != 'success':
                if args.exit_on_done:
                    break
                time.sleep(10)
                continue

            page_id = request['page_id']
            page_url = request['page_url']
            engine_id = request['engine_id']
            if engine_id != int(config['SETTINGS']['preferred_engine']):
                page_parser, engine_name, engine_version = get_engine(
                    config, headers, engine_id)
                config['SETTINGS']['preferred_engine'] = str(engine_id)

            # Download image from url.
            try:
                req = Request(page_url)
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
                )
                req.add_header(
                    'Accept',
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                )
                # The API key is sent only to the processing server itself.
                if config['SERVER']['base_url'] in page_url:
                    req.add_header('api-key', config['SETTINGS']['api_key'])
                page = urlopen(req).read()
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(session, config, page_id,
                                          engine_version, 'NOT_FOUND')
                continue

            # Decode image
            try:
                encoded_img = np.frombuffer(page, dtype=np.uint8)
                image = cv2.imdecode(encoded_img, flags=cv2.IMREAD_ANYCOLOR)
                if image is None:
                    # imdecode signals undecodable data by returning None.
                    raise ValueError('Unable to decode the downloaded image.')
                if len(image.shape) == 2:
                    # Expand a single-channel image to three channels.
                    image = np.stack([image, image, image], axis=2)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(session, config, page_id,
                                          engine_version, 'INVALID_FILE')
                continue

            # Process image
            try:
                page_layout = PageLayout(id=page_id,
                                         page_size=(image.shape[0],
                                                    image.shape[1]))
                page_layout = page_parser.process_page(image, page_layout)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(
                    session, config, page_id, engine_version,
                    'PROCESSING_FAILED',
                    extra_headers={
                        'hostname': socket.gethostname(),
                        'ip-address': socket.gethostbyname(socket.gethostname())
                    })
                continue

            ocr_processing = create_ocr_processing_element(
                id="IdOcr",
                software_creator_str="Project PERO",
                software_name_str="{}".format(engine_name),
                software_version_str="{}".format(engine_version),
                processing_datetime=None)
            # The ALTO output is generated before the layout is modified
            # below; it receives the confidence threshold as a parameter.
            alto_xml = page_layout.to_altoxml_string(
                ocr_processing=ocr_processing,
                min_line_confidence=args.min_confidence)

            if args.min_confidence > 0:
                # Lines without a confidence value are dropped as well.
                for region in page_layout.regions:
                    region.lines = [
                        line for line in region.lines
                        if line.transcription_confidence
                        and line.transcription_confidence > args.min_confidence
                    ]

            for line in page_layout.lines_iterator():
                if arabic_helper.is_arabic_line(line.transcription):
                    line.transcription = arabic_helper.label_form_to_string(
                        line.transcription)

            page_xml = page_layout.to_pagexml_string()
            text = get_page_layout_text(page_layout)

            if args.test_mode:
                # Written explicitly as UTF-8 so that the result does not
                # depend on the locale of the machine.
                outputs = (('{}_alto.xml'.format(page_id), alto_xml),
                           ('{}_page.xml'.format(page_id), page_xml),
                           ('{}.txt'.format(page_id), text))
                for file_name, content in outputs:
                    with open(os.path.join(args.test_path, file_name), "w",
                              encoding='utf-8') as file:
                        file.write(content)
            else:
                upload_headers = {
                    'api-key': config['SETTINGS']['api_key'],
                    'engine-version': engine_version,
                    'score': str(get_score(page_layout))
                }
                session.post(
                    join_url(config['SERVER']['base_url'],
                             config['SERVER']['post_upload_results'],
                             page_id),
                    files={
                        'alto': ('{}_alto.xml'.format(page_id), alto_xml,
                                 'text/plain'),
                        'page': ('{}_page.xml'.format(page_id), page_xml,
                                 'text/plain'),
                        'txt': ('{}.txt'.format(page_id), text, 'text/plain')
                    },
                    headers=upload_headers)
def __call__(self, image_file_name, file_id, index, ids_count):
    """Process one page and return the annotations of its transcribed lines.

    The image (when an image directory is configured) and optionally an
    existing Page XML and logits are loaded, the page parser is run and every
    configured output (Page XML, render, logits, ALTO, line crops) is
    written. ``index`` and ``ids_count`` are used only for the progress
    message.

    Returns a list of ``"<file_id>-<line id>.jpg <transcription>"`` strings
    ordered by line id. Any error other than a keyboard interrupt is printed
    and whatever annotations were collected up to that point are returned.
    """
    print(f"Processing {file_id}")
    started = time.time()
    annotations = []

    try:
        image = None
        if self.input_image_path is not None:
            image_path = os.path.join(self.input_image_path, image_file_name)
            image = cv2.imread(image_path, 1)
            if image is None:
                raise Exception(f'Unable to read image "{image_path}"')

        if self.input_xml_path:
            xml_path = os.path.join(self.input_xml_path, file_id + '.xml')
            page_layout = PageLayout(file=xml_path)
        else:
            height, width = image.shape[0], image.shape[1]
            page_layout = PageLayout(id=file_id, page_size=(height, width))

        if self.input_logit_path is not None:
            page_layout.load_logits(
                os.path.join(self.input_logit_path, file_id + '.logits'))

        page_layout = self.page_parser.process_page(image, page_layout)

        if self.output_xml_path is not None:
            page_layout.to_pagexml(
                os.path.join(self.output_xml_path, file_id + '.xml'))

        if self.output_render_path is not None:
            page_layout.render_to_image(image)
            render_path = os.path.join(self.output_render_path,
                                       file_id + '.jpg')
            cv2.imwrite(render_path, image,
                        [int(cv2.IMWRITE_JPEG_QUALITY), 70])

        if self.output_logit_path is not None:
            page_layout.save_logits(
                os.path.join(self.output_logit_path, file_id + '.logits'))

        if self.output_alto_path is not None:
            page_layout.to_altoxml(
                os.path.join(self.output_alto_path, file_id + '.xml'))

        if self.output_line_path is not None and page_layout is not None:
            if 'lmdb' in self.output_line_path:
                # A path containing "lmdb" selects the database writer.
                LMDB_writer(self.output_line_path)(page_layout, file_id)
            else:
                for region in page_layout.regions:
                    for text_line in region.lines:
                        crop_path = os.path.join(
                            self.output_line_path,
                            f'{file_id}-{text_line.id}.jpg')
                        cv2.imwrite(crop_path,
                                    text_line.crop.astype(np.uint8),
                                    [int(cv2.IMWRITE_JPEG_QUALITY), 98])

        ordered_lines = sorted(page_layout.lines_iterator(),
                               key=lambda text_line: text_line.id)
        annotations = []
        for text_line in ordered_lines:
            if not text_line.transcription:
                continue
            crop_name = f'{file_id}-{text_line.id}.jpg'
            annotations.append(crop_name + " " + text_line.transcription)
    except KeyboardInterrupt:
        traceback.print_exc()
        print('Terminated by user.')
        sys.exit()
    except Exception as e:
        print(f'ERROR: Failed to process file {file_id}.')
        print(e)
        traceback.print_exc()

    finished_count = index + 1
    print(
        "DONE {current}/{total} ({percentage:.2f} %) [id: {file_id}] Time:{time:.2f}"
        .format(current=finished_count,
                total=ids_count,
                percentage=finished_count / ids_count * 100,
                file_id=file_id,
                time=time.time() - started))
    return annotations
def main():
    """Run the page parser over every page of the configured input folder.

    Paths are read from the ``PARSE_FOLDER`` section of the configuration
    file; paths given on the command line replace the configured ones. Pages
    are enumerated from the image directory, or from the Page XML directory
    when no image directory is set. Every page is loaded, processed and
    written to each configured output. A failure of a single page is printed
    and processing goes on with the next page.

    Raises:
        Exception: when neither an image nor a Page XML input path is set.
    """
    args = parse_arguments()
    config_path = args.config
    skip_already_processed_files = args.skip_processed

    # suppress tensorflow warnings on loading models
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if args.set_gpu:
        utils.setGPU()

    config = configparser.ConfigParser()
    config.read(config_path)

    # Command-line values take precedence over the configuration file.
    command_line_paths = (
        ('INPUT_IMAGE_PATH', args.input_image_path),
        ('INPUT_XML_PATH', args.input_xml_path),
        ('OUTPUT_XML_PATH', args.output_xml_path),
        ('OUTPUT_RENDER_PATH', args.output_render_path),
        ('OUTPUT_LINE_PATH', args.output_line_path),
        ('OUTPUT_LOGIT_PATH', args.output_logit_path),
        ('OUTPUT_ALTO_PATH', args.output_alto_path),
    )
    for option, value in command_line_paths:
        if value is not None:
            config['PARSE_FOLDER'][option] = value

    page_parser = PageParser(config, config_path=os.path.dirname(config_path))

    def configured_path(option):
        return get_value_or_none(config, 'PARSE_FOLDER', option)

    input_image_path = configured_path('INPUT_IMAGE_PATH')
    input_xml_path = configured_path('INPUT_XML_PATH')
    input_logit_path = configured_path('INPUT_LOGIT_PATH')
    output_render_path = configured_path('OUTPUT_RENDER_PATH')
    output_line_path = configured_path('OUTPUT_LINE_PATH')
    output_xml_path = configured_path('OUTPUT_XML_PATH')
    output_logit_path = configured_path('OUTPUT_LOGIT_PATH')
    output_alto_path = configured_path('OUTPUT_ALTO_PATH')

    # A line output path containing "lmdb" selects the database writer.
    lmdb_writer = None
    if output_line_path is not None and 'lmdb' in output_line_path:
        lmdb_writer = LMDB_writer(output_line_path)

    for output_dir in (output_render_path, output_line_path, output_xml_path,
                       output_logit_path, output_alto_path):
        if output_dir is not None:
            create_dir_if_not_exists(output_dir)

    if input_logit_path is not None and input_xml_path is None:
        input_logit_path = None
        print(
            'Warning: Logit path specified and Page XML path not specified. Logits will be ignored.'
        )

    if input_image_path is not None:
        print(f'Reading images from {input_image_path}.')
        images_to_process = []
        for entry in os.listdir(input_image_path):
            extension = os.path.splitext(entry)[1].lower()
            if extension in ['.jpg', '.jpeg', '.png', '.tif']:
                images_to_process.append(entry)
        ids_to_process = [
            os.path.splitext(os.path.basename(name))[0]
            for name in images_to_process
        ]
    elif input_xml_path is not None:
        print(f'Reading page xml from {input_xml_path}')
        xml_to_process = []
        for entry in os.listdir(input_xml_path):
            if os.path.splitext(entry)[1] == '.xml':
                xml_to_process.append(entry)
        images_to_process = [None] * len(xml_to_process)
        ids_to_process = [
            os.path.splitext(os.path.basename(name))[0]
            for name in xml_to_process
        ]
    else:
        raise Exception(
            f'Either INPUT_IMAGE_PATH or INPUT_XML_PATH has to be specified. Both are missing in {config_path}.'
        )

    if skip_already_processed_files:
        # Only the Page XML, logit and render output directories are passed
        # to the lookup of already processed files.
        already_processed_files = load_already_processed_files(
            [output_xml_path, output_logit_path, output_render_path])
        if len(already_processed_files) > 0:
            print(f"Already processed {len(already_processed_files)} file(s).")
            pending = [
                (page_id, image_name)
                for page_id, image_name in zip(ids_to_process,
                                               images_to_process)
                if page_id not in already_processed_files
            ]
            images_to_process = [image_name for _, image_name in pending]
            ids_to_process = [page_id for page_id, _ in pending]

    total = len(ids_to_process)
    for index, (file_id, image_file_name) in enumerate(
            zip(ids_to_process, images_to_process)):
        print("Processing {file_id}".format(file_id=file_id))
        started = time.time()

        try:
            image = None
            if input_image_path is not None:
                image_path = os.path.join(input_image_path, image_file_name)
                image = cv2.imread(image_path, 1)
                if image is None:
                    raise Exception(f'Unable to read image "{image_path}"')

            if input_xml_path:
                page_layout = PageLayout(
                    file=os.path.join(input_xml_path, file_id + '.xml'))
            else:
                height, width = image.shape[0], image.shape[1]
                page_layout = PageLayout(id=file_id,
                                         page_size=(height, width))

            if input_logit_path is not None:
                page_layout.load_logits(
                    os.path.join(input_logit_path, file_id + '.logits'))

            page_layout = page_parser.process_page(image, page_layout)

            if output_xml_path is not None:
                page_layout.to_pagexml(
                    os.path.join(output_xml_path, file_id + '.xml'))

            if output_render_path is not None:
                page_layout.render_to_image(image)
                cv2.imwrite(os.path.join(output_render_path, file_id + '.jpg'),
                            image, [int(cv2.IMWRITE_JPEG_QUALITY), 70])

            if output_logit_path is not None:
                page_layout.save_logits(
                    os.path.join(output_logit_path, file_id + '.logits'))

            if output_alto_path is not None:
                page_layout.to_altoxml(
                    os.path.join(output_alto_path, file_id + '.xml'))

            if output_line_path is not None:
                if lmdb_writer:
                    lmdb_writer(page_layout, file_id)
                else:
                    for region in page_layout.regions:
                        for text_line in region.lines:
                            crop_path = os.path.join(
                                output_line_path,
                                f'{file_id}-{text_line.id}.jpg')
                            cv2.imwrite(crop_path,
                                        text_line.crop.astype(np.uint8),
                                        [int(cv2.IMWRITE_JPEG_QUALITY), 98])
        except KeyboardInterrupt:
            traceback.print_exc()
            print('Terminated by user.')
            sys.exit()
        except Exception as e:
            print(f'ERROR: Failed to process file {file_id}.')
            print(e)
            traceback.print_exc()

        finished_count = index + 1
        print(
            "DONE {current}/{total} ({percentage:.2f} %) [id: {file_id}] Time:{time:.2f}"
            .format(current=finished_count,
                    total=total,
                    percentage=finished_count / total * 100,
                    file_id=file_id,
                    time=time.time() - started))