Beispiel #1
0
def main():
    """Merge page layouts loaded from several directories and save the result.

    The pages to process are the ``.xml`` files (case-insensitive extension)
    found in the first entry of ``args.input_paths``.  For every page, the
    Page XML and the ``.logits`` file of the same base name are loaded from
    each input directory.  The loaded layouts are passed to ``merge_layouts``
    and the first loaded layout is then written, together with its logits, to
    ``args.output_path``.

    Optional post-processing before saving:

    * ``args.min_confidence > 0``: keep only lines whose
      ``transcription_confidence`` is set and strictly above the threshold.
    * ``args.fix_arabic_order``: lines recognised by ``ArabicHelper`` as
      Arabic are converted with ``label_form_to_string``.

    A page that cannot be loaded from a given directory is reported and that
    directory is skipped for the page; a page that cannot be loaded from any
    directory is skipped entirely.
    """
    args = parse_arguments()

    create_dir_if_not_exists(args.output_path)

    input_paths = args.input_paths

    files_to_process = [
        f for f in os.listdir(input_paths[0])
        if os.path.splitext(f)[1].lower() == '.xml'
    ]

    print('input_paths', input_paths)

    arabic_helper = ArabicHelper()

    for xml_file_name in files_to_process:
        print(xml_file_name)
        logits_file_name = os.path.splitext(xml_file_name)[0] + '.logits'

        input_layouts = []
        for input_path in input_paths:
            try:
                page_layout = PageLayout(
                    file=os.path.join(input_path, xml_file_name))
                page_layout.load_logits(
                    os.path.join(input_path, logits_file_name))
                input_layouts.append(page_layout)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception as e:
                print(
                    f'ERROR: Failed to load Page XML or .logit file "{xml_file_name}" from "{input_path}".'
                )
                print(e)
                traceback.print_exc()

        # Without this guard a page that failed to load everywhere would
        # crash the whole batch on ``input_layouts[0]`` below.
        if not input_layouts:
            print(
                f'ERROR: No layout could be loaded for "{xml_file_name}", skipping.'
            )
            continue

        # NOTE(review): the result is taken from the first layout, so
        # merge_layouts presumably merges into it in place - confirm.
        merge_layouts(input_layouts)
        merged_layout = input_layouts[0]

        if args.min_confidence > 0:
            for region in merged_layout.regions:
                region.lines = [
                    line for line in region.lines
                    if line.transcription_confidence
                    and line.transcription_confidence > args.min_confidence
                ]

        if args.fix_arabic_order:
            for line in merged_layout.lines_iterator():
                if arabic_helper.is_arabic_line(line.transcription):
                    line.transcription = arabic_helper.label_form_to_string(
                        line.transcription)

        merged_layout.to_pagexml(os.path.join(args.output_path, xml_file_name))
        merged_layout.save_logits(
            os.path.join(args.output_path, logits_file_name))
Beispiel #2
0
 def process_page(self, img: np.ndarray, page_layout: PageLayout):
     """Replace the regions of ``page_layout`` with ones computed from ``img``.

     Every polygon returned by ``SimpleThresholdRegion._compute_layout``
     becomes a ``RegionLayout`` with the id ``r-<index>``.  The (mutated)
     ``page_layout`` is returned.
     """
     detected_regions = []
     for idx, polygon in enumerate(SimpleThresholdRegion._compute_layout(img)):
         # ``[:, ::-1]`` reverses the column order of each polygon row
         # (presumably swapping the two point coordinates - confirm).
         detected_regions.append(RegionLayout(f'r-{idx}', polygon[:, ::-1]))

     page_layout.regions = detected_regions
     return page_layout
Beispiel #3
0
def read_page_xml(path):
    """Load a Page XML file, returning ``None`` instead of raising on failure.

    Args:
        path: Location of the Page XML file, passed to ``PageLayout(file=...)``.

    Returns:
        The loaded ``PageLayout``, or ``None`` (after printing a warning) when
        constructing it raised an exception.
    """
    try:
        return PageLayout(file=path)
    except Exception:
        # Best-effort loading, but only for ordinary errors: a bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit and make the caller
        # impossible to stop cleanly.
        print(f'Warning: unable to load page xml "{path}"')
        return None
Beispiel #4
0
    def process_page(self, image, page_layout: PageLayout):
        """Reorder the regions of ``page_layout`` and return the layout.

        Layouts with fewer than two regions are returned untouched.  Otherwise
        the layout is rotated by the negative of the rotation obtained from the
        lines of the region with the most lines, the regions are ordered via
        ``CoupledRegions.divide_and_order`` and the layout is rotated back.
        ``image`` is not used.
        """
        if len(page_layout.regions) < 2:
            return page_layout

        # The region with the most lines supplies the lines used for the
        # rotation estimate.
        richest_region = max(page_layout.regions,
                             key=lambda reg: len(reg.lines))
        rotation = SmartRegionSorter.get_rotation(richest_region.lines)
        page_layout = SmartRegionSorter.rotate_page_layout(
            page_layout, -rotation)

        wrapped_regions = [Region(region) for region in page_layout.regions]
        coupled = CoupledRegions(wrapped_regions,
                                 intersect_param=self.intersect_param)
        coupled.divide_and_order()

        # Map every ordered id back to the first layout region carrying it
        # and collect the regions in that order.
        reordered = []
        for region_id in coupled.get_ordered_ids():
            position = next(idx
                            for idx, region in enumerate(page_layout.regions)
                            if region.id == region_id)
            reordered.append(page_layout.regions[position])
        page_layout.regions = reordered

        # Undo the rotation applied before sorting.
        return SmartRegionSorter.rotate_page_layout(page_layout, rotation)
Beispiel #5
0
    def process_page(self, image, page_layout: PageLayout):
        """Reorder ``page_layout.regions`` using ``NaiveRegionSorter.sort_regions``.

        The sorter receives every region wrapped in ``Region`` plus a
        tolerance ``eps`` and returns the indices of the regions in their new
        order.  The (mutated) ``page_layout`` is returned.
        """
        wrapped_regions = [Region(region) for region in page_layout.regions]

        # Tolerance scales with the second image dimension (presumably the
        # image width - confirm) divided by ``width_denom``.
        eps = image.shape[1] // self.width_denom
        order = NaiveRegionSorter.sort_regions(wrapped_regions, eps)

        current_regions = page_layout.regions
        page_layout.regions = [current_regions[idx] for idx in order]
        return page_layout
Beispiel #6
0
 def __call__(self, page_layout: PageLayout, file_id):
     """Store the JPEG crops of all transcribed lines of a page.

     Lines are visited in order of their ``id``.  For every line with a
     non-empty transcription the crop is JPEG-encoded (quality 95), an entry
     ``<key> <transcription>`` is printed to ``self.file``, ``self.data_size``
     grows by the encoded size and the bytes are put under the key
     ``<file_id>-<line id>.jpg`` through a cursor of one write transaction on
     ``self.env_out``.
     """
     with self.env_out.begin(write=True) as txn_out:
         cursor = txn_out.cursor()
         ordered_lines = sorted(page_layout.lines_iterator(),
                                key=lambda x: x.id)
         for line in ordered_lines:
             # Lines without a transcription are not exported.
             if not line.transcription:
                 continue

             key = f'{file_id}-{line.id}.jpg'
             encode_result = cv2.imencode(
                 '.jpg', line.crop.astype(np.uint8),
                 [int(cv2.IMWRITE_JPEG_QUALITY), 95])
             jpeg_bytes = encode_result[1].tobytes()

             print(key, line.transcription, file=self.file)
             self.data_size += len(jpeg_bytes)
             cursor.put(key.encode(), jpeg_bytes)
0
    def __call__(self, page_layout: PageLayout, file_id):
        """Store the JPEG crops of all transcribed lines of a page.

        All crops are encoded first (quality 95, lines visited in order of
        their ``id``); only afterwards is a write transaction opened on
        ``self.env_out`` and every record put under the key
        ``<file_id>-<line id>.jpg``.
        """
        records_to_write = {}
        for line in sorted(page_layout.lines_iterator(), key=lambda x: x.id):
            # Lines without a transcription are not exported.
            if not line.transcription:
                continue
            encode_result = cv2.imencode(
                '.jpg', line.crop.astype(np.uint8),
                [int(cv2.IMWRITE_JPEG_QUALITY), 95])
            records_to_write[f'{file_id}-{line.id}.jpg'] = \
                encode_result[1].tobytes()

        with self.env_out.begin(write=True) as txn_out:
            cursor = txn_out.cursor()
            for key, jpeg_bytes in records_to_write.items():
                cursor.put(key.encode(), jpeg_bytes)
Beispiel #8
0
def _report_failed_processing(session, config, page_id, engine_version,
                              failure_type, extra_headers=None):
    """POST the traceback of the exception being handled to the server.

    Must be called from inside an ``except`` block, because the request body
    is ``traceback.format_exc()`` of the active exception (UTF-8 encoded).

    Args:
        session: ``requests.Session`` used for the POST.
        config: Parsed configuration providing the server URLs and API key.
        page_id: Id of the page whose processing failed.
        engine_version: Sent in the ``engine-version`` header.
        failure_type: Sent in the ``type`` header (e.g. ``'NOT_FOUND'``).
        extra_headers: Optional additional headers merged into the request.
    """
    failure_headers = {
        'api-key': config['SETTINGS']['api_key'],
        'type': failure_type,
        'engine-version': engine_version
    }
    if extra_headers:
        failure_headers.update(extra_headers)
    session.post(join_url(config['SERVER']['base_url'],
                          config['SERVER']['post_failed_processing'],
                          page_id),
                 data=traceback.format_exc().encode('utf-8'),
                 headers=failure_headers)


def main():
    """Poll the server for processing requests and upload OCR results.

    Configuration is read from ``args.config`` (default ``config.ini``) and
    may be overridden from the command line.  The loop then repeatedly:

    1. asks the server for a processing request,
    2. downloads and decodes the page image,
    3. runs the page parser (switching engines when the request names a
       different engine than the currently preferred one),
    4. uploads ALTO XML, Page XML and plain text, or writes them to
       ``args.test_path`` when ``args.test_mode`` is set.

    A failure in steps 2-3 is reported to the server with the traceback and
    the loop moves on.  When no request is available the loop sleeps for ten
    seconds, or ends if ``args.exit_on_done`` is set.  The loop also ends once
    ``args.time_limit`` hours have elapsed (a limit of 0 or less disables the
    check).
    """
    args = get_args()

    start_time = time.time()

    config = configparser.ConfigParser()
    if args.config is not None:
        config.read(args.config)
    else:
        config.read('config.ini')

    Path(config['SETTINGS']['engines_path']).mkdir(parents=True, exist_ok=True)

    if args.api_key is not None:
        config["SETTINGS"]['api_key'] = args.api_key

    # NOTE(review): the guard reads ``args.engine`` but the value comes from
    # ``args.preferred_engine``; one of the two names is probably wrong -
    # confirm against get_args().
    if args.engine is not None:
        config["SETTINGS"]['preferred_engine'] = args.preferred_engine

    arabic_helper = ArabicHelper()
    with requests.Session() as session:
        # Authentication headers for polling and engine requests.  They are
        # never rebound below, so failure/upload-specific headers ('type',
        # 'score', ...) cannot leak into later requests.
        headers = {'api-key': config['SETTINGS']['api_key']}
        page_parser, engine_name, engine_version = get_engine(
            config, headers, config["SETTINGS"]['preferred_engine'])

        while True:
            if (args.time_limit > 0
                    and args.time_limit * 3600 < time.time() - start_time):
                break

            try:
                r = session.get(join_url(
                    config['SERVER']['base_url'],
                    config['SERVER']['get_processing_request'],
                    config['SETTINGS']['preferred_engine']),
                                headers=headers)
            except requests.exceptions.ConnectionError:
                status = 'failed'
            else:
                if r.status_code == 200:
                    request = r.json()
                    status = request['status']
                else:
                    status = 'failed'

            if status != 'success':
                # Nothing to process (or the server is unreachable).
                if args.exit_on_done:
                    break
                time.sleep(10)
                continue

            page_id = request['page_id']
            page_url = request['page_url']
            engine_id = request['engine_id']
            if engine_id != int(config['SETTINGS']['preferred_engine']):
                page_parser, engine_name, engine_version = get_engine(
                    config, headers, engine_id)
                config['SETTINGS']['preferred_engine'] = str(engine_id)

            # Download image from url.
            try:
                req = Request(page_url)
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
                )
                req.add_header(
                    'Accept',
                    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
                )
                # Only send the API key to URLs containing the server base URL.
                if config['SERVER']['base_url'] in page_url:
                    req.add_header('api-key', config['SETTINGS']['api_key'])
                page = urlopen(req).read()
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(session, config, page_id,
                                          engine_version, 'NOT_FOUND')
                continue

            # Decode image
            try:
                encoded_img = np.frombuffer(page, dtype=np.uint8)
                image = cv2.imdecode(encoded_img, flags=cv2.IMREAD_ANYCOLOR)
                # An image decoded with two dimensions (single channel) is
                # stacked into three identical channels.
                if len(image.shape) == 2:
                    image = np.stack([image, image, image], axis=2)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(session, config, page_id,
                                          engine_version, 'INVALID_FILE')
                continue

            # Process image
            try:
                page_layout = PageLayout(id=page_id,
                                         page_size=(image.shape[0],
                                                    image.shape[1]))
                page_layout = page_parser.process_page(image, page_layout)
            except KeyboardInterrupt:
                traceback.print_exc()
                print('Terminated by user.')
                sys.exit()
            except Exception:
                _report_failed_processing(
                    session, config, page_id, engine_version,
                    'PROCESSING_FAILED',
                    extra_headers={
                        'hostname': socket.gethostname(),
                        'ip-address':
                        socket.gethostbyname(socket.gethostname())
                    })
                continue

            ocr_processing = create_ocr_processing_element(
                id="IdOcr",
                software_creator_str="Project PERO",
                software_name_str="{}".format(engine_name),
                software_version_str="{}".format(engine_version),
                processing_datetime=None)

            # The ALTO export is produced before the low-confidence lines are
            # removed from the layout and receives the threshold itself.
            alto_xml = page_layout.to_altoxml_string(
                ocr_processing=ocr_processing,
                min_line_confidence=args.min_confidence)

            if args.min_confidence > 0:
                for region in page_layout.regions:
                    region.lines = [
                        line for line in region.lines
                        if line.transcription_confidence and
                        line.transcription_confidence > args.min_confidence
                    ]

            for line in page_layout.lines_iterator():
                if arabic_helper.is_arabic_line(line.transcription):
                    line.transcription = arabic_helper.label_form_to_string(
                        line.transcription)
            page_xml = page_layout.to_pagexml_string()
            text = get_page_layout_text(page_layout)

            if args.test_mode:
                outputs = (
                    ('{}_alto.xml'.format(page_id), alto_xml),
                    ('{}_page.xml'.format(page_id), page_xml),
                    ('{}.txt'.format(page_id), text),
                )
                for file_name, content in outputs:
                    # Transcriptions may contain non-ASCII text (e.g. Arabic),
                    # so do not depend on the locale's default encoding.
                    with open(os.path.join(args.test_path, file_name),
                              "w",
                              encoding='utf-8') as file:
                        file.write(content)
            else:
                upload_headers = {
                    'api-key': config['SETTINGS']['api_key'],
                    'engine-version': engine_version,
                    'score': str(get_score(page_layout))
                }
                session.post(
                    join_url(config['SERVER']['base_url'],
                             config['SERVER']['post_upload_results'],
                             page_id),
                    files={
                        'alto': ('{}_alto.xml'.format(page_id), alto_xml,
                                 'text/plain'),
                        'page': ('{}_page.xml'.format(page_id), page_xml,
                                 'text/plain'),
                        'txt': ('{}.txt'.format(page_id), text, 'text/plain')
                    },
                    headers=upload_headers)
Beispiel #9
0
    def __call__(self, image_file_name, file_id, index, ids_count):
        """Process one page and return the annotations of its transcribed lines.

        Loads the image and/or Page XML (plus logits) configured on the
        instance, runs ``self.page_parser.process_page`` and writes every
        output whose path is configured: Page XML, rendered image, logits,
        ALTO XML and line crops (through ``LMDB_writer`` when the line path
        contains ``'lmdb'``, otherwise as individual JPEG files).

        Any exception other than ``KeyboardInterrupt`` is printed and the
        annotations collected so far are returned; ``KeyboardInterrupt`` ends
        the process.

        Args:
            image_file_name: Image file name inside ``self.input_image_path``.
            file_id: Base name used for the input XML/logits and all outputs.
            index: Zero-based position of this file, used for progress output.
            ids_count: Total number of files, used for progress output.

        Returns:
            A list of ``"<file_id>-<line id>.jpg <transcription>"`` strings,
            one per line with a non-empty transcription, ordered by line id.
        """
        print(f"Processing {file_id}")
        t1 = time.time()
        annotations = []
        try:
            image = None
            if self.input_image_path is not None:
                image_path = os.path.join(self.input_image_path,
                                          image_file_name)
                image = cv2.imread(image_path, 1)
                if image is None:
                    raise Exception(f'Unable to read image "{image_path}"')

            if self.input_xml_path:
                xml_path = os.path.join(self.input_xml_path, file_id + '.xml')
                page_layout = PageLayout(file=xml_path)
            else:
                # No XML input: start from an empty layout sized by the image.
                page_layout = PageLayout(id=file_id,
                                         page_size=(image.shape[0],
                                                    image.shape[1]))

            if self.input_logit_path is not None:
                page_layout.load_logits(
                    os.path.join(self.input_logit_path, file_id + '.logits'))

            page_layout = self.page_parser.process_page(image, page_layout)

            if self.output_xml_path is not None:
                page_layout.to_pagexml(
                    os.path.join(self.output_xml_path, file_id + '.xml'))

            if self.output_render_path is not None:
                page_layout.render_to_image(image)
                render_path = os.path.join(self.output_render_path,
                                           file_id + '.jpg')
                cv2.imwrite(render_path, image,
                            [int(cv2.IMWRITE_JPEG_QUALITY), 70])

            if self.output_logit_path is not None:
                page_layout.save_logits(
                    os.path.join(self.output_logit_path, file_id + '.logits'))

            if self.output_alto_path is not None:
                page_layout.to_altoxml(
                    os.path.join(self.output_alto_path, file_id + '.xml'))

            if self.output_line_path is not None and page_layout is not None:
                if 'lmdb' in self.output_line_path:
                    LMDB_writer(self.output_line_path)(page_layout, file_id)
                else:
                    for region in page_layout.regions:
                        for line in region.lines:
                            crop_path = os.path.join(
                                self.output_line_path,
                                f'{file_id}-{line.id}.jpg')
                            cv2.imwrite(crop_path,
                                        line.crop.astype(np.uint8),
                                        [int(cv2.IMWRITE_JPEG_QUALITY), 98])

            # Appending one by one keeps the entries gathered so far if a
            # later line raises.
            for line in sorted(page_layout.lines_iterator(),
                               key=lambda x: x.id):
                if not line.transcription:
                    continue
                key = f'{file_id}-{line.id}.jpg'
                annotations.append(key + " " + line.transcription)

        except KeyboardInterrupt:
            traceback.print_exc()
            print('Terminated by user.')
            sys.exit()
        except Exception as e:
            print(f'ERROR: Failed to process file {file_id}.')
            print(e)
            traceback.print_exc()

        done_count = index + 1
        print(
            "DONE {current}/{total} ({percentage:.2f} %) [id: {file_id}] Time:{time:.2f}"
            .format(current=done_count,
                    total=ids_count,
                    percentage=done_count / ids_count * 100,
                    file_id=file_id,
                    time=time.time() - t1))

        return annotations
Beispiel #10
0
def main():
    """Run the page parser over a folder of images and/or Page XML files.

    Paths are read from the ``PARSE_FOLDER`` section of the config file;
    every path given on the command line overrides the config value.  The
    files to process are the images in ``INPUT_IMAGE_PATH`` or, when that is
    not set, the ``.xml`` files in ``INPUT_XML_PATH``.  With
    ``args.skip_processed`` set, ids reported by
    ``load_already_processed_files`` are left out.

    Each file is parsed and written to every configured output (Page XML,
    rendered image, logits, ALTO XML, line crops).  A failure on one file is
    printed and processing continues with the next one.

    Raises:
        Exception: If neither ``INPUT_IMAGE_PATH`` nor ``INPUT_XML_PATH`` is
            specified.
    """
    # initialize some parameters
    args = parse_arguments()
    config_path = args.config
    skip_already_processed_files = args.skip_processed

    # suppress tensorflow warnings on loading models
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    if args.set_gpu:
        utils.setGPU()

    config = configparser.ConfigParser()
    config.read(config_path)

    # Command-line values take precedence over the config file.
    cli_overrides = (
        ('INPUT_IMAGE_PATH', args.input_image_path),
        ('INPUT_XML_PATH', args.input_xml_path),
        ('OUTPUT_XML_PATH', args.output_xml_path),
        ('OUTPUT_RENDER_PATH', args.output_render_path),
        ('OUTPUT_LINE_PATH', args.output_line_path),
        ('OUTPUT_LOGIT_PATH', args.output_logit_path),
        ('OUTPUT_ALTO_PATH', args.output_alto_path),
    )
    for option, cli_value in cli_overrides:
        if cli_value is not None:
            config['PARSE_FOLDER'][option] = cli_value

    page_parser = PageParser(config, config_path=os.path.dirname(config_path))

    def folder_setting(option):
        return get_value_or_none(config, 'PARSE_FOLDER', option)

    input_image_path = folder_setting('INPUT_IMAGE_PATH')
    input_xml_path = folder_setting('INPUT_XML_PATH')
    input_logit_path = folder_setting('INPUT_LOGIT_PATH')

    output_render_path = folder_setting('OUTPUT_RENDER_PATH')
    output_line_path = folder_setting('OUTPUT_LINE_PATH')
    output_xml_path = folder_setting('OUTPUT_XML_PATH')
    output_logit_path = folder_setting('OUTPUT_LOGIT_PATH')
    output_alto_path = folder_setting('OUTPUT_ALTO_PATH')

    # Line crops go through an LMDB writer when the path contains 'lmdb'.
    lmdb_writer = None
    if output_line_path is not None and 'lmdb' in output_line_path:
        lmdb_writer = LMDB_writer(output_line_path)

    output_dirs = (output_render_path, output_line_path, output_xml_path,
                   output_logit_path, output_alto_path)
    for output_dir in output_dirs:
        if output_dir is not None:
            create_dir_if_not_exists(output_dir)

    if input_logit_path is not None and input_xml_path is None:
        input_logit_path = None
        print(
            'Warning: Logit path specified and Page XML path not specified. Logits will be ignored.'
        )

    if input_image_path is not None:
        print(f'Reading images from {input_image_path}.')
        image_extensions = ['.jpg', '.jpeg', '.png', '.tif']
        images_to_process = []
        for f in os.listdir(input_image_path):
            if os.path.splitext(f)[1].lower() in image_extensions:
                images_to_process.append(f)
        source_files = images_to_process
    elif input_xml_path is not None:
        print(f'Reading page xml from {input_xml_path}')
        source_files = []
        for f in os.listdir(input_xml_path):
            if os.path.splitext(f)[1] == '.xml':
                source_files.append(f)
        # No images in this mode: one placeholder per page keeps ids and
        # images aligned for the loop below.
        images_to_process = [None] * len(source_files)
    else:
        raise Exception(
            f'Either INPUT_IMAGE_PATH or INPUT_XML_PATH has to be specified. Both are missing in {config_path}.'
        )

    ids_to_process = [
        os.path.splitext(os.path.basename(file))[0] for file in source_files
    ]

    if skip_already_processed_files:
        # Files already processed are skipped. File is considered as already processed when file with appropriate
        # extension is found in all required output directories. If any of the output paths is set to 'None'
        # (i.e. the output is not required) than this directory is omitted.
        already_processed_files = load_already_processed_files(
            [output_xml_path, output_logit_path, output_render_path])
        if len(already_processed_files) > 0:
            print(f"Already processed {len(already_processed_files)} file(s).")

            remaining = [
                (file_id, image)
                for file_id, image in zip(ids_to_process, images_to_process)
                if file_id not in already_processed_files
            ]
            ids_to_process = [file_id for file_id, _ in remaining]
            images_to_process = [image for _, image in remaining]

    for index, (file_id, image_file_name) in enumerate(
            zip(ids_to_process, images_to_process)):
        print("Processing {file_id}".format(file_id=file_id))
        t1 = time.time()
        try:
            image = None
            if input_image_path is not None:
                image_path = os.path.join(input_image_path, image_file_name)
                image = cv2.imread(image_path, 1)
                if image is None:
                    raise Exception(f'Unable to read image "{image_path}"')

            if input_xml_path:
                page_layout = PageLayout(
                    file=os.path.join(input_xml_path, file_id + '.xml'))
            else:
                # No XML input: start from an empty layout sized by the image.
                page_layout = PageLayout(id=file_id,
                                         page_size=(image.shape[0],
                                                    image.shape[1]))

            if input_logit_path is not None:
                page_layout.load_logits(
                    os.path.join(input_logit_path, file_id + '.logits'))

            page_layout = page_parser.process_page(image, page_layout)

            if output_xml_path is not None:
                page_layout.to_pagexml(
                    os.path.join(output_xml_path, file_id + '.xml'))

            if output_render_path is not None:
                page_layout.render_to_image(image)
                render_path = os.path.join(output_render_path,
                                           file_id + '.jpg')
                cv2.imwrite(render_path, image,
                            [int(cv2.IMWRITE_JPEG_QUALITY), 70])

            if output_logit_path is not None:
                page_layout.save_logits(
                    os.path.join(output_logit_path, file_id + '.logits'))

            if output_alto_path is not None:
                page_layout.to_altoxml(
                    os.path.join(output_alto_path, file_id + '.xml'))

            if output_line_path is not None:
                if lmdb_writer:
                    lmdb_writer(page_layout, file_id)
                else:
                    for region in page_layout.regions:
                        for line in region.lines:
                            crop_path = os.path.join(
                                output_line_path,
                                f'{file_id}-{line.id}.jpg')
                            cv2.imwrite(crop_path,
                                        line.crop.astype(np.uint8),
                                        [int(cv2.IMWRITE_JPEG_QUALITY), 98])

        except KeyboardInterrupt:
            traceback.print_exc()
            print('Terminated by user.')
            sys.exit()
        except Exception as e:
            print(f'ERROR: Failed to process file {file_id}.')
            print(e)
            traceback.print_exc()

        done_count = index + 1
        total_count = len(ids_to_process)
        print(
            "DONE {current}/{total} ({percentage:.2f} %) [id: {file_id}] Time:{time:.2f}"
            .format(current=done_count,
                    total=total_count,
                    percentage=done_count / total_count * 100,
                    file_id=file_id,
                    time=time.time() - t1))