Example #1
    def test_should_pass_through_return_value_if_no_exception_was_raised(self):
        def fn(x):
            return x.upper()

        with TestPipeline() as p:
            result = (p | beam.Create([SOME_VALUE_1]) | MapOrLog(fn))
            assert_that(result, equal_to([fn(SOME_VALUE_1)]))
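All of the examples on this page use MapOrLog, a Beam PTransform from the sciencebeam project that applies a function to each element and, instead of failing the pipeline, logs and drops any element whose function raises, optionally incrementing an error counter. The transform's source is not shown here; the following minimal sketch only illustrates the idea (the FlatMap structure and the Metrics usage are assumptions, not the project's actual implementation):

import logging

import apache_beam as beam
from apache_beam.metrics import Metrics


class MapOrLog(beam.PTransform):
    # minimal sketch: apply fn to each element, log and drop elements that raise
    def __init__(self, fn, log_fn=None, error_count=None):
        super().__init__()
        self._fn = fn
        self._log_fn = log_fn or (lambda e, v: logging.warning(
            'caught exception (ignoring item): %s', e))
        self._error_counter = (
            Metrics.counter('MapOrLog', error_count) if error_count else None)

    def expand(self, pcoll):
        def map_or_log(value):
            try:
                yield self._fn(value)
            except Exception as e:  # pylint: disable=broad-except
                self._log_fn(e, value)
                if self._error_counter is not None:
                    self._error_counter.inc()

        return pcoll | beam.FlatMap(map_or_log)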
Example #2
    def test_should_increase_error_metric_counter_if_exception_was_raised(
            self):
        with TestPipeline() as p:
            _ = (  # noqa: F841
                p | beam.Create([SOME_VALUE_1]) | MapOrLog(
                    FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME))
            assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) == 1
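get_counter_value is a test helper whose source is not shown on this page. A plausible reconstruction using Beam's standard metrics query API (the helper's name and behaviour come from the test above; the implementation is a sketch):

from apache_beam.metrics.metric import MetricsFilter


def get_counter_value(pipeline_result, metric_name):
    # query the named counter from the finished pipeline's metrics
    counters = pipeline_result.metrics().query(
        MetricsFilter().with_name(metric_name))['counters']
    return counters[0].committed if counters else None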
Example #3
def add_read_source_to_extracted_xml_pipeline_steps(p, opt,
                                                    get_pipeline_output_file):
    if opt.lxml_file_list:
        lxml_urls = p | ReadFileList(
            opt.lxml_file_list, column=opt.lxml_file_column, limit=opt.limit)

        annotated_lxml = (
            lxml_urls
            | PreventFusion()
            | "ReadLxmlContent" >> TransformAndCount(
                MapOrLog(
                    lambda url: {
                        DataProps.SOURCE_FILENAME: url,
                        DataProps.STRUCTURED_DOCUMENT: load_structured_document(url)
                    },
                    error_count=MetricCounters.READ_LXML_ERROR),
                MetricCounters.FILES))

        extract_tag_scope = None
    else:
        annotated_lxml, extract_tag_scope = add_read_pdfs_to_annotated_lxml_pipeline_steps(
            p, opt, get_pipeline_output_file)

    extracted_xml = (
        annotated_lxml
        | "ExtractToXml" >> MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    DataProps.EXTRACTED_XML:
                    extract_annotated_structured_document_to_xml(
                        v[DataProps.STRUCTURED_DOCUMENT],
                        tag_scope=extract_tag_scope)
                }),
                keys_to_remove={DataProps.STRUCTURED_DOCUMENT}),
            error_count=MetricCounters.EXTRACT_TO_XML_ERROR))

    if opt.use_grobid:
        enhancer = GrobidXmlEnhancer(opt.grobid_url,
                                     start_service=opt.start_grobid_service)
        extracted_xml = (extracted_xml | "GrobidEnhanceXml" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML:
                enhancer(v[DataProps.EXTRACTED_XML])
            }),
            error_count=MetricCounters.GROBID_ERROR))
    return extracted_xml
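In this pipeline, each MapOrLog stage gets its own error counter (READ_LXML_ERROR, EXTRACT_TO_XML_ERROR, GROBID_ERROR), so a document that fails at any stage is counted and dropped instead of aborting the whole job.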
Example #4
def get_step_transform(step):
    step_name = str(step)
    return step_name >> MapOrLog(
        execute_or_skip_step(step),
        log_fn=lambda e, v: (get_logger().warning(
            'caught exception (ignoring item): %s, source file: %s, step: %s',
            e,
            v[DataProps.SOURCE_FILENAME],
            step_name,
            exc_info=e)),
        error_count=get_step_error_counter(step))
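This example uses the log_fn hook: it receives the exception and the failing element, so the warning can include per-item context (here the source filename and step name) as well as the full traceback via exc_info.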
Example #5
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    grobid_transformer = grobid_service(opt.grobid_url,
                                        opt.grobid_action,
                                        start_service=opt.start_grobid_service)

    return (
        p
        | PdfUrlSource(opt)
        | PreventFusion()
        | ReadPdfContent()
        | "Grobid" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML:
                grobid_transformer(
                    (v[DataProps.SOURCE_FILENAME], v[DataProps.PDF_CONTENT]))[1]
            }),
            error_count=MetricCounters.GROBID_ERROR))
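Here the mapped function calls out to an external GROBID service; wrapping the call in MapOrLog means a failing request only increments MetricCounters.GROBID_ERROR and drops that document, rather than failing the whole read-and-convert pipeline.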
Example #6
    def test_should_skip_entries_that_cause_an_exception(self):
        with TestPipeline() as p:
            result = (p | beam.Create([SOME_VALUE_1])
                      | MapOrLog(FN_RAISING_EXCEPTION))
            assert_that(result, equal_to([]))
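Together with Example #1, this pins down the contract: elements whose function raises are removed from the output PCollection, so the result here is empty rather than an error.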
Example #7
def configure_pipeline(p, opt):
    image_size = ((opt.image_width, opt.image_height)
                  if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
    if opt.lxml_path:
        lxml_xml_file_pairs = (
            p | beam.Create(
                [[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.lxml_path, opt.xml_path]
                ]]) | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns), opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug') | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }))
    elif opt.pdf_path or opt.pdf_xml_file_list:
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list,
                                                      limit=opt.limit)
                | "TranslateFilePairUrls" >>
                beam.Map(lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (p | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.pdf_path, opt.xml_path]
            ]]) | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(
                        patterns), opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug'))
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }), MetricCounters.FILE_PAIR))

        lxml_xml_file_pairs = (
            pdf_xml_file_pairs | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'lxml_content': convert_pdf_bytes_to_lxml(
                            v['pdf_content'],
                            path=v['source_filename'],
                            page_range=page_range)
                    }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e,
                    v['source_filename'],
                    v['xml_filename'],
                    exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs)
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'pdf_png_pages':
                            list(
                                pdf_bytes_to_png_pages(v['pdf_content'],
                                                       dpi=opt.png_dpi,
                                                       image_size=image_size,
                                                       page_range=page_range))
                        }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            _ = (with_pdf_png_pages | "SavePdfToPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.png.zip')),
                    '.png', v['pdf_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.save_lxml:
        _ = (lxml_xml_file_pairs | "SaveLxml" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']),
                        None, '.lxml.gz')),
                v['lxml_content'])),
            log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    annotation_results = ((
        with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs
    ) | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
        MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'svg_pages':
                        list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']))
                    }),
                # Won't need the XML anymore
                {'lxml_content', 'xml_content'}),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, source: %s, xml: %s',
                e,
                v['source_filename'],
                v['xml_filename'],
                exc_info=e)),
            error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
        MetricCounters.PAGE,
        lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (annotation_results | "SaveSvgPages" >> TransformAndLog(
            beam.Map(lambda v: save_svg_roots(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']),
                        None, '.svg.zip')),
                v['svg_pages'])),
            log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        'annotation_evaluation':
                        evaluate_document_by_page(
                            SvgStructuredDocument(v['svg_pages']))
                    }),
                    None if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s',
                    x['source_filename'], x['annotation_evaluation'])))

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results)
            | "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size)
                            for svg_page in v['svg_pages']
                        ]
                    }), {'svg_pages'})))

        if opt.save_block_png:
            _ = (with_block_png_pages | "SaveBlockPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v['source_filename']),
                            None, '.block-png.zip')),
                    '.png', v['block_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                filtered_pages = (
                    with_block_png_pages | "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'],
                                opt.min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            _ = (filtered_pages | "WriteTFRecords" >> WritePropsToTFRecord(
                FileSystems.join(opt.output_path, 'data'),
                lambda v: ({
                    'input_uri': v['source_filename'] + '#page%d' % (first_page + i),
                    'input_image': pdf_png_page,
                    'annotation_uri': (
                        v['source_filename'] + '.annot' + '#page%d' % (first_page + i)),
                    'annotation_image': block_png_page,
                    'page_no': first_page + i
                } for i, (pdf_png_page, block_png_page) in enumerate(
                    zip(v['pdf_png_pages'], v['block_png_pages'])))))

    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        _ = (  # flake8: noqa
            annotation_evaluation_results | "FlattenAnotationEvaluationResults"
            >> beam.FlatMap(lambda v: to_annotation_evaluation_csv_dict_rows(
                v['annotation_evaluation'],
                document=basename(v['source_filename'])))
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path,
                                      annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
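A pattern that recurs throughout this pipeline is MapOrLog(lambda v: remove_keys_from_dict(extend_dict(v, {...}), ...)): each step adds its output to the element dict and strips inputs that are no longer needed, keeping the payload shuffled between Beam stages small. extend_dict and remove_keys_from_dict are helpers from the same project; their implementations are not shown here, but the call sites imply behaviour along these lines (a sketch, not the project's actual code):

def extend_dict(d, extension):
    # return a copy of d updated with the extension mapping
    result = dict(d)
    result.update(extension)
    return result


def remove_keys_from_dict(d, keys_to_remove=None):
    # return a copy of d without the given keys; None removes nothing
    if not keys_to_remove:
        return dict(d)
    return {k: v for k, v in d.items() if k not in keys_to_remove}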
Example #8
def add_read_pdfs_to_annotated_lxml_pipeline_steps(p, opt,
                                                   get_pipeline_output_file):
    page_range = opt.pages

    cv_enabled = opt.cv_model_export_dir

    extract_tag_scope = None

    pdf_urls = p | PdfUrlSource(opt)

    lxml_content = (pdf_urls | PreventFusion() | ReadPdfContent()
                    | "ConvertPdfToLxml" >> MapOrLog(
                        lambda v: extend_dict(
                            v, {
                                DataProps.STRUCTURED_DOCUMENT:
                                convert_pdf_bytes_to_structured_document(
                                    v[DataProps.PDF_CONTENT],
                                    path=v[DataProps.SOURCE_FILENAME],
                                    page_range=page_range)
                            }),
                        log_fn=lambda e, v: (get_logger().warning(
                            'caught exception (ignoring item): %s, pdf: %s',
                            e,
                            v[DataProps.SOURCE_FILENAME],
                            exc_info=e)),
                        error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))

    if cv_enabled:
        image_size = ((opt.image_width, opt.image_height)
                      if opt.image_width and opt.image_height else None)
        inference_model_wrapper = InferenceModelWrapper(
            opt.cv_model_export_dir)

        cv_predictions = (
            lxml_content | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v,
                        {
                            DataProps.PDF_PNG_PAGES:
                            list(
                                pdf_bytes_to_png_pages(
                                    v[DataProps.PDF_CONTENT],
                                    dpi=90,  # not used if the image is scaled
                                    image_size=image_size,
                                    page_range=page_range))
                        }),
                    keys_to_remove={DataProps.PDF_CONTENT}),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR)
            | "ComputerVisionPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(extend_dict(
                    v, {
                        DataProps.CV_PREDICTION_PNG_PAGES:
                        inference_model_wrapper(v[DataProps.PDF_PNG_PAGES]),
                        DataProps.COLOR_MAP:
                        inference_model_wrapper.get_color_map()
                    }),
                                                keys_to_remove=
                                                {DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.CV_PREDICTION_ERROR))

        if opt.save_cv_output:
            _ = (cv_predictions | "SaveComputerVisionOutput" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME], OutputExt.CV_PNG),
                    '.png',
                    [
                        image_data_to_png(image_data)
                        for image_data in v[DataProps.CV_PREDICTION_PNG_PAGES]
                    ])),
                log_fn=lambda x: get_logger().info('saved cv output: %s', x)))

        cv_annotated_lxml = (
            cv_predictions | "AnnotateLxmlUsingCvPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.STRUCTURED_DOCUMENT:
                        annotate_structured_document_using_predicted_image_data(
                            v[DataProps.STRUCTURED_DOCUMENT],
                            v[DataProps.CV_PREDICTION_PNG_PAGES],
                            v[DataProps.COLOR_MAP],
                            tag_scope=CV_TAG_SCOPE)
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = cv_annotated_lxml
        extract_tag_scope = CV_TAG_SCOPE

    if opt.crf_model:
        model = load_crf_model(opt.crf_model)
        crf_annotated_lxml = (
            lxml_content | "AnnotateLxmlUsingCrfPrediction" >> MapOrLog(
                lambda v: extend_dict(
                    v, {
                        DataProps.STRUCTURED_DOCUMENT:
                        predict_and_annotate_structured_document(
                            v[DataProps.STRUCTURED_DOCUMENT], model)
                    }),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = crf_annotated_lxml
        extract_tag_scope = CRF_TAG_SCOPE

    if opt.save_annot_lxml:
        _ = (  # flake8: noqa
            lxml_content | "SaveAnnotLxml" >> TransformAndLog(
                beam.Map(lambda v: save_structured_document(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME],
                        get_annot_lxml_ext(
                            crf_enabled=opt.crf_model, cv_enabled=cv_enabled)),
                    v[DataProps.STRUCTURED_DOCUMENT])),
                log_fn=lambda x: get_logger().info(
                    'saved annotated lxml to: %s', x)))
    return lxml_content, extract_tag_scope
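Note how lxml_content is re-bound after each optional annotation stage (computer vision, then CRF), so the final SaveAnnotLxml step and the returned value always refer to the most enriched PCollection, with extract_tag_scope recording which annotation source produced it.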