コード例 #1
0
 def test_should_not_call_get_cloud_project_if_cloud_is_false(
         self, get_cloud_project_mock):
     """get_cloud_project should not be consulted when cloud mode is off."""
     arg_values = extend_dict(DEFAULT_ARGS, {'cloud': False})
     args = argparse.Namespace(**arg_values)
     process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
     get_cloud_project_mock.assert_not_called()
コード例 #2
0
def read_examples(filenames,
                  shuffle,
                  num_epochs=None,
                  page_range=None,
                  channel_colors=None):
    """Build a one-shot iterator over parsed TFRecord examples.

    Reads GZIP-compressed TFRecord files, parses each record using the
    default feature map (extended with the page-number feature when
    `page_range` is given), optionally filters by page range and by
    annotation-image colors, optionally shuffles, and repeats for
    `num_epochs` epochs (indefinitely when falsy).
    """
    # num_epochs == 0 means "repeat forever", same as None
    if not num_epochs:
        num_epochs = None

    feature_map = (
        extend_dict(DEFAULT_FEATURE_MAP, PAGE_NO_FEATURE)
        if page_range is not None
        else DEFAULT_FEATURE_MAP
    )

    tracker = MapKeysTracker()

    ds = TFRecordDataset(filenames, compression_type='GZIP')
    ds = ds.map(tracker.wrap(partial(parse_example, feature_map=feature_map)))

    if page_range is not None:
        def _page_in_range(*item):
            return page_no_is_within(
                tracker.unwrap(item)['page_no'], page_range)
        ds = ds.filter(_page_in_range)
    if channel_colors is not None:
        def _contains_channel_color(*item):
            return image_contains_any_of_the_colors(
                tracker.unwrap(item)['annotation_image'], channel_colors)
        ds = ds.filter(_contains_channel_color)
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.repeat(num_epochs)

    return tracker.unwrap(ds.make_one_shot_iterator().get_next())
コード例 #3
0
    def test_should_convert_multiple_article_authors_of_single_reference(
      self, scienceparse_jats_xslt):
      """Every reference author should become a JATS <name> element
      with matching surname and given-names."""
      authors = [AUTHOR_1, AUTHOR_2]
      author_names = [
        '%s %s' % (a['first-name'], a['last-name']) for a in authors
      ]
      jats = etree.fromstring(scienceparse_jats_xslt({
        'references': [extend_dict(REFERENCE_1, {'authors': author_names})]
      }))

      person_group = _get_item(
        _get_item(
          _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
          'element-citation'),
        'person-group')
      persons = person_group.xpath('name')
      assert len(persons) == 2

      for person, author in zip(persons, authors):
        assert _get_text(person, 'surname') == author['last-name']
        assert _get_text(person, 'given-names') == author['first-name']
コード例 #4
0
 def test_should_filter_by_channel_colors(self):
     """read_examples should drop examples whose annotation image does
     not contain any of the requested channel colors."""
     # patch the module-level TFRecordDataset so no real files are read
     with patch.object(examples_module,
                       'TFRecordDataset') as TFRecordDataset:
         with tf.Graph().as_default():
             # one serialized example per page, each with a page-specific color
             TFRecordDataset.return_value = list_dataset([
                 dict_to_example(
                     extend_dict(
                         EXAMPLE_PROPS_1,
                         page_no=page_no,
                         annotation_image=image_with_color(
                             some_color(page_no)))).SerializeToString()
                 for page_no in [1, 2, 3, 4]
             ], tf.string)
             # only pages 2 and 3 carry one of the requested colors
             examples = read_examples(
                 DATA_PATH,
                 shuffle=False,
                 num_epochs=1,
                 page_range=(0, 100),
                 channel_colors=[some_color(i) for i in [2, 3]])
             TFRecordDataset.assert_called_with(DATA_PATH,
                                                compression_type='GZIP')
             with tf.Session() as session:
                 assert [
                     x['page_no']
                     for x in fetch_examples(session, examples)
                 ] == [2, 3]
コード例 #5
0
 def test_should_not_call_get_cloud_project_if_project_was_specified(
         self, get_cloud_project_mock):
     """An explicitly configured project should suppress the lookup."""
     overrides = {'cloud': True, 'project': PROJECT_1}
     args = argparse.Namespace(**extend_dict(DEFAULT_ARGS, overrides))
     process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
     get_cloud_project_mock.assert_not_called()
コード例 #6
0
 def test_should_use_get_cloud_project_if_project_is_empty(
         self, get_cloud_project_mock):
     """With cloud enabled and no project set, the project should be
     resolved via get_cloud_project()."""
     get_cloud_project_mock.return_value = PROJECT_1
     overrides = {'cloud': True, 'project': None}
     args = argparse.Namespace(**extend_dict(DEFAULT_ARGS, overrides))
     process_cloud_args(args, output_path=DEFAULT_OUTPUT_PATH)
     assert args.project == PROJECT_1  # pylint: disable=no-member
コード例 #7
0
def add_read_source_to_extracted_xml_pipeline_steps(p, opt,
                                                    get_pipeline_output_file):
    """Extend beam pipeline `p` with steps that produce extracted XML.

    Reads either a list of LXML files (when opt.lxml_file_list is set)
    or PDFs (via add_read_pdfs_to_annotated_lxml_pipeline_steps),
    converts the annotated structured documents to XML and optionally
    enhances the result via GROBID. Returns the resulting PCollection.
    """
    if opt.lxml_file_list:
        lxml_urls = p | ReadFileList(
            opt.lxml_file_list, column=opt.lxml_file_column, limit=opt.limit)

        # load each LXML file as a structured document; individual read
        # errors are counted and logged rather than failing the pipeline
        annotated_lxml = (lxml_urls | PreventFusion(
        ) | "ReadLxmlContent" >> TransformAndCount(
            MapOrLog(
                lambda url: {
                    DataProps.SOURCE_FILENAME: url,
                    DataProps.STRUCTURED_DOCUMENT: load_structured_document(url
                                                                            )
                },
                error_count=MetricCounters.READ_LXML_ERROR),
            MetricCounters.FILES))

        extract_tag_scope = None
    else:
        annotated_lxml, extract_tag_scope = add_read_pdfs_to_annotated_lxml_pipeline_steps(
            p, opt, get_pipeline_output_file)

    # convert each structured document to XML; the (large) structured
    # document is removed from the item once the XML has been extracted
    extracted_xml = (annotated_lxml | "ExtractToXml" >> MapOrLog(
        lambda v: remove_keys_from_dict(extend_dict(
            v, {
                DataProps.EXTRACTED_XML:
                extract_annotated_structured_document_to_xml(
                    v[DataProps.STRUCTURED_DOCUMENT],
                    tag_scope=extract_tag_scope)
            }),
                                        keys_to_remove=
                                        {DataProps.STRUCTURED_DOCUMENT}),
        error_count=MetricCounters.EXTRACT_TO_XML_ERROR))

    if opt.use_grobid:
        # optionally post-process the extracted XML through GROBID
        enhancer = GrobidXmlEnhancer(opt.grobid_url,
                                     start_service=opt.start_grobid_service)
        extracted_xml = (extracted_xml | "GrobidEnhanceXml" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML:
                enhancer(v[DataProps.EXTRACTED_XML])
            }),
            error_count=MetricCounters.GROBID_ERROR))
    return extracted_xml
コード例 #8
0
    def test_should_use_process_header_if_includes_only_contains_header(
            self, config, args, grobid_service_instance):
        """When only the title field is requested, the header-only
        grobid action should be chosen."""
        args.grobid_action = None
        pipeline_input = extend_dict(
            PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.TITLE}})
        _run_pipeline(config, args, pipeline_input)
        grobid_service_instance.assert_called_with(
            (PDF_INPUT['filename'], PDF_INPUT['content']),
            path=GrobidApiPaths.PROCESS_HEADER_DOCUMENT)
コード例 #9
0
    def test_should_use_process_full_text_if_includes_only_contains_references(
            self, config, args, grobid_service_instance):
        """When only references are requested, the full-text grobid
        action should be chosen."""
        args.grobid_action = None
        pipeline_input = extend_dict(
            PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.REFERENCES}})
        _run_pipeline(config, args, pipeline_input)
        grobid_service_instance.assert_called_with(
            (PDF_INPUT['filename'], PDF_INPUT['content']),
            path=GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT)
コード例 #10
0
    def test_should_convert_venue_as_source(self, scienceparse_jats_xslt):
      """A reference's 'venue' should map to the JATS <source> element."""
      reference = extend_dict(REFERENCE_1, {'venue': VALUE_1})
      jats = etree.fromstring(
        scienceparse_jats_xslt({'references': [reference]}))

      element_citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation')

      assert _get_text(element_citation, 'source') == VALUE_1
コード例 #11
0
        def test_should_convert_single_page_no(self, grobid_jats_xslt):
            """A single page value should populate both fpage and lpage."""
            tei = _tei(references=[
                _reference(**extend_dict(REFERENCE_1, page='page1'))
            ])
            jats = etree.fromstring(grobid_jats_xslt(tei))

            element_citation = _get_item(
                _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                'element-citation')

            assert _get_text(element_citation, 'fpage') == 'page1'
            assert _get_text(element_citation, 'lpage') == 'page1'
コード例 #12
0
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    """Extend beam pipeline `p` to read PDFs and extract XML via GROBID.

    Returns a PCollection of items extended with DataProps.EXTRACTED_XML;
    GROBID errors are counted and logged rather than failing the pipeline.
    """
    grobid_transformer = grobid_service(opt.grobid_url,
                                        opt.grobid_action,
                                        start_service=opt.start_grobid_service)

    # [1] takes the second element of the transformer's result —
    # presumably the response content of a (filename, content) pair; confirm
    return (p | PdfUrlSource(opt) | PreventFusion() | ReadPdfContent(
    ) | "Grobid" >> MapOrLog(lambda v: extend_dict(
        v, {
            DataProps.EXTRACTED_XML:
            grobid_transformer(
                (v[DataProps.SOURCE_FILENAME], v[DataProps.PDF_CONTENT]))[1]
        }),
                             error_count=MetricCounters.GROBID_ERROR))
コード例 #13
0
        def test_should_convert_year_and_month(self, grobid_jats_xslt):
            """Year and month of a reference should map to the JATS
            <year> and <month> elements."""
            tei = _tei(references=[
                _reference(**extend_dict(REFERENCE_1, year='2001', month='02'))
            ])
            jats = etree.fromstring(grobid_jats_xslt(tei))

            element_citation = _get_item(
                _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                'element-citation')

            assert _get_text(element_citation, 'year') == '2001'
            assert _get_text(element_citation, 'month') == '02'
コード例 #14
0
    def test_should_use_unknown_class_weight_from_configuration(
            self, parse_color_map_from_file_mock, parse_json_file_mock):
        """The configured 'unknown' class weight should end up as the
        last entry of the model's pos_weight."""
        parse_color_map_from_file_mock.return_value = SOME_COLOR_MAP
        parse_json_file_mock.return_value = extend_dict(
            SOME_CLASS_WEIGHTS, {'unknown': 0.99})
        args = create_args(
            DEFAULT_ARGS,
            base_loss=BaseLoss.WEIGHTED_CROSS_ENTROPY,
            color_map=COLOR_MAP_FILENAME,
            class_weights=CLASS_WEIGHTS_FILENAME,
            use_separate_channels=True,
            use_unknown_class=True)
        model = Model(args)
        # the unknown class is appended after the regular channels
        assert model.pos_weight[-1] == 0.99
コード例 #15
0
    def wrapper(x):
        """Apply `step` to item `x` if its data type is supported.

        Returns `x` extended with the step's output (and increments the
        processed counter) when x['type'] is in `supported_types`;
        otherwise returns `x` unchanged (and increments the ignored
        counter).
        """
        data_type = x['type']
        if data_type in supported_types:
            # fixed typo in log message: 'excuting' -> 'executing'
            get_logger().debug('executing step %s: %s (%s)', step, x.keys(),
                               data_type)
            result = extend_dict(x, step(x))
            get_logger().debug('result of step %s: %s (%s)', step,
                               result.keys(), result.get('type'))
            processed_counter.inc()
            return result

        get_logger().debug('skipping step %s, %s not in supported types (%s)',
                           step, data_type, supported_types)
        ignored_counter.inc()
        return x
コード例 #16
0
        def test_should_only_return_article_title_at_different_levels(
                self, grobid_jats_xslt, title_level):
            """The article title should be extracted regardless of the
            TEI title level it appears at."""
            reference = _reference(**extend_dict(
                REFERENCE_1,
                article_title=ARTICLE_TITLE_1,
                title_level=title_level))
            jats = etree.fromstring(
                grobid_jats_xslt(_tei(references=[reference])))

            element_citation = _get_item(
                _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                'element-citation')

            assert _get_text(
                element_citation, 'article-title') == ARTICLE_TITLE_1
コード例 #17
0
        def test_should_fallback_to_collection_title_if_article_title_does_not_exist(
                self, grobid_jats_xslt):
            """Without an article title, the collection title should be
            used as the JATS article-title."""
            reference = _reference(**extend_dict(
                REFERENCE_1,
                article_title=None,
                collection_title=COLLECTION_TITLE_1))
            jats = etree.fromstring(
                grobid_jats_xslt(_tei(references=[reference])))

            element_citation = _get_item(
                _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                'element-citation')

            assert _get_text(
                element_citation, 'article-title') == COLLECTION_TITLE_1
コード例 #18
0
        def test_should_only_return_article_title_even_if_collection_title_exists(
                self, grobid_jats_xslt):
            """When both titles are present, the article title should win."""
            reference = _reference(**extend_dict(
                REFERENCE_1,
                article_title=ARTICLE_TITLE_1,
                collection_title=COLLECTION_TITLE_1))
            jats = etree.fromstring(
                grobid_jats_xslt(_tei(references=[reference])))

            element_citation = _get_item(
                _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                'element-citation')

            assert _get_text(
                element_citation, 'article-title') == ARTICLE_TITLE_1
コード例 #19
0
def configure_pipeline(p, opt, pipeline, config):
    """Wire the configured pipeline steps into beam pipeline `p`.

    Reads the input files, tags each item with its guessed mime type,
    applies every step's transform in order, logs the results and
    writes each item's content to the per-source output file.
    """
    def get_pipeline_output_file(source_url, ext):
        # map a source URL to its output location with the given extension
        return get_output_file(source_url, opt.base_data_path, opt.output_path,
                               ext)

    def get_default_output_file_for_source_file(source_url):
        # output location using the configured default output suffix
        return get_pipeline_output_file(source_url, opt.output_suffix)

    def output_file_not_exists(source_url):
        # True when the default output for source_url has not been written yet
        return not _file_exists(
            get_default_output_file_for_source_file(source_url))

    steps = pipeline.get_steps(config, opt)

    LOGGER.info('steps: %s', steps)

    input_urls = (p | FileUrlSource(opt) | PreventFusion())

    # in resume mode, skip sources whose output already exists
    if opt.resume:
        input_urls |= beam.Filter(output_file_not_exists)

    # tag each item with its guessed mime type so steps can filter by type
    input_data = (input_urls | ReadFileContent()
                  | "Determine Type" >> beam.Map(lambda d: extend_dict(
                      d, {
                          DataProps.TYPE:
                          mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
                      })))

    result = input_data

    # chain the transform of every configured step
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (result | beam.Map(
        lambda x: LOGGER.info('result: %s (%s)', x.keys(), x[DataProps.TYPE])))

    _ = (  # noqa: F841
        result | "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.
                                                          SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT]))),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)))
コード例 #20
0
        def test_should_convert_multiple_collection_authors_of_single_reference(
                self, grobid_jats_xslt):
            """Every collection author should become a <name> element
            with matching surname and given-names."""
            authors = [AUTHOR_1, AUTHOR_2]
            reference = _reference(**extend_dict(
                REFERENCE_1, collection_authors=authors))
            jats = etree.fromstring(
                grobid_jats_xslt(_tei(references=[reference])))

            person_group = _get_item(
                _get_item(
                    _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
                    'element-citation'),
                'person-group')
            persons = person_group.xpath('name')
            assert len(persons) == 2

            for person, author in zip(persons, authors):
                assert _get_text(person, 'surname') == author['last-name']
                assert _get_text(person, 'given-names') == author['first-name']
コード例 #21
0
def get_sentence_char_features(
        char_index,
        char_by_index_map,
        word_index_by_char_index_map,
        word_by_index_map):
    """Build the feature dict for the character at `char_index`.

    Combines features of the character and its containing word with
    features of the three neighbouring characters and words on each
    side, plus the raw indices and a constant bias term. Missing
    neighbours contribute features for the empty string.
    """
    # -10 keeps neighbouring word lookups out of range for unknown words
    word_index = word_index_by_char_index_map.get(char_index, -10)
    features = {}
    features.update(
        get_char_features('char', char_by_index_map.get(char_index, '')))
    features.update(
        get_word_features('word', word_by_index_map.get(word_index, '')))
    features.update({
        'char_index': char_index,
        'word_index': word_index,
        'bias': 1.0
    })
    for offset in (1, 2, 3):
        features.update(get_char_features(
            'char[-%d]' % offset,
            char_by_index_map.get(char_index - offset, '')))
        features.update(get_char_features(
            'char[+%d]' % offset,
            char_by_index_map.get(char_index + offset, '')))
        features.update(get_word_features(
            'word[-%d]' % offset,
            word_by_index_map.get(word_index - offset, '')))
        features.update(get_word_features(
            'word[+%d]' % offset,
            word_by_index_map.get(word_index + offset, '')))
    return features
コード例 #22
0
 def process_request(self,
                     data: dict,
                     session: requests.Session,
                     context: dict = None):
     """Post the text of each node matching self._xpath to the API and
     apply any revised text back onto the XML document.

     :param data: dict with at least a 'content' key holding parseable XML
     :param session: requests session used for the API calls
     :param context: optional request context, used for the timeout lookup
     :return: `data` unchanged when nothing matched; otherwise a copy of
         `data` with 'content' replaced by the serialised (possibly
         revised) XML
     :raises requests.HTTPError: when the API responds with an error status
     """
     root = etree.fromstring(data['content'])
     matching_nodes = root.xpath(self._xpath)
     if not matching_nodes:
         LOGGER.info('xpath not matching any element: %s', self._xpath)
         return data
     for node in matching_nodes:
         value = get_text_content(node)
         LOGGER.debug('node for xpath %s: %s (text: %s)', self._xpath, node,
                      value)
         response = session.post(
             self._api_url,
             data=value.encode('utf-8'),
             timeout=self.get_default_request_timeout(context=context))
         response.raise_for_status()
         revised_value = response.text
         LOGGER.debug('revised_value: %s (was: %s)', revised_value, value)
         # only touch the document when the service actually changed the text
         if revised_value != value:
             apply_revised_value(node, revised_value)
     return extend_dict(data, {'content': etree.tostring(root)})
コード例 #23
0
def configure_pipeline(p, opt, pipeline, config):
    """Wire the configured pipeline steps into beam pipeline `p`.

    Resolves the remaining (not yet processed) file list up-front and
    returns early when there is nothing to do. Each file is read,
    tagged with its guessed mime type, passed through every step's
    transform and written to the per-source output file.
    """
    get_default_output_file_for_source_file = get_output_file_for_source_file_fn(
        opt)
    file_list = get_remaining_file_list_for_args(opt)
    LOGGER.debug('file_list: %s', file_list)

    # nothing left to process: leave the pipeline unmodified
    if not file_list:
        LOGGER.info('no files to process')
        return

    steps = pipeline.get_steps(config, opt)

    LOGGER.info('steps: %s', steps)

    input_urls = (p | beam.Create(file_list) | PreventFusion())

    # tag each item with its guessed mime type so steps can filter by type
    input_data = (input_urls | ReadFileContent()
                  | "Determine Type" >> beam.Map(lambda d: extend_dict(
                      d, {
                          DataProps.TYPE:
                          mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
                      })))

    result = input_data

    # chain the transform of every configured step
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (  # noqa: F841
        result | "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_default_output_file_for_source_file(v[DataProps.
                                                          SOURCE_FILENAME]),
                encode_if_text_type(v[DataProps.CONTENT]))),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)))
コード例 #24
0
def patch_preprocessing_pipeline(**kwargs):
    """Patch the preprocessing pipeline module's collaborators with mocks.

    NOTE(review): the body contains `yield`, so this is a generator —
    presumably decorated with @contextmanager at the definition site
    (decorator not visible here); confirm before calling it directly.

    Names in `always_mock` are always replaced (a mock supplied via
    `kwargs` takes precedence); WritePropsToTFRecord is replaced by a
    dummy transform that records calls on `tfrecords_mock` instead of
    writing real TFRecord files. Yields the dict of mocks, extended
    with a 'tfrecords' entry.
    """
    # names that are mocked out regardless of kwargs
    always_mock = {
        'find_file_pairs_grouped_by_parent_directory_or_name',
        'read_all_from_path', 'pdf_bytes_to_png_pages',
        'convert_pdf_bytes_to_lxml', 'convert_and_annotate_lxml_content',
        'svg_page_to_blockified_png_bytes', 'save_svg_roots', 'save_pages',
        'evaluate_document_by_page', 'ReadDictCsv'
    }
    tfrecords_mock = Mock(name='tfrecords_mock')

    def DummyWritePropsToTFRecord(file_path, extract_props):
        # stand-in transform: records (file_path, extracted props) calls
        # on tfrecords_mock instead of writing TFRecord files
        return TransformAndLog(
            beam.Map(
                lambda v: tfrecords_mock(file_path, list(extract_props(v)))),
            log_fn=lambda x: get_logger().info('tfrecords: %s', x))

    with patch.multiple(PREPROCESSING_PIPELINE,
                        WritePropsToTFRecord=DummyWritePropsToTFRecord,
                        **{k: kwargs.get(k, DEFAULT)
                           for k in always_mock}) as mocks:
        get_current_test_context().mocks = mocks
        # default side effects producing fake file content / LXML
        mocks['read_all_from_path'].side_effect = fake_content
        mocks['convert_pdf_bytes_to_lxml'].side_effect = fake_lxml_for_pdf
        yield extend_dict(mocks, {'tfrecords': tfrecords_mock})
コード例 #25
0
def configure_pipeline(p, opt):
    image_size = ((opt.image_width, opt.image_height)
                  if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
    if opt.lxml_path:
        lxml_xml_file_pairs = (
            p | beam.Create(
                [[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.lxml_path, opt.xml_path]
                ]]) | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns), opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug') | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }))
    elif opt.pdf_path or opt.pdf_xml_file_list:
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list,
                                                      limit=opt.limit)
                | "TranslateFilePairUrls" >>
                beam.Map(lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (p | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.pdf_path, opt.xml_path]
            ]]) | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(
                        patterns), opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug'))
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }), MetricCounters.FILE_PAIR))

        lxml_xml_file_pairs = (
            pdf_xml_file_pairs | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'lxml_content':
                            convert_pdf_bytes_to_lxml(v['pdf_content'],
                                                      path=v['source_filename'
                                                             ],
                                                      page_range=page_range)
                        }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e,
                    v['source_filename'],
                    v['xml_filename'],
                    exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    if opt.save_png or opt.save_tfrecords:
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs)
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'pdf_png_pages':
                            list(
                                pdf_bytes_to_png_pages(v['pdf_content'],
                                                       dpi=opt.png_dpi,
                                                       image_size=image_size,
                                                       page_range=page_range))
                        }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            _ = (with_pdf_png_pages | "SavePdfToPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.png.zip')),
                    '.png', v['pdf_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.save_lxml:
        _ = (lxml_xml_file_pairs
             | "SaveLxml" >> TransformAndLog(
                 beam.Map(lambda v: save_file_content(
                     FileSystems.join(
                         opt.output_path,
                         change_ext(
                             relative_path(opt.base_data_path, v[
                                 'source_filename']), None, '.lxml.gz')), v[
                                     'lxml_content'])),
                 log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    annotation_results = ((
        with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs
    ) | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
        MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'svg_pages':
                        list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']))
                    }),
                # Won't need the XML anymore
                {'lxml_content', 'xml_content'}),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, source: %s, xml: %s',
                e,
                v['source_filename'],
                v['xml_filename'],
                exc_info=e)),
            error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
        MetricCounters.PAGE,
        lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (annotation_results | "SaveSvgPages" >> TransformAndLog(
            beam.Map(lambda v: save_svg_roots(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']
                                      ), None, '.svg.zip')), v['svg_pages'])),
            log_fn=lambda x: get_logger().info('saved result: %s', x)))

    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'annotation_evaluation':
                            evaluate_document_by_page(
                                SvgStructuredDocument(v['svg_pages']))
                        }), None
                    if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s', x[
                        'source_filename'], x['annotation_evaluation'])))

    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results)
            | "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size)
                            for svg_page in v['svg_pages']
                        ]
                    }), {'svg_pages'})))

        if opt.save_block_png:
            _ = (with_block_png_pages | "SaveBlockPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.block-png.zip')),
                    '.png', v['block_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            if opt.min_annotation_percentage:
                filtered_pages = (
                    with_block_png_pages | "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'], opt.
                                min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            _ = (filtered_pages | "WriteTFRecords" >> WritePropsToTFRecord(
                FileSystems.join(opt.output_path, 'data'), lambda v: ({
                    'input_uri':
                    v['source_filename'] + '#page%d' % (first_page + i),
                    'input_image':
                    pdf_png_page,
                    'annotation_uri':
                    (v['source_filename'] + '.annot' + '#page%d' %
                     (first_page + i)),
                    'annotation_image':
                    block_png_page,
                    'page_no':
                    first_page + i
                } for i, pdf_png_page, block_png_page in zip(
                    range(len(v['pdf_png_pages'])), v['pdf_png_pages'], v[
                        'block_png_pages']))))

    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        _ = (  # flake8: noqa
            annotation_evaluation_results | "FlattenAnotationEvaluationResults"
            >> beam.FlatMap(lambda v: to_annotation_evaluation_csv_dict_rows(
                v['annotation_evaluation'],
                document=basename(v['source_filename'])))
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path,
                                      annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
コード例 #26
0
        assert class_weights_to_pos_weight({
            'a': 0.1,
            'b': 0.2,
            'c': 0.3
        }, ['a', 'b'], True, DEFAULT_UNKNOWN_CLASS_WEIGHT) == (
            [0.1, 0.2, DEFAULT_UNKNOWN_CLASS_WEIGHT]
        )


# Baseline argument set shared by the tests below: the core defaults
# extended with model/training specific options, all disabled/unset.
DEFAULT_ARGS = extend_dict(
    CORE_DEFAULT_ARGS,
    dict(
        pages=None,
        color_map=None,
        class_weights=None,
        channels=None,
        filter_annotated=False,
        use_separate_channels=False,
        use_unknown_class=False,
        debug=False
    )
)


def create_args(*args, **kwargs):
    """Merge the given dicts (and keyword overrides) into a namedtuple 'args'."""
    merged = extend_dict(*(list(args) + [kwargs]))
    args_type = namedtuple('args', merged.keys())
    return args_type(**merged)


@pytest.mark.usefixtures(
    'parse_json_file_mock'
コード例 #27
0
 def test_should_preserve_other_input_props(self, config, args):
     """Properties unrelated to the pipeline should pass through unchanged."""
     input_props = extend_dict(
         _generate_content_with_title(TITLE_1),
         {'other': 'other1'}
     )
     result = _run_pipeline(config, args, input_props)
     assert result['other'] == 'other1'
コード例 #28
0
 def test_should_parse_namespaces(self):
     """The namespaces option given as a JSON string should parse to a dict."""
     args_dict = extend_dict(
         DEFAULT_ARGS, {'namespaces': '{"xyz": "http://xyz"}'})
     opt = parse_args(_get_argv(args_dict))
     assert opt.namespaces == {'xyz': 'http://xyz'}
コード例 #29
0
def add_read_pdfs_to_annotated_lxml_pipeline_steps(p, opt,
                                                   get_pipeline_output_file):
    """Add steps to Beam pipeline ``p`` that read PDFs and produce annotated LXML.

    Each PDF is converted to a structured (LXML) document; depending on the
    options, the document is then annotated using a computer-vision model
    and/or a CRF model.

    Args:
        p: the Apache Beam pipeline to extend.
        opt: parsed pipeline options; the fields used here are ``pages``,
            ``cv_model_export_dir``, ``image_width``, ``image_height``,
            ``save_cv_output``, ``crf_model`` and ``save_annot_lxml``.
        get_pipeline_output_file: callable mapping
            (source filename, output extension) to an output file path.

    Returns:
        Tuple of (lxml_content PCollection, extract_tag_scope), where
        extract_tag_scope is CRF_TAG_SCOPE, CV_TAG_SCOPE or None depending
        on which annotation steps were enabled (CRF wins if both are).
    """
    page_range = opt.pages

    # CV annotation is enabled when a model export directory was provided
    cv_enabled = opt.cv_model_export_dir

    extract_tag_scope = None

    pdf_urls = p | PdfUrlSource(opt)

    # Convert each PDF to a structured LXML document. Failures are logged and
    # counted via MapOrLog instead of failing the whole pipeline.
    lxml_content = (pdf_urls | PreventFusion() | ReadPdfContent()
                    | "ConvertPdfToLxml" >> MapOrLog(
                        lambda v: extend_dict(
                            v, {
                                DataProps.STRUCTURED_DOCUMENT:
                                convert_pdf_bytes_to_structured_document(
                                    v[DataProps.PDF_CONTENT],
                                    path=v[DataProps.SOURCE_FILENAME],
                                    page_range=page_range)
                            }),
                        log_fn=lambda e, v: (get_logger().warning(
                            'caught exception (ignoring item): %s, pdf: %s',
                            e,
                            v[DataProps.SOURCE_FILENAME],
                            exc_info=e)),
                        error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))

    if cv_enabled:
        # Only scale images when both dimensions were specified
        image_size = ((opt.image_width, opt.image_height)
                      if opt.image_width and opt.image_height else None)
        inference_model_wrapper = InferenceModelWrapper(
            opt.cv_model_export_dir)

        # Render PDF pages to PNG (dropping the raw PDF bytes afterwards),
        # then run the CV inference model over the page images (dropping the
        # page images afterwards to keep the elements small).
        cv_predictions = (
            lxml_content | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v,
                        {
                            DataProps.PDF_PNG_PAGES:
                            list(
                                pdf_bytes_to_png_pages(
                                    v[DataProps.PDF_CONTENT],
                                    dpi=90,  # not used if the image is scaled
                                    image_size=image_size,
                                    page_range=page_range))
                        }),
                    keys_to_remove={DataProps.PDF_CONTENT}),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR)
            | "ComputerVisionPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(extend_dict(
                    v, {
                        DataProps.CV_PREDICTION_PNG_PAGES:
                        inference_model_wrapper(v[DataProps.PDF_PNG_PAGES]),
                        DataProps.COLOR_MAP:
                        inference_model_wrapper.get_color_map()
                    }),
                                                keys_to_remove=
                                                {DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.CV_PREDICTION_ERROR))

        if opt.save_cv_output:
            # Side output: persist the CV prediction images as PNGs
            _ = (cv_predictions | "SaveComputerVisionOutput" >>
                 TransformAndLog(beam.Map(lambda v: save_pages(
                     get_pipeline_output_file(v[DataProps.SOURCE_FILENAME],
                                              OutputExt.CV_PNG), '.png',
                     [
                         image_data_to_png(image_data)
                         for image_data in v[DataProps.CV_PREDICTION_PNG_PAGES]
                     ])),
                                 log_fn=lambda x: get_logger().info(
                                     'saved cv output: %s', x)))

        # Annotate the structured document using the CV prediction images.
        # NOTE(review): keys_to_remove is PDF_PNG_PAGES, but that key was
        # already removed in the "ComputerVisionPrediction" step above;
        # possibly CV_PREDICTION_PNG_PAGES was intended here — confirm.
        cv_annotated_lxml = (
            cv_predictions | "AnnotateLxmlUsingCvPrediction" >>
            MapOrLog(lambda v: remove_keys_from_dict(extend_dict(
                v, {
                    DataProps.STRUCTURED_DOCUMENT:
                    (annotate_structured_document_using_predicted_image_data(
                        v[DataProps.STRUCTURED_DOCUMENT],
                        v[DataProps.CV_PREDICTION_PNG_PAGES],
                        v[DataProps.COLOR_MAP],
                        tag_scope=CV_TAG_SCOPE))
                }),
                                                     keys_to_remove=
                                                     {DataProps.PDF_PNG_PAGES
                                                      }),
                     error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR
                     ))

        lxml_content = cv_annotated_lxml
        extract_tag_scope = CV_TAG_SCOPE

    if opt.crf_model:
        # Annotate (possibly already CV-annotated) documents with a CRF model
        model = load_crf_model(opt.crf_model)
        crf_annotated_lxml = (
            lxml_content | "AnnotateLxmlUsingCrfPrediction" >> MapOrLog(
                lambda v: extend_dict(
                    v, {
                        DataProps.STRUCTURED_DOCUMENT:
                        predict_and_annotate_structured_document(
                            v[DataProps.STRUCTURED_DOCUMENT], model)
                    }),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = crf_annotated_lxml
        extract_tag_scope = CRF_TAG_SCOPE

    if opt.save_annot_lxml:
        # Side output: persist the annotated structured documents
        _ = (  # flake8: noqa
            lxml_content | "SaveAnnotLxml" >>
            TransformAndLog(beam.Map(lambda v: save_structured_document(
                get_pipeline_output_file(
                    v[DataProps.SOURCE_FILENAME],
                    get_annot_lxml_ext(crf_enabled=opt.crf_model,
                                       cv_enabled=cv_enabled)), v[
                                           DataProps.STRUCTURED_DOCUMENT])),
                            log_fn=lambda x: get_logger().info(
                                'saved annoted lxml to: %s', x)))
    return lxml_content, extract_tag_scope
コード例 #30
0
def create_args(*args, **kwargs):
    """Merge the given dicts (and keyword overrides) into a namedtuple 'args'."""
    combined = extend_dict(*(list(args) + [kwargs]))
    return namedtuple('args', combined.keys())(**combined)