Example #1
        def test_should_convert_multiple_article_authors_of_single_reference(
                self, scienceparse_jats_xslt):

            authors = [AUTHOR_1, AUTHOR_2]
            jats = etree.fromstring(
                scienceparse_jats_xslt({
                    'references': [
                        extend_dict(
                            REFERENCE_1, {
                                'authors': [
                                    '%s %s' %
                                    (author['first-name'], author['last-name'])
                                    for author in authors
                                ]
                            })
                    ]
                }))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')
            person_group = _get_item(element_citation, 'person-group')
            persons = person_group.xpath('name')
            assert len(persons) == 2

            for person, author in zip(persons, authors):
                assert _get_text(person, 'surname') == author['last-name']
                assert _get_text(person, 'given-names') == author['first-name']
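The tests above and below rely on `_get_item` and `_get_text` helpers that are not shown. A minimal sketch of what they are assumed to do, inferred from how they are used here (the single-match assertion is a guess, not taken from the actual source):

def _get_item(xml, xpath):
    # Assumed helper: return the single element matching the XPath,
    # failing the test if it is missing or ambiguous.
    items = xml.xpath(xpath)
    assert len(items) == 1, 'expected exactly one match for %r' % xpath
    return items[0]


def _get_text(xml, xpath):
    # Assumed helper: return the text content of the single matching element.
    return _get_item(xml, xpath).text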
def add_read_source_to_extracted_xml_pipeline_steps(p, opt,
                                                    get_pipeline_output_file):
    if opt.lxml_file_list:
        lxml_urls = p | ReadFileList(
            opt.lxml_file_list, column=opt.lxml_file_column, limit=opt.limit)

        annotated_lxml = (
            lxml_urls
            | PreventFusion()
            | "ReadLxmlContent" >> TransformAndCount(
                MapOrLog(
                    lambda url: {
                        DataProps.SOURCE_FILENAME: url,
                        DataProps.STRUCTURED_DOCUMENT: load_structured_document(url)
                    },
                    error_count=MetricCounters.READ_LXML_ERROR
                ),
                MetricCounters.FILES
            )
        )

        extract_tag_scope = None
    else:
        annotated_lxml, extract_tag_scope = add_read_pdfs_to_annotated_lxml_pipeline_steps(
            p, opt, get_pipeline_output_file)

    extracted_xml = (
        annotated_lxml
        | "ExtractToXml" >> MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    DataProps.EXTRACTED_XML:
                    extract_annotated_structured_document_to_xml(
                        v[DataProps.STRUCTURED_DOCUMENT],
                        tag_scope=extract_tag_scope)
                }),
                keys_to_remove={DataProps.STRUCTURED_DOCUMENT}
            ),
            error_count=MetricCounters.EXTRACT_TO_XML_ERROR
        )
    )

    if opt.use_grobid:
        enhancer = GrobidXmlEnhancer(opt.grobid_url,
                                     start_service=opt.start_grobid_service)
        extracted_xml = (extracted_xml | "GrobidEnhanceXml" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML:
                enhancer(v[DataProps.EXTRACTED_XML])
            }),
            error_count=MetricCounters.GROBID_ERROR))
    return extracted_xml
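The pipeline steps above repeatedly combine `extend_dict` and `remove_keys_from_dict` to derive a new record from the incoming one without mutating it. A minimal sketch of the behaviour these two helpers are assumed to have, based purely on how they are called in this section (not their actual definitions):

def extend_dict(d, *other_dicts, **kwargs):
    # Assumed behaviour: shallow-merge the additional dicts (and keyword
    # arguments) into a copy of d, leaving the original untouched.
    result = dict(d)
    for other in other_dicts:
        result.update(other)
    result.update(kwargs)
    return result


def remove_keys_from_dict(d, keys_to_remove=None):
    # Assumed behaviour: return a copy of d without the listed keys.
    if not keys_to_remove:
        return dict(d)
    return {k: v for k, v in d.items() if k not in keys_to_remove}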
Example #3
    def test_should_use_process_header_if_includes_only_contains_header(
            self, config, args, grobid_service_instance):

        args.grobid_action = None
        _run_pipeline(
            config, args,
            extend_dict(PDF_INPUT,
                        {StepDataProps.INCLUDES: {FieldNames.TITLE}}))
        grobid_service_instance.assert_called_with(
            (PDF_INPUT['filename'], PDF_INPUT['content']),
            path=GrobidApiPaths.PROCESS_HEADER_DOCUMENT)
Example #4
    def test_should_use_process_full_text_if_includes_only_contains_references(
            self, config, args, grobid_service_instance):

        args.grobid_action = None
        _run_pipeline(
            config, args,
            extend_dict(PDF_INPUT,
                        {StepDataProps.INCLUDES: {FieldNames.REFERENCES}}))
        grobid_service_instance.assert_called_with(
            (PDF_INPUT['filename'], PDF_INPUT['content']),
            path=GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT)
Example #5
        def test_should_convert_venue_as_source(self, scienceparse_jats_xslt):
            jats = etree.fromstring(
                scienceparse_jats_xslt({
                    'references':
                    [extend_dict(REFERENCE_1, {'venue': VALUE_1})]
                }))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation, 'source') == VALUE_1
        def test_should_convert_single_page_no(self, grobid_jats_xslt):
            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(**extend_dict(REFERENCE_1, page='page1'))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation, 'fpage') == 'page1'
            assert _get_text(element_citation, 'lpage') == 'page1'
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    grobid_transformer = grobid_service(opt.grobid_url,
                                        opt.grobid_action,
                                        start_service=opt.start_grobid_service)

    return (
        p
        | PdfUrlSource(opt)
        | PreventFusion()
        | ReadPdfContent()
        | "Grobid" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.EXTRACTED_XML: grobid_transformer(
                    (v[DataProps.SOURCE_FILENAME], v[DataProps.PDF_CONTENT])
                )[1]
            }),
            error_count=MetricCounters.GROBID_ERROR
        )
    )
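`MapOrLog` is used throughout these pipelines so that a single failing document is logged and counted rather than failing the whole job. A rough sketch of what it is assumed to do, based on the arguments it is called with here; the implementation details are guessed, not the project's actual code:

import apache_beam as beam
from apache_beam.metrics import Metrics


def MapOrLog(fn, log_fn=None, error_count=None):
    # Assumed behaviour: apply fn to each element; on exception, log the
    # error, increment an error counter and drop the element instead of
    # propagating the failure.
    counter = Metrics.counter('MapOrLog', error_count) if error_count else None

    def wrapper(x):
        try:
            yield fn(x)
        except Exception as e:  # pylint: disable=broad-except
            if log_fn:
                log_fn(e, x)
            if counter:
                counter.inc()

    return beam.FlatMap(wrapper)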
        def test_should_convert_year_and_month(self, grobid_jats_xslt):
            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(**extend_dict(
                            REFERENCE_1, year='2001', month='02'))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation, 'year') == '2001'
            assert _get_text(element_citation, 'month') == '02'
 def wrapper(x):
   data_type = x['type']
   if data_type in supported_types:
     get_logger().debug('executing step %s: %s (%s)', step, x.keys(), data_type)
     result = extend_dict(x, step(x))
     get_logger().debug('result of step %s: %s (%s)', step, result.keys(), result.get('type'))
     processed_counter.inc()
     return result
   else:
     get_logger().debug(
       'skipping step %s, %s not in supported types (%s)', step, data_type, supported_types
     )
     ignored_counter.inc()
     return x
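The `wrapper` above closes over `step`, `supported_types`, `processed_counter` and `ignored_counter`, so it presumably lives inside a small factory function that is not shown here. A self-contained sketch of how such a conditional step wrapper could be assembled and exercised; every name below (`make_conditional_wrapper`, `SimpleCounter`, the example step) is illustrative rather than the project's actual API:

class SimpleCounter:
    # Stand-in for the Beam metric counters used in the original code.
    def __init__(self):
        self.count = 0

    def inc(self):
        self.count += 1


def make_conditional_wrapper(step, supported_types, processed_counter, ignored_counter):
    # Apply `step` only to items whose 'type' it supports; pass everything
    # else through unchanged (debug logging omitted for brevity).
    def wrapper(x):
        if x['type'] not in supported_types:
            ignored_counter.inc()
            return x
        result = dict(x)
        result.update(step(x))
        processed_counter.inc()
        return result
    return wrapper


processed, ignored = SimpleCounter(), SimpleCounter()
wrap = make_conditional_wrapper(
    step=lambda x: {'type': 'text/plain', 'content': 'text of %s' % x['name']},
    supported_types={'application/pdf'},
    processed_counter=processed,
    ignored_counter=ignored
)
print(wrap({'type': 'application/pdf', 'name': 'a.pdf'})['type'])  # text/plain
print(wrap({'type': 'image/png', 'name': 'b.png'})['type'])        # image/png, ignored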
        def test_should_only_return_article_title_at_different_levels(
                self, grobid_jats_xslt, title_level):

            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(**extend_dict(REFERENCE_1,
                                                 article_title=ARTICLE_TITLE_1,
                                                 title_level=title_level))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation,
                             'article-title') == ARTICLE_TITLE_1
def configure_pipeline(p, opt, pipeline, config):
  get_pipeline_output_file = lambda source_url, ext: get_output_file(
    source_url,
    opt.base_data_path,
    opt.output_path,
    ext
  )

  steps = pipeline.get_steps(config, opt)

  LOGGER.info('steps: %s', steps)

  input_data = (
    p |
    FileUrlSource(opt) |
    PreventFusion() |
    ReadFileContent() |
    "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
      DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
    }))
  )

  result = input_data

  for step in steps:
    LOGGER.debug('step: %s', step)
    result |= get_step_transform(step)

  _ = (
    result |
    beam.Map(lambda x: LOGGER.info('result: %s (%s)', x.keys(), x[DataProps.TYPE]))
  )

  _ = (
    result |
    "WriteOutput" >> TransformAndLog(
      beam.Map(lambda v: save_file_content(
        get_pipeline_output_file(
          v[DataProps.SOURCE_FILENAME],
          opt.output_suffix
        ),
        encode_if_text_type(v[DataProps.CONTENT])
      )),
      log_fn=lambda x: get_logger().info('saved output to: %s', x)
    )
  )
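`configure_pipeline` derives each output filename from the source URL via `get_output_file`. A plausible sketch of the assumed mapping: keep the source file's location relative to `base_data_path`, root it at `output_path`, and swap in the new extension. The exact rules are an assumption, not taken from the real helper:

import os


def get_output_file(source_url, base_data_path, output_path, ext):
    # Assumed behaviour: mirror the source file's path relative to
    # base_data_path underneath output_path, with the new extension.
    rel_path = os.path.relpath(source_url, base_data_path)
    rel_base, _ = os.path.splitext(rel_path)
    return os.path.join(output_path, rel_base + ext)


# e.g. get_output_file('/data/pdfs/sub/file1.pdf', '/data/pdfs',
#                      '/data/output', '.xml')
# -> '/data/output/sub/file1.xml'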
        def test_should_only_return_article_title_even_if_collection_title_exists(
                self, grobid_jats_xslt):

            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(
                            **extend_dict(REFERENCE_1,
                                          article_title=ARTICLE_TITLE_1,
                                          collection_title=COLLECTION_TITLE_1))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation,
                             'article-title') == ARTICLE_TITLE_1
        def test_should_fallback_to_collection_title_if_article_title_does_not_exist(
                self, grobid_jats_xslt):

            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(
                            **extend_dict(REFERENCE_1,
                                          article_title=None,
                                          collection_title=COLLECTION_TITLE_1))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')

            assert _get_text(element_citation,
                             'article-title') == COLLECTION_TITLE_1
        def test_should_convert_multiple_collection_authors_of_single_reference(
                self, grobid_jats_xslt):
            authors = [AUTHOR_1, AUTHOR_2]
            jats = etree.fromstring(
                grobid_jats_xslt(
                    _tei(references=[
                        _reference(**extend_dict(REFERENCE_1,
                                                 collection_authors=authors))
                    ])))

            ref_list = _get_item(jats, 'back/ref-list')
            ref = _get_item(ref_list, 'ref')
            element_citation = _get_item(ref, 'element-citation')
            person_group = _get_item(element_citation, 'person-group')
            persons = person_group.xpath('name')
            assert len(persons) == 2

            for person, author in zip(persons, authors):
                assert _get_text(person, 'surname') == author['last-name']
                assert _get_text(person, 'given-names') == author['first-name']
def add_read_pdfs_to_annotated_lxml_pipeline_steps(p, opt,
                                                   get_pipeline_output_file):
    page_range = opt.pages

    cv_enabled = opt.cv_model_export_dir

    extract_tag_scope = None

    pdf_urls = p | PdfUrlSource(opt)

    lxml_content = (pdf_urls | PreventFusion() | ReadPdfContent()
                    | "ConvertPdfToLxml" >> MapOrLog(
                        lambda v: extend_dict(
                            v, {
                                DataProps.STRUCTURED_DOCUMENT:
                                convert_pdf_bytes_to_structured_document(
                                    v[DataProps.PDF_CONTENT],
                                    path=v[DataProps.SOURCE_FILENAME],
                                    page_range=page_range)
                            }),
                        log_fn=lambda e, v: (get_logger().warning(
                            'caught exception (ignoring item): %s, pdf: %s',
                            e,
                            v[DataProps.SOURCE_FILENAME],
                            exc_info=e)),
                        error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))

    if cv_enabled:
        image_size = ((opt.image_width, opt.image_height)
                      if opt.image_width and opt.image_height else None)
        inference_model_wrapper = InferenceModelWrapper(
            opt.cv_model_export_dir)

        cv_predictions = (
            lxml_content | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v,
                        {
                            DataProps.PDF_PNG_PAGES:
                            list(
                                pdf_bytes_to_png_pages(
                                    v[DataProps.PDF_CONTENT],
                                    dpi=90,  # not used if the image is scaled
                                    image_size=image_size,
                                    page_range=page_range))
                        }),
                    keys_to_remove={DataProps.PDF_CONTENT}),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR)
            | "ComputerVisionPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(extend_dict(
                    v, {
                        DataProps.CV_PREDICTION_PNG_PAGES:
                        inference_model_wrapper(v[DataProps.PDF_PNG_PAGES]),
                        DataProps.COLOR_MAP:
                        inference_model_wrapper.get_color_map()
                    }),
                                                keys_to_remove=
                                                {DataProps.PDF_PNG_PAGES}),
                error_count=MetricCounters.CV_PREDICTION_ERROR))

        if opt.save_cv_output:
            _ = (
                cv_predictions
                | "SaveComputerVisionOutput" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        get_pipeline_output_file(
                            v[DataProps.SOURCE_FILENAME], OutputExt.CV_PNG),
                        '.png',
                        [
                            image_data_to_png(image_data)
                            for image_data in v[DataProps.CV_PREDICTION_PNG_PAGES]
                        ])),
                    log_fn=lambda x: get_logger().info(
                        'saved cv output: %s', x)))

        cv_annotated_lxml = (
            cv_predictions
            | "AnnotateLxmlUsingCvPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.STRUCTURED_DOCUMENT:
                        annotate_structured_document_using_predicted_image_data(
                            v[DataProps.STRUCTURED_DOCUMENT],
                            v[DataProps.CV_PREDICTION_PNG_PAGES],
                            v[DataProps.COLOR_MAP],
                            tag_scope=CV_TAG_SCOPE)
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}
                ),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = cv_annotated_lxml
        extract_tag_scope = CV_TAG_SCOPE

    if opt.crf_model:
        model = load_crf_model(opt.crf_model)
        crf_annotated_lxml = (
            lxml_content | "AnnotateLxmlUsingCrfPrediction" >> MapOrLog(
                lambda v: extend_dict(
                    v, {
                        DataProps.STRUCTURED_DOCUMENT:
                        predict_and_annotate_structured_document(
                            v[DataProps.STRUCTURED_DOCUMENT], model)
                    }),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR))

        lxml_content = crf_annotated_lxml
        extract_tag_scope = CRF_TAG_SCOPE

    if opt.save_annot_lxml:
        _ = (
            lxml_content
            | "SaveAnnotLxml" >> TransformAndLog(
                beam.Map(lambda v: save_structured_document(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME],
                        get_annot_lxml_ext(
                            crf_enabled=opt.crf_model, cv_enabled=cv_enabled)),
                    v[DataProps.STRUCTURED_DOCUMENT])),
                log_fn=lambda x: get_logger().info(
                    'saved annotated lxml to: %s', x)))
    return lxml_content, extract_tag_scope
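`PreventFusion` appears immediately after every source in these pipelines. A minimal sketch of what it is assumed to do: insert a shuffle boundary so the runner cannot fuse the cheap "list the inputs" step with the expensive per-document work, which would otherwise run on a single worker. The implementation shown is a guess, not the project's actual transform:

import apache_beam as beam


class PreventFusion(beam.PTransform):
    # Assumed implementation: a reshuffle (or equivalent GroupByKey
    # round-trip) acts as a fusion break, letting downstream steps be
    # parallelised independently of the step that produced the elements.
    def expand(self, pcoll):
        return pcoll | "Reshuffle" >> beam.Reshuffle()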