Beispiel #1
0
 def __init__(self, grobid_url, start_service):
     """Create the two Grobid callables held by this object.

     Both callables are built by ``grobid_service`` with the same
     ``grobid_url`` and ``start_service`` flag; they differ only in the
     API path and the ``field_name`` keyword.

     Args:
         grobid_url: passed as the first argument to ``grobid_service``.
         start_service: forwarded as the ``start_service`` keyword.
     """
     def create_service(api_path, field_name):
         # shared factory so that both services are configured alike
         return grobid_service(
             grobid_url,
             api_path,
             start_service=start_service,
             field_name=field_name)

     self.process_header_names = create_service(
         GrobidApiPaths.PROCESS_HEADER_NAMES, 'names')
     self.process_affiliations = create_service(
         GrobidApiPaths.PROCESS_AFFILIATIONS, 'affiliations')
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    """Append the PDF reading and Grobid conversion steps to ``p``.

    The steps are applied in this order: ``PdfUrlSource(opt)``,
    ``PreventFusion()``, ``ReadPdfContent()`` and finally a step labelled
    "Grobid" that extends every item with ``DataProps.EXTRACTED_XML``.

    Args:
        p: the object the transforms are piped onto.
        opt: options object; ``grobid_url``, ``grobid_action`` and
            ``start_grobid_service`` are read here, and the whole object is
            handed to ``PdfUrlSource``.

    Returns:
        The result of applying the "Grobid" step.
    """
    grobid_transformer = grobid_service(
        opt.grobid_url,
        opt.grobid_action,
        start_service=opt.start_grobid_service)

    def add_extracted_xml(item):
        # the transformer is called with a (filename, content) tuple and
        # element [1] of its result is stored as the extracted XML
        grobid_result = grobid_transformer(
            (item[DataProps.SOURCE_FILENAME], item[DataProps.PDF_CONTENT]))
        return extend_dict(item, {DataProps.EXTRACTED_XML: grobid_result[1]})

    pdf_urls = p | PdfUrlSource(opt)
    unfused_pdf_urls = pdf_urls | PreventFusion()
    pdf_contents = unfused_pdf_urls | ReadPdfContent()
    return pdf_contents | "Grobid" >> MapOrLog(
        add_extracted_xml, error_count=MetricCounters.GROBID_ERROR)
Beispiel #3
0
def configure_pipeline(p, opt):
  """Add the read, Grobid, optional XSLT, rename and write steps to ``p``.

  Args:
    p: the object the transforms are piped onto.
    opt: options object providing ``input``, ``grobid_url``,
      ``grobid_action``, ``start_grobid_service``, ``xslt_path``,
      ``output_path`` and ``output_suffix``.
  """
  def to_output_filename(source_filename):
    # keep only the last '/'-separated segment of the name, without its
    # extension, then place it below the output path with the new suffix
    name_without_ext = splitext(source_filename)[0].split('/')[-1]
    return '%s/%s' % (opt.output_path, name_without_ext + opt.output_suffix)

  # collection of (filename, content) tuples read from the input pattern
  file_contents = p | ReadFileNamesAndContent(opt.input)

  # convert the pdf content to xml using Grobid
  # (grobid_service either accepts the content or tuples)
  convert_using_grobid = grobid_service(
    opt.grobid_url, opt.grobid_action, start_service=opt.start_grobid_service
  )
  converted = file_contents | beam.Map(convert_using_grobid)

  # the XSLT step is only added when a stylesheet path was provided
  if opt.xslt_path:
    converted |= MapValues(xslt_transformer_from_file(opt.xslt_path))

  # replace the key (pdf filename) with the output filename
  converted |= MapKeys(to_output_filename)

  # write the files, using the key as the filename
  converted |= WriteToFile()
def run(argv=None):
  """Main entry point; defines and runs the Grobid conversion pipeline.

  Parses the known command line arguments (``--input``, ``--output-suffix``,
  ``--grobid-url``, ``--grobid-action``) and forwards all remaining arguments
  to the pipeline options. The pipeline reads the input files, passes them
  through the Grobid callable, swaps the file extension for the output
  suffix and writes the results.

  Args:
    argv: argument list handed to ``parse_known_args``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
    '--input', required=True,
    help='Input file pattern to process.')
  parser.add_argument(
    '--output-suffix', default='.tei-header.xml',
    help='Output file suffix to add to the filename (excluding the file extension).')
  parser.add_argument(
    '--grobid-url', default='http://localhost:8080',
    help='Base URL to the Grobid service')
  parser.add_argument(
    '--grobid-action', default=PROCESS_HEADER_DOCUMENT_PATH,
    help='Name of the Grobid action')
  args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  setup_options = pipeline_options.view_as(SetupOptions)
  setup_options.save_main_session = True

  def to_output_filename(source_filename):
    # swap the file extension for the configured output suffix
    return splitext(source_filename)[0] + args.output_suffix

  with beam.Pipeline(options=pipeline_options) as p:
    # collection of (filename, content) tuples read from the input pattern
    file_contents = p | ReadFileNamesAndContent(args.input)

    # convert the pdf content to xml using Grobid
    # (grobid_service either accepts the content or tuples)
    convert_using_grobid = grobid_service(args.grobid_url, args.grobid_action)
    converted = file_contents | beam.Map(convert_using_grobid)

    # change the key (filename) from pdf to xml to reflect the new content
    converted |= MapKeys(to_output_filename)

    # write the files, using the key as the filename
    converted |= WriteToFile()
Beispiel #5
0
    def get_steps(self, config, args):
        # type: (dict, object) -> list
        """Build the list of pipeline steps for converting PDF via Grobid.

        The first step always converts PDF content to TEI XML. Unless
        ``args.no_grobid_xslt`` is set, a second step applies the XSLT loaded
        from ``args.grobid_xslt_path`` and marks the result as JATS XML.

        Args:
            config: not used by this method.
            args: options object; ``grobid_url``, ``grobid_action``,
                ``no_grobid_xslt``, ``grobid_xslt_path`` and
                ``no_grobid_pretty_print`` are read.

        Returns:
            A list with one or two ``FunctionPipelineStep`` instances.
        """
        # without a configured URL, fall back to the local API URL and
        # request that the service is started
        configured_grobid_url = args.grobid_url
        start_grobid_service = not configured_grobid_url
        grobid_url = configured_grobid_url or LOCAL_GROBID_API_URL

        call_grobid = grobid_service(
            grobid_url,
            args.grobid_action,
            start_service=start_grobid_service)

        def convert_to_tei(pdf_filename, pdf_content, includes):
            # an explicitly configured action takes precedence over the
            # default action derived from the requested fields
            grobid_path = (
                args.grobid_action
                or get_default_grobid_action_for_fields(includes))
            grobid_result = call_grobid(
                (pdf_filename, pdf_content), path=grobid_path)
            return grobid_result[1]

        def pdf_to_tei(data):
            tei_content = convert_to_tei(
                pdf_filename=data[StepDataProps.FILENAME],
                pdf_content=data[StepDataProps.CONTENT],
                includes=data.get(StepDataProps.INCLUDES))
            return {
                StepDataProps.CONTENT: tei_content,
                StepDataProps.TYPE: MimeTypes.TEI_XML
            }

        steps = [
            FunctionPipelineStep(
                pdf_to_tei, {MimeTypes.PDF}, 'Convert to TEI')
        ]
        if args.no_grobid_xslt:
            return steps

        xslt_transformer = xslt_transformer_from_file(
            args.grobid_xslt_path,
            pretty_print=not args.no_grobid_pretty_print)

        def tei_to_jats(data):
            return {
                StepDataProps.CONTENT:
                    xslt_transformer(data[StepDataProps.CONTENT]),
                StepDataProps.TYPE: MimeTypes.JATS_XML
            }

        steps.append(
            FunctionPipelineStep(
                tei_to_jats, {MimeTypes.TEI_XML}, 'TEI to JATS'))
        return steps