def _grobid_jats_xslt():
    """Return a callable that runs the default GROBID XSLT on an XML tree.

    The transformer is created once, from ``DEFAULT_GROBID_XSLT_PATH``, when
    this factory is called; the returned callable reuses it on every call.
    """
    apply_xslt = xslt_transformer_from_file(DEFAULT_GROBID_XSLT_PATH)

    def wrapper(xml):
        # The transformer is fed the serialised document, not the parsed
        # tree, so serialise first and log exactly what is passed on.
        serialized_tei = etree.tostring(xml)
        LOGGER.debug('tei: %s', serialized_tei)
        return apply_xslt(serialized_tei)

    return wrapper
def _scienceparse_jats_xslt():
  """Return a callable that runs the Science Parse XSLT on a ``dict``.

  The transformer is created once, from ``SCIENCEPARSE_XSLT_PATH``, when this
  factory is called. The returned callable raises ``ValueError`` for any
  input that is not a ``dict``.
  """
  apply_xslt = xslt_transformer_from_file(SCIENCEPARSE_XSLT_PATH)

  def wrapper(content):
    # Reject anything but a dict up front; the rest is the happy path.
    if not isinstance(content, dict):
      raise ValueError('unsupported type: %s (%s)' % (type(content), content))

    # dict -> JSON text -> XML, which is what the stylesheet is applied to.
    source_xml = json_to_xml(json.dumps(content))
    LOGGER.debug('input xml: %s', source_xml)

    transformed_xml = apply_xslt(source_xml)
    LOGGER.debug('output xml: %s', transformed_xml)
    return transformed_xml
  return wrapper
Example #3
0
def _grobid_jats_xslt():
    """Return a callable that runs the default GROBID XSLT on an XML tree.

    The transformer is created once, from ``DEFAULT_GROBID_XSLT_PATH``, when
    this factory is called. The returned callable serialises the given tree,
    passes it to the transformer and returns the transformer's result
    unchanged.

    Pretty-printed copies of the input and the output are logged at DEBUG
    level. They are only built when DEBUG logging is enabled for ``LOGGER``,
    because each one costs an extra serialisation (and, for the output, an
    extra parse) that is otherwise thrown away.
    """
    # Function-scope import: only needed for the level constant used below.
    import logging

    transformer = xslt_transformer_from_file(DEFAULT_GROBID_XSLT_PATH)

    def wrapper(xml):
        xml_str = etree.tostring(xml)

        # Check once per call so both debug messages agree with each other.
        debug_enabled = LOGGER.isEnabledFor(logging.DEBUG)
        if debug_enabled:
            LOGGER.debug('tei: %s', etree.tostring(xml, pretty_print=True))

        result = transformer(xml_str)

        if debug_enabled:
            # Re-parsing the result is purely for readable log output; keeping
            # it behind the guard also means a parse problem in this
            # debug-only step cannot break a normal (non-debug) conversion.
            LOGGER.debug(
                'jats: %s',
                etree.tostring(etree.fromstring(result), pretty_print=True))
        return result

    return wrapper
Example #4
0
 def get_steps(self, config, args):
     # type: (dict, object) -> list
     """Build the list of steps for the Science Parse pipeline.

     The list always starts with a ``ScienceParseApiStep`` for
     ``args.science_parse_url``. Unless ``args.no_science_parse_xslt`` is
     set, a second step is appended that converts the step's content with
     the XSLT at ``args.science_parse_xslt_path``. ``config`` is not used.
     """
     steps = [ScienceParseApiStep(args.science_parse_url)]
     if args.no_science_parse_xslt:
         return steps

     to_jats = xslt_transformer_from_file(
         args.science_parse_xslt_path,
         pretty_print=not args.no_science_parse_pretty_print)

     def convert_to_jats(d):
         # The content is turned into XML first; the stylesheet is then
         # applied to that XML.
         source_xml = json_to_xml(d['content'])
         return {
             'content': to_jats(source_xml),
             'type': MimeTypes.JATS_XML
         }

     steps.append(FunctionPipelineStep(
         convert_to_jats, {MimeTypes.JSON}, 'Science Parse to JATS'))
     return steps
Example #5
0
def configure_pipeline(p, opt):
  """Attach the conversion stages to pipeline ``p`` according to ``opt``.

  Stages, in order: read (filename, content) tuples from ``opt.input``, map
  them through the Grobid service, optionally apply the XSLT at
  ``opt.xslt_path`` to the values, rewrite the keys to output file names,
  and write the result. Nothing is returned.
  """
  # source files as a collection of (filename, content) tuples
  file_entries = p | ReadFileNamesAndContent(opt.input)

  # pdf content -> xml via Grobid
  # (grobid_service either accepts the content or tuples)
  call_grobid = grobid_service(
    opt.grobid_url,
    opt.grobid_action,
    start_service=opt.start_grobid_service
  )
  converted = file_entries | beam.Map(call_grobid)

  # the XSLT stage is only added when a stylesheet path was given
  if opt.xslt_path:
    apply_xslt = xslt_transformer_from_file(opt.xslt_path)
    converted |= MapValues(apply_xslt)

  # new key: <output_path>/<input name without directories or extension><output_suffix>
  converted |= MapKeys(
    lambda k: '%s/%s' % (
      opt.output_path,
      splitext(k)[0].split('/')[-1] + opt.output_suffix
    )
  )

  # the key doubles as the file name to write to
  converted |= WriteToFile()
Example #6
0
    def get_steps(self, config, args):
        # type: (dict, object) -> list
        """Build the list of steps for the Grobid pipeline.

        The first step sends the PDF to Grobid and yields TEI XML. Unless
        ``args.no_grobid_xslt`` is set, a second step applies the XSLT at
        ``args.grobid_xslt_path`` to the TEI and yields JATS XML.
        ``config`` is not used.
        """
        # Without a configured URL, fall back to the local API URL and ask
        # grobid_service to start the service.
        start_grobid_service = not args.grobid_url
        grobid_url = args.grobid_url or LOCAL_GROBID_API_URL

        call_grobid = grobid_service(
            grobid_url,
            args.grobid_action,
            start_service=start_grobid_service)

        def convert_to_tei(pdf_filename, pdf_content, includes):
            # An explicit action wins; otherwise one is derived from the
            # requested fields.
            grobid_path = (
                args.grobid_action
                or get_default_grobid_action_for_fields(includes))
            response = call_grobid(
                (pdf_filename, pdf_content), path=grobid_path)
            # Only the second element of the response is kept.
            return response[1]

        def pdf_to_tei(data):
            return {
                StepDataProps.CONTENT: convert_to_tei(
                    pdf_filename=data[StepDataProps.FILENAME],
                    pdf_content=data[StepDataProps.CONTENT],
                    includes=data.get(StepDataProps.INCLUDES)),
                StepDataProps.TYPE: MimeTypes.TEI_XML
            }

        steps = [
            FunctionPipelineStep(pdf_to_tei, {MimeTypes.PDF}, 'Convert to TEI')
        ]
        if args.no_grobid_xslt:
            return steps

        to_jats = xslt_transformer_from_file(
            args.grobid_xslt_path,
            pretty_print=not args.no_grobid_pretty_print)

        def tei_to_jats(d):
            return {
                StepDataProps.CONTENT: to_jats(d[StepDataProps.CONTENT]),
                StepDataProps.TYPE: MimeTypes.JATS_XML
            }

        steps.append(FunctionPipelineStep(
            tei_to_jats, {MimeTypes.TEI_XML}, 'TEI to JATS'))
        return steps