Example #1
0
    def extract(self, data, dep_results):
        """Send the PDF bytes to a Grobid server and return the parsed TEI.

        Args:
            data: raw PDF bytes to upload as the 'input' file.
            dep_results: results of dependency runnables (unused here).

        Returns:
            ExtractorResult whose xml_result is the parsed TEI tree.

        Raises:
            RunnableError: if the request fails or Grobid returns non-200.
        """
        url = '{0}/processFulltextDocument'.format(config.GROBID_HOST)
        files = {'input': data}
        form_data = {}  # renamed from `vars` to avoid shadowing the builtin

        try:
            resp = requests.post(url, files=files, data=form_data)
        except requests.exceptions.RequestException as ex:
            # chain the original exception so the root cause stays visible
            raise RunnableError('Request to Grobid server failed') from ex

        if resp.status_code != 200:
            raise RunnableError(
                'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
                .format(resp.status_code, resp.text))

        # resp.content is bytes; decode so the str regex below works on
        # Python 3 (a str pattern applied to bytes raises TypeError)
        xml_text = resp.content.decode('utf-8')
        # remove namespace info from xml string
        # this is hacky but makes parsing it much much nicer down the road
        remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
        xml_text = remove_xmlns.sub('', xml_text)

        xml = safeET.fromstring(xml_text)

        # grobid returns TEI xml file
        return ExtractorResult(xml_result=xml)
Example #2
0
   def extract(self, data, dependency_results):
      """Run pdffigures over the PDF bytes and collect its png/json output.

      Args:
         data: raw PDF bytes to process.
         dependency_results: results of dependency runnables (unused here).

      Returns:
         ExtractorResult with no xml_result and a files dict mapping
         '.'-prefixed result names to file contents.

      Raises:
         RunnableError: if pdffigures times out or exits non-zero.
      """
      results_dir = tempfile.mkdtemp() + '/'
      temp_pdf_file = extraction.utils.temp_file(data)

      try:
         command_args = [config.PDFFIGURES_PATH, '-o', results_dir, '-j', results_dir, temp_pdf_file]
         status, stdout, stderr = extraction.utils.external_process(command_args, timeout=20)
      except subprocess.TimeoutExpired:
         shutil.rmtree(results_dir)
         raise RunnableError('PDFFigures timed out while processing document')
      finally:
         os.remove(temp_pdf_file)

      # From here on, guarantee the temp results dir is removed on every
      # exit path (the original leaked it when status != 0)
      try:
         if status != 0:
            raise RunnableError('PDFFigures Failure. Possible error:\n' + stderr)

         files = {}

         # Handle png results
         for path in glob.glob(results_dir + '*.png'):
            # basename looks something like this: -Figure-X.png
            # remove the hyphen and replace with a '.', because framework will add filename prefix later
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'rb') as f:
               files[filename] = f.read()

         # Handle json results
         for path in glob.glob(results_dir + '*.json'):
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'r') as f:
               files[filename] = f.read()
      finally:
         shutil.rmtree(results_dir)

      return ExtractorResult(xml_result=None, files=files)
Example #3
0
    def extract(self, data, dependency_results):
        """Run the algorithms-extraction jar and return its single xml result.

        Args:
            data: raw PDF bytes to process.
            dependency_results: results of dependency runnables (unused here).

        Returns:
            ExtractorResult whose xml_result is the root of the jar's output.

        Raises:
            RunnableError: on timeout, non-zero exit, or an unexpected
                number of result files.
        """
        results_dir = tempfile.mkdtemp() + '/'
        temp_pdf_file = extraction.utils.temp_file(data)

        try:
            command_args = [
                'java', '-jar', config.ALGORITHMS_JAR_PATH,
                config.ALGORITHMS_PERL_PATH, 'f', temp_pdf_file, results_dir
            ]
            status, stdout, stderr = extraction.utils.external_process(
                command_args, timeout=20)
        except subprocess.TimeoutExpired:
            shutil.rmtree(results_dir)
            raise RunnableError(
                'Algorithms Jar timed out while processing document')
        finally:
            os.remove(temp_pdf_file)

        # Guarantee the temp results dir is removed on every exit path
        # (the original leaked it on the two error raises below)
        try:
            if status != 0:
                raise RunnableError('Algorithms Jar Failure. Possible error:\n' +
                                    stderr)

            paths = glob.glob(results_dir + '*.xml')
            if len(paths) != 1:
                raise RunnableError(
                    'Wrong number of results files from Algorithms Jar.')

            tree = safeET.parse(paths[0])
            xml_root = tree.getroot()
        finally:
            shutil.rmtree(results_dir)

        return ExtractorResult(xml_result=xml_root)
Example #4
0
    def extract(self, data, dep_results):
        """Extract plain text from a PDF with Apache PDFBox.

        Stages the PDF bytes on disk, runs PDFBox's ExtractText tool, and
        returns its stdout as the '.txt' result file.
        """
        # PDFBox works on files, so persist the PDF bytes first
        pdf_path = extraction.utils.temp_file(data, suffix='.pdf')

        try:
            cmd = [
                'java', '-jar', config.PDF_BOX_JAR, 'ExtractText', '-console',
                '-encoding', 'UTF-8', pdf_path
            ]
            status, stdout, stderr = extraction.utils.external_process(
                cmd, timeout=30)
        except subprocess.TimeoutExpired:
            raise RunnableError('PDFBox timed out while processing document')
        finally:
            # always drop the staged PDF, even on timeout
            os.remove(pdf_path)

        if status != 0:
            raise RunnableError(
                'PDFBox returned error status code {0}.\nPossible error:\n{1}'.
                format(status, stderr))

        # PDFBox's stdout is already the plain text we want — no massaging
        return ExtractorResult(xml_result=None, files={'.txt': stdout})
Example #5
0
def _call_grobid_method(data, method):
    """POST the PDF bytes to a Grobid API method and return the parsed xml.

    Args:
        data: raw PDF bytes to send as the 'input' upload.
        method: Grobid API method name, appended to the host url.

    Returns:
        Parsed ElementTree root of Grobid's response, with the first
        xmlns declaration stripped for easier querying.

    Raises:
        RunnableError: if the request fails or Grobid returns non-200.
    """
    url = '{0}/api/{1}'.format(config.GROBID_HOST, method)

    # Write the pdf data to a temporary location so Grobid can process it
    path = extraction.utils.temp_file(data, suffix='.pdf')

    try:
        # open the upload inside a `with` so the handle is always closed
        # (the original left it open for the interpreter to collect)
        with open(path, 'rb') as pdf_handle:
            files = {
                'input': (path, pdf_handle),
            }
            resp = requests.post(url, files=files)
    except requests.exceptions.RequestException as ex:
        # chain the original exception so the root cause stays visible
        raise RunnableError('Request to Grobid server failed') from ex
    finally:
        os.remove(path)

    if resp.status_code != 200:
        raise RunnableError(
            'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'.
            format(resp.status_code, resp.text))

    # remove the namespace declaration from the xml string — hacky, but it
    # makes parsing much easier down the road.  resp.content is bytes, so
    # decode first: a str pattern applied to bytes raises TypeError on Py3.
    xmlstring = re.sub('xmlns="[^"]+"', '', resp.content.decode('utf-8'), count=1)
    xml = safeET.fromstring(xmlstring)

    return xml
Example #6
0
    def filter(self, data, dependency_results):
        """Run the filter jar to decide if the document is an academic paper.

        Args:
            data: raw PDF bytes of the document.
            dependency_results: must contain the PlainTextExtractor result
                with a '.txt' file.

        Returns:
            True if the jar classifies the document as a paper, else False.

        Raises:
            RunnableError: on timeout, non-zero exit, or unparsable output.
        """
        # make a temporary directory for filter jar to read/write to
        temp_dir = tempfile.mkdtemp() + '/'

        doc_id = 'file'  # renamed from `id` to avoid shadowing the builtin
        # Write pdf file and extracted pdf text to a temporary location for filter jar to read
        pdf_text = dependency_results[
            interfaces.PlainTextExtractor].files['.txt']
        # NOTE(review): assumes pdf_text is str; if the upstream extractor
        # stores utf-8 bytes this needs mode 'wb' — confirm against it
        with open('{0}{1}.txt'.format(temp_dir, doc_id), 'w') as pdf_text_file:
            pdf_text_file.write(pdf_text)
        # the pdf payload is binary, so write it in 'wb' mode (text mode
        # raises TypeError for bytes on Python 3)
        with open('{0}{1}.pdf'.format(temp_dir, doc_id), 'wb') as pdf_file:
            pdf_file.write(data)
        shutil.copy(config.FILTER_ACL_PATH, temp_dir + 'acl')
        shutil.copy(config.FILTER_TRAIN_DATA_PATH,
                    temp_dir + 'train_str_f43_paper.arff')

        try:
            status, stdout, stderr = extraction.utils.external_process(
                [
                    'java', '-jar', config.FILTER_JAR_PATH, temp_dir, doc_id,
                    'paper'
                ],
                timeout=20)
        except subprocess.TimeoutExpired:
            raise RunnableError(
                'Filter Jar timed out while processing document')
        finally:
            # always remove the scratch directory, success or failure
            shutil.rmtree(temp_dir)

        if status != 0:
            raise RunnableError(
                'Filter Jar failed to execute sucessfully. Possible error:\n' +
                stderr)

        # last line of output should be 'true' or 'false' indicating if pdf is an academic paper or not

        # get rid of possible trailing blank lines
        lines = [line.strip() for line in stdout.split('\n') if line.strip()]
        result = lines[-1]
        if result.lower() == 'true':
            return True
        elif result.lower() == 'false':
            return False
        else:
            raise RunnableError(
                'Last line of output from Jar should be either "true" or "false". Instead was: '
                + result)
Example #7
0
def _call_grobid_method(data, method):
    """POST document bytes to a Grobid method and return the parsed xml.

    Args:
        data: raw document bytes to send as the 'input' upload.
        method: Grobid method name, appended to the host url.

    Returns:
        Parsed ElementTree root of Grobid's response with all xmlns
        declarations stripped.

    Raises:
        RunnableError: if the request fails or Grobid returns non-200.
    """
    url = '{0}/{1}'.format(config.GROBID_HOST, method)
    files = {'input': data}
    form_data = {}  # renamed from `vars` to avoid shadowing the builtin

    try:
        resp = requests.post(url, files=files, data=form_data)
    except requests.exceptions.RequestException as ex:
        # chain the original exception so the root cause stays visible
        raise RunnableError('Request to Grobid server failed') from ex

    if resp.status_code != 200:
        raise RunnableError('Grobid returned status {0} instead of 200\nPossible Error:\n{1}'.format(resp.status_code, resp.text))

    # remove all namespace info from xml string — hacky, but it makes
    # parsing much easier down the road.  resp.content is bytes, so decode
    # first: a str pattern applied to bytes raises TypeError on Python 3.
    remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
    xml_text = remove_xmlns.sub('', resp.content.decode('utf-8'))

    xml = safeET.fromstring(xml_text)

    return xml
Example #8
0
    def extract(self, data, dependency_results):
        """Extract citations by running ParsCit over the document's text.

        Stages the upstream plain-text result in a temp file, runs the
        ParsCit perl script on it, and parses its stdout as xml.
        """
        # ParsCit consumes a text file, so stage the dependency's output
        pdf_text = dependency_results[
            interfaces.PlainTextExtractor].files['.txt']
        text_file_path = extraction.utils.temp_file(pdf_text)

        # Run parscit on the text file to extract citations
        try:
            status, stdout, stderr = extraction.utils.external_process(
                ['perl', config.PARSCIT_PATH, text_file_path], timeout=20)
        except subprocess.TimeoutExpired:
            raise RunnableError('ParsCit timed out while processing document')
        finally:
            # drop the staged text file whether or not ParsCit succeeded
            os.remove(text_file_path)

        if status != 0:
            raise RunnableError('ParsCit Failure. Possible error:\n' + stderr)

        # ParsCit emits an xml document on stdout — parse it into a tree
        return ExtractorResult(xml_result=safeET.fromstring(stdout))
Example #9
0
   def extract(self, data, dep_results):
      """Extract plain text with PDFBox and return it plus a pointer xml.

      Args:
         data: raw PDF bytes to process.
         dep_results: results of dependency runnables (unused here).

      Returns:
         ExtractorResult whose xml_result is a <file> element naming the
         text file, and whose files dict holds the extracted text.

      Raises:
         RunnableError: if PDFBox times out or exits non-zero.
      """
      file_path = utils.temp_file(data, suffix='.pdf')

      try:
         status, stdout, stderr = utils.external_process(
               ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText', '-console',
                '-encoding', 'UTF-8', file_path],
               timeout=30)
      except subprocess.TimeoutExpired:
         raise RunnableError('PDFBox timed out while processing document')
      finally:
         os.remove(file_path)

      if status != 0:
         raise RunnableError('PDFBox returned error status code {0}.\nPossible error:\n{1}'.format(status, stderr))

      # create xml result file that just points towards the file with plain text results
      root = ET.Element('file')
      root.text = 'plain_text.txt'

      # PDFBox's stdout is the plain text itself (dropped the unused
      # `plain_text` alias the original assigned and never used)
      files = {'plain_text.txt': stdout}

      return ExtractorResult(xml_result=root, files=files)
Example #10
0
    def extract(self, data, deps):
        """Keep only input lines beginning with a digit, via awk.

        Pipes `data` through an awk filter and wraps each surviving
        non-empty line in a <line> element under an <extraction> root.
        """
        try:
            status, stdout, stderr = utils.external_process(
                ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
        except subprocess.TimeoutExpired:
            raise RunnableError('awk timed out')

        root = ET.Element('extraction')
        # one <line> element per non-empty line of awk output
        for text in stdout.split("\n"):
            if not text:
                continue
            ET.SubElement(root, 'line').text = text

        return ExtractorResult(xml_result=root)
Example #11
0
    def extract(self, data, dependency_results):
        """Convert the TEI body from the full-text extractor to plain text.

        Args:
            data: raw document bytes (unused; input comes from dependencies).
            dependency_results: must contain FullTextTEIExtractor's xml_result.

        Returns:
            ExtractorResult with a '.txt' file of utf-8 encoded plain text.

        Raises:
            RunnableError: if the TEI document has no body node.
        """
        xml_root = dependency_results[
            interfaces.FullTextTEIExtractor].xml_result
        body_node = xml_root.find('./text/body')

        if body_node is None:
            # raise, not return: the original returned the exception
            # instance, which callers would treat as a successful result
            raise RunnableError('Could not find body text in TEI xml file')

        xml_string = ET.tostring(body_node).decode('utf-8')

        plain_text = utils.xml_to_plain_text(xml_string)

        # store as utf-8 bytes for the result-file store
        plain_text = plain_text.encode('utf-8')
        files = {'.txt': plain_text}

        return ExtractorResult(xml_result=None, files=files)
Example #12
0
    def extract(self, data, dep_results):
        """Build a CSX-style header xml document from the Grobid TEI header.

        Pulls title, affiliations, authors, keywords, and abstract out of
        the TEI tree produced by the HeaderTEIExtractor dependency.

        Args:
            data: raw document bytes (unused; input comes from dependencies).
            dep_results: must contain HeaderTEIExtractor's xml_result.

        Returns:
            ExtractorResult whose xml_result is an <algorithm> element.

        Raises:
            RunnableError: if the TEI document contains no title.
        """
        tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
        result_root = ET.Element('algorithm', {
            'name': 'Grobid Header Extraction',
            'version': '0.1'
        })

        # Retrieve title from TEI doc
        title = tei_root.find('./teiHeader//titleStmt/title')
        if title is not None:
            ET.SubElement(result_root, 'title').text = title.text
        else:
            raise RunnableError('No title found in TEI document')

        # Find document-level affiliations
        affiliations = tei_root.findall(
            './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
        if affiliations:
            # pipe-delimit separate affiliations
            affiliation_str = " | ".join(
                map(_get_affiliation_str, affiliations))
            ET.SubElement(result_root, 'affiliation').text = affiliation_str

        # Retrieve author names from TEI doc
        authors = tei_root.findall('./teiHeader//biblStruct//author')
        authors_node = ET.SubElement(result_root, 'authors')
        if authors is not None and len(authors):
            for author in authors:
                author_node = ET.SubElement(authors_node, 'author')

                # Find and output name-related info: forenames then surnames
                name_tags = []
                name_tags.extend(author.findall("./persName/forename"))
                name_tags.extend(author.findall('./persName/surname'))

                name_parts = [
                    name.text for name in name_tags if name is not None
                ]
                name = ' '.join(name_parts)
                ET.SubElement(author_node, 'name').text = name

                # Find and output affiliation-related info
                affiliations = author.findall('./affiliation')
                if affiliations:
                    # Use a pipe to delimit separate affiliations
                    affiliation_str = " | ".join(
                        map(_get_affiliation_str, affiliations))
                    ET.SubElement(author_node,
                                  'affiliation').text = affiliation_str

        else:
            self.log('No authors found')

        # Retrieve keywords from TEI doc
        keywords = tei_root.findall('./teiHeader//keywords//item/term')
        keywords_node = ET.SubElement(result_root, 'keywords')
        if keywords is not None and len(keywords):
            for term in keywords:
                ET.SubElement(keywords_node, 'keyword').text = term.text
        else:
            self.log('No keywords found')

        # Try and find an abstract among the <div> elements
        divs = tei_root.findall('./text//div')
        abstracts = [div for div in divs if div.get('type') == 'abstract']
        if abstracts:
            abstract = abstracts[0]
            # ET.tostring returns bytes on Python 3; decode so the str
            # regex below can be applied (matches the sibling extractor
            # that decodes its tostring output)
            xml_string = ET.tostring(abstract).decode('utf-8')
            # strip the <head> heading element before flattening to text
            remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                        re.DOTALL | re.UNICODE)
            xml_string = remove_heading.sub('', xml_string)
            abstract_string = utils.xml_to_plain_text(xml_string)

            ET.SubElement(result_root, 'abstract').text = abstract_string
        else:
            self.log('No abstract found')

        # CSX style xml document of header information
        return ExtractorResult(xml_result=result_root)
Example #13
0
 def extract(self, data, dep_results):
     """Abstract hook: subclasses must override this with a real extractor."""
     raise RunnableError('This Extractor Should Be Extended')
Example #14
0
 def extract(self, data, dep_results):
     """Guard extractor: fail loudly, since this should never be invoked."""
     # raise instead of return — the original returned the exception
     # instance, which the framework would treat as a successful result
     raise RunnableError('This extractor should never run!')
Example #15
0
 def extract(self, data, dep_results):
     """Test extractor that unconditionally fails with a RunnableError."""
     raise RunnableError('I always Error!')