Esempio n. 1
0
    def extract(self, data, dependency_results):
        """Run the Algorithms JAR over the PDF bytes in *data*.

        Writes *data* to a temp file, invokes the JAR, and parses the single
        XML results file it produces.

        Returns:
            ExtractorResult with the parsed XML root.

        Raises:
            RunnableError: on timeout, non-zero exit status, or an unexpected
                number of result files.
        """
        results_dir = tempfile.mkdtemp() + '/'
        temp_pdf_file = extraction.utils.temp_file(data)

        try:
            try:
                command_args = [
                    'java', '-jar', config.ALGORITHMS_JAR_PATH,
                    config.ALGORITHMS_PERL_PATH, 'f', temp_pdf_file, results_dir
                ]
                status, stdout, stderr = extraction.utils.external_process(
                    command_args, timeout=20)
            except subprocess.TimeoutExpired:
                raise RunnableError(
                    'Algorithms Jar timed out while processing document')
            finally:
                # The temp PDF is only needed by the subprocess.
                os.remove(temp_pdf_file)

            if status != 0:
                raise RunnableError('Algorithms Jar Failure. Possible error:\n' +
                                    stderr)

            paths = glob.glob(results_dir + '*.xml')
            if len(paths) != 1:
                raise RunnableError(
                    'Wrong number of results files from Algorithms Jar.')

            tree = safeET.parse(paths[0])
            xml_root = tree.getroot()
        finally:
            # BUG FIX: previously results_dir was only removed on timeout and
            # on success, leaking the directory on every other error path.
            shutil.rmtree(results_dir)

        return ExtractorResult(xml_result=xml_root)
Esempio n. 2
0
   def extract(self, data, dependency_results):
      """Run PDFFigures over the PDF bytes in *data*.

      Collects the .png figure images and .json metadata files PDFFigures
      writes into a temp results directory.

      Returns:
         ExtractorResult with no xml and a dict of result files keyed by a
         '.'-prefixed basename (framework adds a filename prefix later).

      Raises:
         RunnableError: on timeout or non-zero exit status.
      """
      results_dir = tempfile.mkdtemp() + '/'
      temp_pdf_file = extraction.utils.temp_file(data)

      try:
         try:
            command_args = [config.PDFFIGURES_PATH, '-o', results_dir, '-j', results_dir, temp_pdf_file]
            status, stdout, stderr = extraction.utils.external_process(command_args, timeout=20)
         except subprocess.TimeoutExpired:
            raise RunnableError('PDFFigures timed out while processing document')
         finally:
            os.remove(temp_pdf_file)

         if status != 0:
            raise RunnableError('PDFFigures Failure. Possible error:\n' + stderr)

         # Handle png results
         files = {}
         for path in glob.glob(results_dir + '*.png'):
            # basename looks something like this: -Figure-X.png
            # remove the hyphen and replace with a '.', because framework will add filename prefix later
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'rb') as f:
               files[filename] = f.read()

         # Handle json results
         for path in glob.glob(results_dir + '*.json'):
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'r') as f:
               files[filename] = f.read()
      finally:
         # BUG FIX: previously results_dir leaked when status != 0 or when
         # reading the result files raised; now always cleaned up.
         shutil.rmtree(results_dir)

      return ExtractorResult(xml_result=None, files=files)
Esempio n. 3
0
    def extract(self, data, dep_results):
        """Extract plain text from PDF bytes via the PDFBox command-line tool.

        Returns:
            ExtractorResult with no xml and a single '.txt' file containing
            PDFBox's stdout.

        Raises:
            RunnableError: on timeout or non-zero exit status.
        """
        # PDFBox reads from disk, so persist the bytes to a temp file first.
        pdf_path = extraction.utils.temp_file(data, suffix='.pdf')

        try:
            status, stdout, stderr = extraction.utils.external_process(
                ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText', '-console',
                 '-encoding', 'UTF-8', pdf_path],
                timeout=30)
        except subprocess.TimeoutExpired:
            raise RunnableError('PDFBox timed out while processing document')
        finally:
            os.remove(pdf_path)

        if status != 0:
            raise RunnableError(
                'PDFBox returned error status code {0}.\nPossible error:\n{1}'.
                format(status, stderr))

        # PDFBox's stdout is already the plain text; no manipulation needed.
        return ExtractorResult(xml_result=None, files={'.txt': stdout})
Esempio n. 4
0
    def extract(self, data, dep_results):
        """POST the PDF to Grobid's full-text endpoint and parse the TEI xml.

        Returns:
            ExtractorResult with the parsed TEI xml root.

        Raises:
            RunnableError: if the request fails or Grobid responds non-200.
        """
        url = '{0}/processFulltextDocument'.format(config.GROBID_HOST)
        files = {'input': data}
        form_fields = {}

        try:
            resp = requests.post(url, files=files, data=form_fields)
        except requests.exceptions.RequestException as ex:
            # Chain the cause so the original network error is preserved.
            raise RunnableError('Request to Grobid server failed') from ex

        if resp.status_code != 200:
            raise RunnableError(
                'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
                .format(resp.status_code, resp.text))

        # BUG FIX: resp.content is bytes under Python 3, and re.sub with a str
        # pattern on bytes raises TypeError; use the decoded resp.text instead.
        xml_text = resp.text
        # remove namespace info from xml string
        # this is hacky but makes parsing it much much nicer down the road
        remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
        xml_text = remove_xmlns.sub('', xml_text)

        xml = safeET.fromstring(xml_text)

        # grobid returns TEI xml file
        return ExtractorResult(xml_result=xml)
Esempio n. 5
0
    def extract(self, data, deps):
        """Find e-mail addresses in *data* and emit one <email> element each."""
        email_pattern = r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b'
        matches = re.findall(email_pattern, data, re.IGNORECASE | re.UNICODE)

        root = ET.Element('extraction')
        for address in matches:
            node = ET.SubElement(root, 'email')
            node.text = address

        return ExtractorResult(xml_result=root)
Esempio n. 6
0
    def extract(self, data, deps):
        """Keep only lines of *data* starting with a digit, via awk.

        Raises:
            RunnableError: if awk exceeds the 5 second timeout.
        """
        try:
            status, stdout, stderr = utils.external_process(
                ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
        except subprocess.TimeoutExpired:
            raise RunnableError('awk timed out')

        # One <line> element per non-empty output line.
        root = ET.Element('extraction')
        for text_line in stdout.split("\n"):
            if not text_line:
                continue
            node = ET.SubElement(root, 'line')
            node.text = text_line

        return ExtractorResult(xml_result=root)
Esempio n. 7
0
    def extract(self, data, dependency_results):
        """Pull the body text out of the dependency's TEI xml as a UTF-8 '.txt' file.

        Raises:
            RunnableError: if the TEI document has no ./text/body node.
        """
        xml_root = dependency_results[
            interfaces.FullTextTEIExtractor].xml_result
        body_node = xml_root.find('./text/body')

        if body_node is None:
            # BUG FIX: the original *returned* the RunnableError instead of
            # raising it, handing callers an exception object as a result.
            raise RunnableError('Could not find body text in TEI xml file')

        xml_string = ET.tostring(body_node).decode('utf-8')

        plain_text = utils.xml_to_plain_text(xml_string)

        plain_text = plain_text.encode('utf-8')
        files = {'.txt': plain_text}

        return ExtractorResult(xml_result=None, files=files)
Esempio n. 8
0
    def extract(self, data, dependency_results):
        """Run ParsCit over the dependency's plain-text output to extract citations.

        Returns:
            ExtractorResult with ParsCit's xml output parsed into a tree.

        Raises:
            RunnableError: on timeout or non-zero exit status.
        """
        # ParsCit needs the text on disk, so write the dependency result to a temp file.
        plain_text = dependency_results[
            interfaces.PlainTextExtractor].files['.txt']
        text_file_path = extraction.utils.temp_file(plain_text)

        try:
            command = ['perl', config.PARSCIT_PATH, text_file_path]
            status, stdout, stderr = extraction.utils.external_process(
                command, timeout=20)
        except subprocess.TimeoutExpired:
            raise RunnableError('ParsCit timed out while processing document')
        finally:
            os.remove(text_file_path)

        if status != 0:
            raise RunnableError('ParsCit Failure. Possible error:\n' + stderr)

        # stdout holds an xml document as a string; parse it into a tree.
        return ExtractorResult(xml_result=safeET.fromstring(stdout))
Esempio n. 9
0
   def extract(self, data, dep_results):
      """Extract plain text from PDF bytes via PDFBox, plus an xml file pointer.

      Returns:
         ExtractorResult whose xml is a <file> element naming the result file,
         and whose files dict maps 'plain_text.txt' to the extracted text.

      Raises:
         RunnableError: on timeout or non-zero exit status.
      """
      file_path = utils.temp_file(data, suffix='.pdf')

      try:
         command_args = ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText',
                         '-console', '-encoding', 'UTF-8', file_path]
         status, stdout, stderr = utils.external_process(command_args, timeout=30)
      except subprocess.TimeoutExpired:
         raise RunnableError('PDFBox timed out while processing document')
      finally:
         os.remove(file_path)

      if status != 0:
         raise RunnableError('PDFBox returned error status code {0}.\nPossible error:\n{1}'.format(status, stderr))

      plain_text = stdout

      # create xml result file that just points towards the file with plain text results
      root = ET.Element('file')
      root.text = 'plain_text.txt'

      # Use the named plain_text (was assigned but unused in the original).
      files = {'plain_text.txt': plain_text}

      return ExtractorResult(xml_result=root, files=files)
Esempio n. 10
0
    def extract(self, data, dep_results):
        """Convert the Grobid header TEI xml into a CSX-style <algorithm> document.

        Pulls title, affiliations, authors, keywords, and abstract out of the
        dependency's TEI tree.

        Raises:
            RunnableError: if the TEI document contains no title.
        """
        tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
        result_root = ET.Element('algorithm', {
            'name': 'Grobid Header Extraction',
            'version': '0.1'
        })

        # Retrieve title from TEI doc
        title = tei_root.find('./teiHeader//titleStmt/title')
        if title is not None:
            ET.SubElement(result_root, 'title').text = title.text
        else:
            raise RunnableError('No title found in TEI document')

        # Find document-level affiliations
        affiliations = tei_root.findall(
            './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
        if affiliations:
            affiliation_str = " | ".join(
                map(_get_affiliation_str, affiliations))
            ET.SubElement(result_root, 'affiliation').text = affiliation_str

        # Retrieve author names from TEI doc
        authors = tei_root.findall('./teiHeader//biblStruct//author')
        authors_node = ET.SubElement(result_root, 'authors')
        if authors:
            for author in authors:
                author_node = ET.SubElement(authors_node, 'author')

                # Find and output name-related info
                name_tags = []
                name_tags.extend(author.findall("./persName/forename"))
                name_tags.extend(author.findall('./persName/surname'))

                # BUG FIX: filter on the element's .text, not the element --
                # findall never yields None, but an empty tag has text None,
                # which would make ' '.join raise TypeError.
                name_parts = [
                    name.text for name in name_tags if name.text
                ]
                name = ' '.join(name_parts)
                ET.SubElement(author_node, 'name').text = name

                # Find and output affiliation-related info
                author_affiliations = author.findall('./affiliation')
                if author_affiliations:
                    # Use a pipe to delimit separate affiliations
                    affiliation_str = " | ".join(
                        map(_get_affiliation_str, author_affiliations))
                    ET.SubElement(author_node,
                                  'affiliation').text = affiliation_str

        else:
            self.log('No authors found')

        # Retrieve keywords from TEI doc
        keywords = tei_root.findall('./teiHeader//keywords//item/term')
        keywords_node = ET.SubElement(result_root, 'keywords')
        if keywords:
            for term in keywords:
                ET.SubElement(keywords_node, 'keyword').text = term.text
        else:
            self.log('No keywords found')

        # Try and find an abstract
        divs = tei_root.findall('./text//div')
        abstracts = [div for div in divs if div.get('type') == 'abstract']
        if abstracts:
            abstract = abstracts[0]
            # BUG FIX: ET.tostring returns bytes under Python 3; decode before
            # applying the str regex (matches how the body-text extractor does it).
            xml_string = ET.tostring(abstract).decode('utf-8')
            remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                        re.DOTALL | re.UNICODE)
            xml_string = remove_heading.sub('', xml_string)
            abstract_string = utils.xml_to_plain_text(xml_string)

            ET.SubElement(result_root, 'abstract').text = abstract_string
        else:
            self.log('No abstract found')

        # CSX style xml document of header information
        return ExtractorResult(xml_result=result_root)
Esempio n. 11
0
 def extract(self, data, dep_results):
     """Extract references by delegating to Grobid's processReferences service."""
     return ExtractorResult(
         xml_result=_call_grobid_method(data, 'processReferences'))
Esempio n. 12
0
 def extract(self, data, dep_results):
     """Extract header metadata by delegating to Grobid's processHeaderDocument service."""
     return ExtractorResult(
         xml_result=_call_grobid_method(data, 'processHeaderDocument'))
Esempio n. 13
0
 def extract(self, data, dep_results):
     """Stub extractor: emits a fixed <file> pointer plus a dummy text file."""
     pointer = ET.Element('file')
     pointer.text = 'test.txt'
     return ExtractorResult(pointer, files={'test.txt': 'test test'})
Esempio n. 14
0
 def extract(self, data, dep_results):
     """Echo extractor: wraps the raw input in a <result> element."""
     node = ET.Element('result')
     node.text = data
     return ExtractorResult(node)
Esempio n. 15
0
 def extract(self, data, dep_results):
     # No-op extractor: produces an empty result (no xml, no files).
     return ExtractorResult(xml_result=None)