def extract(self, data, dep_results):
    url = '{0}/processFulltextDocument'.format(config.GROBID_HOST)
    files = {'input': data}
    vars = {}

    try:
        resp = requests.post(url, files=files, data=vars)
    except requests.exceptions.RequestException as ex:
        raise RunnableError('Request to Grobid server failed')

    if resp.status_code != 200:
        raise RunnableError(
            'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
            .format(resp.status_code, resp.text))

    # use resp.text (str) rather than resp.content (bytes) so the
    # str-pattern regex below can be applied to it
    xml_text = resp.text

    # remove namespace info from xml string
    # this is hacky but makes parsing it much much nicer down the road
    remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
    xml_text = remove_xmlns.sub('', xml_text)

    xml = safeET.fromstring(xml_text)  # grobid returns a TEI xml file

    return ExtractorResult(xml_result=xml)
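# A quick illustration of the namespace-stripping regex used above (the
# sample TEI snippet is illustrative, not actual Grobid output):
#
#     import re
#     sample = '<TEI xmlns="http://www.tei-c.org/ns/1.0"><teiHeader/></TEI>'
#     remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
#     print(remove_xmlns.sub('', sample))
#
# prints <TEI><teiHeader/></TEI>, so element paths like './teiHeader'
# work without namespace prefixes.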
def extract(self, data, dependency_results):
    results_dir = tempfile.mkdtemp() + '/'
    temp_pdf_file = extraction.utils.temp_file(data)

    try:
        command_args = [config.PDFFIGURES_PATH, '-o', results_dir, '-j',
                        results_dir, temp_pdf_file]
        status, stdout, stderr = extraction.utils.external_process(
            command_args, timeout=20)
    except subprocess.TimeoutExpired:
        shutil.rmtree(results_dir)
        raise RunnableError('PDFFigures timed out while processing document')
    finally:
        os.remove(temp_pdf_file)

    if status != 0:
        raise RunnableError('PDFFigures Failure. Possible error:\n' + stderr)

    files = {}

    # Handle png results
    for path in glob.glob(results_dir + '*.png'):
        # basename looks something like this: -Figure-X.png
        # remove the hyphen and replace with a '.', because framework
        # will add filename prefix later
        filename = '.' + os.path.basename(path)[1:]
        with open(path, 'rb') as f:
            files[filename] = f.read()

    # Handle json results
    for path in glob.glob(results_dir + '*.json'):
        filename = '.' + os.path.basename(path)[1:]
        with open(path, 'r') as f:
            files[filename] = f.read()

    shutil.rmtree(results_dir)

    return ExtractorResult(xml_result=None, files=files)
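# extraction.utils.temp_file and extraction.utils.external_process are
# used throughout these extractors but are not shown here. A minimal
# sketch of what they might look like, assuming temp_file returns the
# path of a file holding the given bytes and external_process returns
# (exit status, stdout, stderr) with the streams decoded to text:

import subprocess
import tempfile

def temp_file(data, suffix=''):
    # write data to a named temporary file and hand back its path;
    # the caller is responsible for os.remove()-ing it later
    fd, path = tempfile.mkstemp(suffix=suffix)
    with open(fd, 'wb') as f:
        f.write(data)
    return path

def external_process(args, input_data=None, timeout=None):
    # run the command, enforcing the timeout; subprocess.run raises
    # subprocess.TimeoutExpired on timeout, matching the callers above
    proc = subprocess.run(args, input=input_data, capture_output=True,
                          timeout=timeout)
    return (proc.returncode, proc.stdout.decode('utf-8'),
            proc.stderr.decode('utf-8'))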
def extract(self, data, dependency_results):
    results_dir = tempfile.mkdtemp() + '/'
    temp_pdf_file = extraction.utils.temp_file(data)

    try:
        command_args = [
            'java', '-jar', config.ALGORITHMS_JAR_PATH,
            config.ALGORITHMS_PERL_PATH, 'f', temp_pdf_file, results_dir
        ]
        status, stdout, stderr = extraction.utils.external_process(
            command_args, timeout=20)
    except subprocess.TimeoutExpired:
        shutil.rmtree(results_dir)
        raise RunnableError(
            'Algorithms Jar timed out while processing document')
    finally:
        os.remove(temp_pdf_file)

    if status != 0:
        raise RunnableError('Algorithms Jar Failure. Possible error:\n' +
                            stderr)

    paths = glob.glob(results_dir + '*.xml')
    if len(paths) != 1:
        raise RunnableError(
            'Wrong number of results files from Algorithms Jar.')

    tree = safeET.parse(paths[0])
    xml_root = tree.getroot()
    shutil.rmtree(results_dir)

    return ExtractorResult(xml_result=xml_root)
def extract(self, data, dep_results):
    # Write the pdf data to a temporary location so PDFBox can process it
    file_path = extraction.utils.temp_file(data, suffix='.pdf')

    try:
        command_args = [
            'java', '-jar', config.PDF_BOX_JAR, 'ExtractText', '-console',
            '-encoding', 'UTF-8', file_path
        ]
        status, stdout, stderr = extraction.utils.external_process(
            command_args, timeout=30)
    except subprocess.TimeoutExpired:
        raise RunnableError('PDFBox timed out while processing document')
    finally:
        os.remove(file_path)

    if status != 0:
        raise RunnableError(
            'PDFBox returned error status code {0}.\nPossible error:\n{1}'
            .format(status, stderr))

    # We can use the result from PDFBox directly, no manipulation needed
    pdf_plain_text = stdout
    files = {'.txt': pdf_plain_text}

    return ExtractorResult(xml_result=None, files=files)
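# For reference, the invocation above is equivalent to running PDFBox's
# command line app by hand:
#
#     java -jar pdfbox-app.jar ExtractText -console -encoding UTF-8 in.pdf
#
# where -console sends the extracted text to stdout (captured as `stdout`
# above) instead of writing a .txt file next to the input.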
def _call_grobid_method(data, method):
    url = '{0}/api/{1}'.format(config.GROBID_HOST, method)

    # Write the pdf data to a temporary location so Grobid can process it
    path = extraction.utils.temp_file(data, suffix='.pdf')

    try:
        # open the file in a with-block so the handle is always closed
        with open(path, 'rb') as pdf_file:
            files = {'input': (path, pdf_file)}
            resp = requests.post(url, files=files)
    except requests.exceptions.RequestException as ex:
        raise RunnableError('Request to Grobid server failed')
    finally:
        os.remove(path)

    if resp.status_code != 200:
        raise RunnableError(
            'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
            .format(resp.status_code, resp.text))

    # remove the default namespace from the xml string
    # this is hacky but makes parsing it much much easier down the road
    # (resp.text is used instead of resp.content so re.sub gets a str)
    xmlstring = re.sub('xmlns="[^"]+"', '', resp.text, count=1)
    xml = safeET.fromstring(xmlstring)

    return xml
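# Example of how _call_grobid_method might be used (assumes a Grobid
# server reachable at config.GROBID_HOST and a local 'paper.pdf';
# processFulltextDocument is the same service the full-text extractor
# above calls):
if __name__ == '__main__':
    with open('paper.pdf', 'rb') as f:
        pdf_data = f.read()
    tei = _call_grobid_method(pdf_data, 'processFulltextDocument')
    # the default namespace was stripped, so plain element paths work:
    print(tei.find('./teiHeader//titleStmt/title').text)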
def filter(self, data, dependency_results):
    # make a temporary directory for the filter jar to read/write to
    temp_dir = tempfile.mkdtemp() + '/'
    id = 'file'

    # Write the pdf file and the extracted pdf text to a temporary
    # location for the filter jar to read
    pdf_text = dependency_results[interfaces.PlainTextExtractor].files['.txt']
    with open('{0}{1}.txt'.format(temp_dir, id), 'w') as pdf_text_file:
        pdf_text_file.write(pdf_text)
    # pdf data is binary, so open the file in 'wb' mode
    with open('{0}{1}.pdf'.format(temp_dir, id), 'wb') as pdf_file:
        pdf_file.write(data)

    shutil.copy(config.FILTER_ACL_PATH, temp_dir + 'acl')
    shutil.copy(config.FILTER_TRAIN_DATA_PATH,
                temp_dir + 'train_str_f43_paper.arff')

    try:
        status, stdout, stderr = extraction.utils.external_process(
            ['java', '-jar', config.FILTER_JAR_PATH, temp_dir, id, 'paper'],
            timeout=20)
    except subprocess.TimeoutExpired as te:
        raise RunnableError('Filter Jar timed out while processing document')
    finally:
        shutil.rmtree(temp_dir)

    if status != 0:
        raise RunnableError(
            'Filter Jar failed to execute successfully. Possible error:\n' +
            stderr)

    # the last line of output should be 'true' or 'false', indicating
    # whether the pdf is an academic paper or not;
    # get rid of possible trailing blank lines first
    lines = [line.strip() for line in stdout.split('\n') if line.strip()]
    result = lines[-1]

    if result.lower() == 'true':
        return True
    elif result.lower() == 'false':
        return False
    else:
        raise RunnableError(
            'Last line of output from Jar should be either "true" or '
            '"false". Instead was: ' + result)
def _call_grobid_method(data, method):
    url = '{0}/{1}'.format(config.GROBID_HOST, method)
    files = {'input': data}
    vars = {}

    try:
        resp = requests.post(url, files=files, data=vars)
    except requests.exceptions.RequestException as ex:
        raise RunnableError('Request to Grobid server failed')

    if resp.status_code != 200:
        raise RunnableError(
            'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
            .format(resp.status_code, resp.text))

    # remove all namespace info from the xml string
    # this is hacky but makes parsing it much much easier down the road
    # (resp.text is used instead of resp.content so the str-pattern
    # regex can be applied)
    remove_xmlns = re.compile(r'\sxmlns[^"]+"[^"]+"')
    xml_text = remove_xmlns.sub('', resp.text)
    xml = safeET.fromstring(xml_text)

    return xml
def extract(self, data, dependency_results):
    # Get the plain text file of the PDF and write it to a temporary location
    pdf_text = dependency_results[interfaces.PlainTextExtractor].files['.txt']
    text_file_path = extraction.utils.temp_file(pdf_text)

    # Run ParsCit on the text file to extract citations
    try:
        status, stdout, stderr = extraction.utils.external_process(
            ['perl', config.PARSCIT_PATH, text_file_path], timeout=20)
    except subprocess.TimeoutExpired as te:
        raise RunnableError('ParsCit timed out while processing document')
    finally:
        os.remove(text_file_path)

    if status != 0:
        raise RunnableError('ParsCit Failure. Possible error:\n' + stderr)

    # ParsCit will give us a string representing an xml doc;
    # convert from string type into an xml object
    xml = safeET.fromstring(stdout)

    return ExtractorResult(xml_result=xml)
def extract(self, data, dep_results):
    file_path = utils.temp_file(data, suffix='.pdf')

    try:
        status, stdout, stderr = utils.external_process(
            ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText', '-console',
             '-encoding', 'UTF-8', file_path],
            timeout=30)
    except subprocess.TimeoutExpired as te:
        raise RunnableError('PDFBox timed out while processing document')
    finally:
        os.remove(file_path)

    if status != 0:
        raise RunnableError(
            'PDFBox returned error status code {0}.\nPossible error:\n{1}'
            .format(status, stderr))

    plain_text = stdout

    # create an xml result that just points towards the file holding
    # the plain text results
    root = ET.Element('file')
    root.text = 'plain_text.txt'

    files = {'plain_text.txt': plain_text}
    return ExtractorResult(xml_result=root, files=files)
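# Note: the xml_result built above serializes to
#
#     <file>plain_text.txt</file>
#
# It is only a pointer; consumers are expected to look up the actual
# text in ExtractorResult.files under the same 'plain_text.txt' key,
# unlike the earlier PDFBox extractor, which stores it under '.txt'.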
def extract(self, data, deps):
    try:
        (status, stdout, stderr) = utils.external_process(
            ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
    except subprocess.TimeoutExpired:
        raise RunnableError('awk timed out')

    lines = [line for line in stdout.split("\n") if line]

    root = ET.Element('extraction')
    for line in lines:
        ele = ET.SubElement(root, 'line')
        ele.text = line

    return ExtractorResult(xml_result=root)
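# For reference, the awk program '/^[0-9]/ {print;}' keeps only the
# lines that begin with a digit. A pure-Python equivalent of that
# filter, on illustrative input:
#
#     import re
#     sample = 'Abstract\n1 Introduction\n2.1 Methods\nReferences'
#     kept = [l for l in sample.split('\n') if re.match(r'[0-9]', l)]
#     # kept == ['1 Introduction', '2.1 Methods']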
def extract(self, data, dependency_results):
    xml_root = dependency_results[interfaces.FullTextTEIExtractor].xml_result

    body_node = xml_root.find('./text/body')
    if body_node is None:
        # raise (not return) so the framework records the failure
        raise RunnableError('Could not find body text in TEI xml file')

    xml_string = ET.tostring(body_node).decode('utf-8')
    plain_text = utils.xml_to_plain_text(xml_string)
    plain_text = plain_text.encode('utf-8')

    files = {'.txt': plain_text}
    return ExtractorResult(xml_result=None, files=files)
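# utils.xml_to_plain_text is not shown in this section. Assuming it
# strips markup and keeps the text content, a minimal sketch might be:

import xml.etree.ElementTree as ET

def xml_to_plain_text(xml_string):
    # concatenate all text nodes, collapsing runs of whitespace
    root = ET.fromstring(xml_string)
    return ' '.join(''.join(root.itertext()).split())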
def extract(self, data, dep_results):
    tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
    result_root = ET.Element('algorithm', {
        'name': 'Grobid Header Extraction',
        'version': '0.1'
    })

    # Retrieve title from TEI doc
    title = tei_root.find('./teiHeader//titleStmt/title')
    if title is not None:
        ET.SubElement(result_root, 'title').text = title.text
    else:
        raise RunnableError('No title found in TEI document')

    # Find document-level affiliations
    affiliations = tei_root.findall(
        './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
    if affiliations:
        affiliation_str = " | ".join(map(_get_affiliation_str, affiliations))
        ET.SubElement(result_root, 'affiliation').text = affiliation_str

    # Retrieve author names from TEI doc
    authors = tei_root.findall('./teiHeader//biblStruct//author')
    authors_node = ET.SubElement(result_root, 'authors')
    if authors is not None and len(authors):
        for author in authors:
            author_node = ET.SubElement(authors_node, 'author')

            # Find and output name-related info
            name_tags = []
            name_tags.extend(author.findall('./persName/forename'))
            name_tags.extend(author.findall('./persName/surname'))
            name_parts = [name.text for name in name_tags if name is not None]
            name = ' '.join(name_parts)
            ET.SubElement(author_node, 'name').text = name

            # Find and output affiliation-related info
            affiliations = author.findall('./affiliation')
            if affiliations:
                # Use a pipe to delimit separate affiliations
                affiliation_str = " | ".join(
                    map(_get_affiliation_str, affiliations))
                ET.SubElement(author_node,
                              'affiliation').text = affiliation_str
    else:
        self.log('No authors found')

    # Retrieve keywords from TEI doc
    keywords = tei_root.findall('./teiHeader//keywords//item/term')
    keywords_node = ET.SubElement(result_root, 'keywords')
    if keywords is not None and len(keywords):
        for term in keywords:
            ET.SubElement(keywords_node, 'keyword').text = term.text
    else:
        self.log('No keywords found')

    # Try to find an abstract
    divs = tei_root.findall('./text//div')
    abstracts = [div for div in divs if div.get('type') == 'abstract']
    if abstracts:
        abstract = abstracts[0]
        # decode to str so the str-pattern regex below can be applied
        xml_string = ET.tostring(abstract).decode('utf-8')
        remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                    re.DOTALL | re.UNICODE)
        xml_string = remove_heading.sub('', xml_string)
        abstract_string = utils.xml_to_plain_text(xml_string)
        ET.SubElement(result_root, 'abstract').text = abstract_string
    else:
        self.log('No abstract found')

    # CSX style xml document of header information
    return ExtractorResult(xml_result=result_root)
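# _get_affiliation_str is referenced above but not shown. Assuming an
# <affiliation> node contains <orgName> children, as in Grobid's TEI
# output, a plausible sketch:

def _get_affiliation_str(affiliation_node):
    # join the organization names inside one affiliation element
    org_names = affiliation_node.findall('./orgName')
    return ', '.join(org.text for org in org_names if org.text)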
def extract(self, data, dep_results):
    raise RunnableError('This Extractor Should Be Extended')
def extract(self, data, dep_results):
    # raise (not return) the error so callers actually see the failure
    raise RunnableError('This extractor should never run!')
def extract(self, data, dep_results):
    raise RunnableError('I always Error!')