def extract_string(self, static_xpath, **kwargs):
    """
    Extracts the first matching string requested from the given xpath

    :param static_xpath: XPATH to be searched
    :param kwargs: decode and translate
    :return:
    """
    decode = kwargs.get('decode', False)
    translate = kwargs.get('translate', False)

    text_content = self.parsed_xml.xpath(static_xpath)[0].text_content()
    text_content = TextCleaner(text=text_content).run(decode=decode,
                                                      translate=translate,
                                                      normalise=True)
    return text_content
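# Usage sketch for extract_string (illustrative only): assumes an extractor
# whose parsed_xml has been populated via open_xml()/parse_xml(), and a
# hypothetical XPath for the article title:
#
#   extractor = extraction.StandardExtractorXML(d)
#   extractor.open_xml()
#   extractor.parse_xml()
#   title = extractor.extract_string('//article-title', decode=True)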
def task_output_results(msg):
    """
    This worker will forward results to the outside exchange (typically an
    ADSMasterPipeline) to be incorporated into the storage

    :param msg: contains the bibliographic metadata

            {'bibcode': '....',
             'authors': [....],
             'title': '.....',
             .....
            }
    :return: no return
    """
    # Ensure we send unicode-normalised, trimmed text. Extractors already do
    # this, but some previously saved extractions were never cleaned.
    msg['body'] = TextCleaner(text=msg['body']).run(translate=False,
                                                    decode=True,
                                                    normalise=True,
                                                    trim=True)
    logger.debug('Will forward this record: %s', msg)
    rec = FulltextUpdate(**msg)
    logger.info("Calling app.forward_message...")
    if not app.conf['CELERY_ALWAYS_EAGER']:
        app.forward_message(rec)
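# A minimal sketch of the payload this worker expects, following the
# docstring above; the field values are hypothetical:
#
#   msg = {
#       'bibcode': '2000ApJ...000....0A',
#       'body': 'The cleaned full text of the article ...',
#       'authors': ['Author, A.'],
#       'title': 'An example title',
#   }
#   task_output_results(msg)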
def process_one_file(bibcode, fname, provider):
    ext = fname.split('.')[-1]
    d = {
        'bibcode': bibcode,
        'provider': provider,
        'file_format': ext,
        'ft_source': fname
    }
    extractor = extraction.StandardExtractorXML(d)
    extractor.open_xml()
    xml = extractor.parse_xml()
    sections = xml.xpath(sections_xpath) or xml.xpath(paragraphs_xpath)

    text = None
    if sections:
        # Use the last matching section as the summary candidate
        summary = unicode(sections[-1].text_content())
        if summary:
            text = TextCleaner(text=summary).run()
    return text
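# Example invocation (path and identifier are hypothetical); assumes
# sections_xpath and paragraphs_xpath are defined at module level:
#
#   text = process_one_file('2000ApJ...000....0A',
#                           '/some/dir/fulltext.xml',
#                           'Elsevier')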
def extract_multi_content(self, translate=False, decode=True):
    # Run the external PDF extraction script and capture its output;
    # communicate() reads stdout/stderr fully, avoiding pipe deadlock
    p = Popen([self.extract_pdf_script, self.ft_source],
              stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        raise Exception(stderr)
    fulltext = TextCleaner(text=stdout).run(translate=translate,
                                            decode=decode,
                                            normalise=True)
    return {
        'fulltext': fulltext,
    }
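# Usage sketch, assuming this method belongs to the PDF extractor class
# registered in EXTRACTOR_FACTORY (the class name and path below are
# hypothetical):
#
#   extractor = StandardExtractorPDF({'ft_source': '/some/dir/fulltext.pdf',
#                                     'bibcode': '2000ApJ...000....0A'})
#   content = extractor.extract_multi_content()
#   fulltext = content['fulltext']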
def extract_list(self, static_xpath, **kwargs):
    """
    Extracts the first matching string requested from the given xpath, but
    then returns the list of content. This function also extracts the href
    within the span rather than the list of strings. When a list of strings
    is required, that function can be added to the data factory.

    :param static_xpath: XPATH to be searched
    :param kwargs: "info" name of the content wanted from the span,
        "decode", and "translate"
    :return:
    """
    decode = kwargs.get('decode', False)
    translate = kwargs.get('translate', False)

    data_inner = []
    try:
        span_content = kwargs['info']
    except KeyError:
        logger.error('You did not supply the info kwarg,'
                     ' returning an empty list')
        return data_inner

    for span in self.parsed_xml.xpath(static_xpath):
        try:
            text_content = span.attrib.get(span_content)
            text_content = TextCleaner(text=text_content).run(
                decode=decode, translate=translate, normalise=True)
            data_inner.append(text_content)
        except KeyError:
            logger.debug(
                'Content of type {0} not found in this span'.format(
                    span_content))
        except Exception:
            logger.error('Unexpected error, skipping')

    return data_inner
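# Usage sketch for extract_list: pulling the 'href' attribute from each
# matching span (the XPath here is hypothetical):
#
#   hrefs = extractor.extract_list('//span[@class="table-link"]',
#                                  info='href', decode=True)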
def extract_multi_content(self, translate=True, decode=True):
    """
    Opens and extracts the content from the web server. Parses and returns
    the meta-data content, including the full text.

    :param translate: boolean, should it translate the text (see utils.py)
    :param decode: boolean, should it decode to UTF-8 (see utils.py)
    :return: dictionary of meta-data that includes the full text
    """
    self.open_http()
    self.parse_http(translate=translate, decode=decode)

    self.parsed_http = TextCleaner(text=self.parsed_http).run(
        translate=translate, decode=decode, normalise=True)

    meta_out = {}
    meta_out['fulltext'] = self.parsed_http
    return meta_out
def parse_text(self, translate=False, decode=False, normalise=True):
    """
    Cleans the text:
      1. Translate: removes escape characters if ASCII or unicode
      2. Decode: decodes Python string to unicode type, assuming UTF-8
      3. Normalise: converts combining characters, i.e., u" to u-umlaut

    :param translate: boolean, should it translate the text (see utils.py)
    :param decode: boolean, should it decode to UTF-8 (see utils.py)
    :param normalise: boolean, should it convert text, i.e., u" to
        u-umlaut (see utils.py)
    :return: the parsed/modified text
    """
    raw_text = self.raw_text
    raw_text = TextCleaner(text=raw_text).run(translate=translate,
                                              decode=decode,
                                              normalise=normalise)
    self.parsed_text = raw_text
    return self.parsed_text
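# Usage sketch: normalising combining characters, per the docstring above
# (the input string is hypothetical):
#
#   extractor.raw_text = u'u\u0308ber'  # 'u' + combining diaeresis
#   cleaned = extractor.parse_text(normalise=True)  # expected: u'\xfcber'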
def extract_multi_content(self, translate=False, decode=False):
    """
    Extracts the HTML content, and the content of all the tables mentioned
    in the HTML content, for the article given.

    :param translate: boolean, should it translate the text (see utils.py)
    :param decode: boolean, should it decode to UTF-8 (see utils.py)
    :return: dictionary of meta-data that includes the full text
    """
    self.open_html()
    self.parse_html()

    # Remove anything before the introduction
    removed_content = None
    for xpath in META_CONTENT[self.meta_name]['introduction']:
        try:
            tmp = self.parsed_html.xpath(xpath)
            if tmp:
                removed_content = tmp[0]  # TODO(rca): only first elem?
                break
        except Exception:
            logger.exception('Failed introduction xpath: %s', xpath)

    if removed_content is None:
        logger.debug(
            'Could not find intro for {0} (last xpath: {1})'.format(
                self.dict_item['bibcode'], xpath))
    else:
        first_position_index = removed_content.getparent().index(
            removed_content)
        for element_tree_node in \
                removed_content.getchildren()[:first_position_index]:
            element_tree_node.getparent().remove(element_tree_node)

    # Remove the references
    for xpath in META_CONTENT[self.meta_name]['references']:
        removed_content = None
        try:
            removed_content = self.parsed_html.xpath(xpath)[0]
            html_ul_element = removed_content.getnext()
            html_ul_element.getparent().remove(html_ul_element)
            removed_content.getparent().remove(removed_content)
            break
        except Exception:
            logger.debug('Could not find references for {0} (last xpath: '
                         '{1})'.format(self.dict_item['bibcode'], xpath))

    # Insert tables from external files
    self.collate_tables()
    for table_name, table_root_node in self.dictionary_of_tables.items():
        table_node_to_insert = None
        logger.debug(
            'Attempting to find table contents: {0}'.format(table_name))
        for xpath in META_CONTENT[self.meta_name]['table']:
            try:
                table_node_to_insert = \
                    table_root_node.xpath(xpath)[0].getparent()
                break
            except AttributeError:
                raise AttributeError('You used an incorrect method')
            except Exception:
                raise Exception(
                    'Could not find table content for {0} (last '
                    'xpath: {1})'.format(table_name, xpath))

        logger.debug(
            'Attempting to find table links: {0}'.format(table_name))
        for xpath in META_CONTENT[self.meta_name]['table_links']:
            try:
                logger.debug(self.parsed_html)
                table_nodes_in_file_source = self.parsed_html.xpath(
                    xpath.replace('TABLE_NAME', table_name))
                break
            except AttributeError:
                raise AttributeError('You used an incorrect method',
                                     traceback.format_exc(), table_name,
                                     self.parsed_html)
            except Exception:
                raise Exception('Could not find table links for'
                                ' {0} (last xpath: {1})'.format(
                                    table_name,
                                    xpath.replace('TABLE_NAME',
                                                  table_name)))

        logger.debug(
            'Attempting to replace table at table links: {0}'.format(
                table_name))
        if table_nodes_in_file_source:
            # Replace the first link with the table content, then drop any
            # remaining links to the same table
            parent_node_of_table_link = \
                table_nodes_in_file_source[0].getparent()
            parent_node_of_table_link.replace(
                table_nodes_in_file_source[0], table_node_to_insert)
            for remaining_node in table_nodes_in_file_source[1:]:
                remaining_node.getparent().remove(remaining_node)

    try:
        for xpath in META_CONTENT[self.meta_name]['head']:
            try:
                self.parsed_html.xpath(xpath)
                break
            except Exception:
                continue
    except Exception:
        pass

    string_of_all_html = " ".join([
        individual_element_tree_node for individual_element_tree_node
        in self.parsed_html.itertext()
        if individual_element_tree_node
        and not individual_element_tree_node.isspace()
    ])

    string_of_all_html = TextCleaner(text=string_of_all_html).run(
        translate=translate, decode=decode, normalise=True)

    meta_out = {'fulltext': string_of_all_html}
    return meta_out
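# A self-contained lxml sketch of the replace-node pattern used above to
# swap a table link with the table content itself (toy markup, not the
# real article HTML):
#
#   from lxml import html
#   page = html.fromstring('<div><a id="tbl1">Table 1</a></div>')
#   table = html.fromstring('<table><tr><td>1.23</td></tr></table>')
#   link = page.xpath('//a[@id="tbl1"]')[0]
#   link.getparent().replace(link, table)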
logger.exception("Grobid request exception") else: if response.status_code == 200: logger.debug( "Successful response from grobid server (%d bytes)", len(response.content)) logger.debug("Successful response from grobid server: %s", response.content) grobid_xml = response.text else: logger.error("Grobid service response error (code %s): %s", response.status_code, response.text) else: logger.debug("Grobid service not defined") grobid_xml = TextCleaner(text=grobid_xml).run(translate=translate, decode=decode, normalise=True) return { 'fulltext': grobid_xml, } # Dictionary containing the relevant extensions for the relevant class EXTRACTOR_FACTORY = { "xml": StandardExtractorXML, "html": StandardExtractorHTML, "txt": StandardExtractorBasicText, "ocr": StandardExtractorBasicText, "elsevier": StandardElsevierExtractorXML,