Ejemplo n.º 1
0
    def extract_string(self, static_xpath, **kwargs):
        """
        Extracts the first matching string requested from the given xpath.

        :param static_xpath: XPATH to be searched
        :param kwargs: optional "decode" and "translate" flags; both default
        to False and are handed to TextCleaner.run
        :return: the cleaned text content of the first node the xpath matches
        :raises IndexError: if the xpath matches nothing
        """

        decode = kwargs.get('decode', False)
        translate = kwargs.get('translate', False)

        # Only the first match is used. Indexing an empty result raises
        # IndexError, which is left to propagate to the caller.
        first_match = self.parsed_xml.xpath(static_xpath)[0]

        return TextCleaner(text=first_match.text_content()).run(
            decode=decode, translate=translate, normalise=True)
Ejemplo n.º 2
0
def task_output_results(msg):
    """
    Cleans the body of a record and forwards the record to the outside
    exchange (typically an ADSMasterPipeline) so that it can be
    incorporated into the storage.

    The 'body' entry of ``msg`` is replaced in place by its cleaned version.

    :param msg: contains the bibliographic metadata

            {'bibcode': '....',
             'authors': [....],
             'title': '.....',
             .....
            }
    :return: no return
    """

    # Extractors already emit unicode-normalised, trimmed text, but some
    # extractions saved to file were never cleaned, so clean again here.
    cleaner = TextCleaner(text=msg['body'])
    msg['body'] = cleaner.run(translate=False,
                              decode=True,
                              normalise=True,
                              trim=True)

    logger.debug('Will forward this record: %s', msg)
    rec = FulltextUpdate(**msg)
    logger.info("Calling app.forward_message...")

    # In eager mode the record is built and logged but never forwarded.
    if app.conf['CELERY_ALWAYS_EAGER']:
        return
    app.forward_message(rec)
Ejemplo n.º 3
0
def process_one_file(bibcode, fname, provider):
    """
    Parses one XML full-text file and returns the cleaned text of its last
    section (or of its last paragraph when no section matches).

    :param bibcode: bibcode stored in the extractor input dictionary
    :param fname: path of the file to parse; the text after its last '.' is
    used as the file format
    :param provider: provider stored in the extractor input dictionary
    :return: the cleaned text of the last matched node, or an empty unicode
    string when nothing matches or the matched node has no text
    """
    ext = fname.split('.')[-1]
    d = {
        'bibcode': bibcode,
        'provider': provider,
        'file_format': ext,
        'ft_source': fname
    }
    extractor = extraction.StandardExtractorXML(d)
    extractor.open_xml()
    xml = extractor.parse_xml()

    # Fall back to paragraphs when the sections xpath matches nothing.
    sections = xml.xpath(sections_xpath) or xml.xpath(paragraphs_xpath)
    if not sections:
        # Nothing matched at all: indexing sections[-1] would raise
        # IndexError, so report "no text" instead.
        return u''

    summary = sections[-1].text_content()
    sys.stderr.write("summary is of type {}\n".format(type(summary)))

    summary = unicode(summary)
    if not summary:
        # Always return text, never an un-run TextCleaner instance.
        return u''

    return TextCleaner(text=summary).run()
Ejemplo n.º 4
0
 def extract_multi_content(self, translate=False, decode=True):
     """
     Runs the external PDF extraction script on the source file and cleans
     what the script writes to its standard output.

     :param translate: boolean, passed on to TextCleaner.run
     :param decode: boolean, passed on to TextCleaner.run
     :return: dictionary holding the cleaned text under the 'fulltext' key
     :raises Exception: carrying the script's standard error output when
     the script exits with a non-zero return code
     """
     command = [self.extract_pdf_script, self.ft_source]
     process = Popen(command, stdout=PIPE, stderr=PIPE)
     captured_out, captured_err = process.communicate()

     if process.returncode != 0:
         raise Exception(captured_err)

     cleaner = TextCleaner(text=captured_out)
     cleaned = cleaner.run(translate=translate,
                           decode=decode,
                           normalise=True)
     return {'fulltext': cleaned}
Ejemplo n.º 5
0
    def extract_list(self, static_xpath, **kwargs):
        """
        Collects one attribute value from every node matched by the given
        xpath and returns the cleaned values as a list. The value is read
        from the node's attributes (for instance the href within a span)
        rather than from its text. When a list of strings is required, then
        that function can be added to the data factory.

        :param static_xpath: XPATH to be searched
        :param kwargs: "info", the name of the attribute wanted from each
        matched node (required), plus the optional "decode" and "translate"
        flags, which default to False and are handed to TextCleaner.run
        :return: list of cleaned attribute values; empty when "info" is not
        supplied or when nothing could be extracted
        """

        decode = kwargs.get('decode', False)
        translate = kwargs.get('translate', False)

        data_inner = []
        try:
            span_content = kwargs['info']
        except KeyError:
            logger.error('You did not supply the info kwarg,'
                         ' returning an empty list')
            return data_inner

        for span in self.parsed_xml.xpath(static_xpath):
            try:
                # Index the attributes directly so that a missing attribute
                # raises KeyError and is skipped below; .get() would hand
                # None on to the cleaner instead.
                attribute_value = span.attrib[span_content]
                cleaned_value = TextCleaner(text=attribute_value).run(
                    decode=decode, translate=translate, normalise=True)
            except KeyError:
                logger.debug(
                    'Content of type {0} not found in this span'.format(
                        span_content))
            except Exception:
                # Best effort: one bad node must not abort the whole list,
                # but keep the traceback so the failure can be diagnosed.
                logger.exception('Unexpected error, skipping')
            else:
                data_inner.append(cleaned_value)

        return data_inner
Ejemplo n.º 6
0
    def extract_multi_content(self, translate=True, decode=True):
        """
        Opens the HTTP source, parses it, cleans the parsed content and
        returns it as the full text. The cleaned text is also stored back on
        ``self.parsed_http``.

        :param translate: boolean, should it translate the text (see utils.py)
        :param decode: boolean, should it decode to UTF-8 (see utils.py)
        :return: dictionary holding the cleaned text under the 'fulltext' key
        """
        self.open_http()
        self.parse_http(translate=translate, decode=decode)

        cleaner = TextCleaner(text=self.parsed_http)
        self.parsed_http = cleaner.run(translate=translate,
                                       decode=decode,
                                       normalise=True)

        return {'fulltext': self.parsed_http}
Ejemplo n.º 7
0
    def parse_text(self, translate=False, decode=False, normalise=True):
        """
        Cleans the raw text and stores the result in ``self.parsed_text``:
          1. Translates: removes escape characters if ASCII or unicode
          2. Decode: decodes Python string to unicode type, assuming UTF-8
          3. Normalise: convert u" to u-umlaut

        :param translate: boolean, should it translate the text (see utils.py)
        :param decode: boolean, should it decode to UTF-8 (see utils.py)
        :param normalise: boolean, should it convert text, i.e., u" to u-umlaut
        (see utils.py)
        :return: the parsed/modified text
        """

        # Forward the caller's normalise flag; it used to be hard-coded to
        # True, so normalise=False was silently ignored.
        self.parsed_text = TextCleaner(text=self.raw_text).run(
            translate=translate, decode=decode, normalise=normalise)

        return self.parsed_text
Ejemplo n.º 8
0
    def extract_multi_content(self, translate=False, decode=False):
        """
        Extracts the HTML content, and the content of all the tables mentioned
        in the HTML content, for the article given.

        Steps, in order:
          1. open and parse the HTML source;
          2. prune nodes around the first introduction match;
          3. remove the first references match and the node following it;
          4. replace table links with the table content collated from the
             external table files;
          5. join all non-blank text nodes and clean the result.

        :param translate: boolean, should it translate the text (see utils.py)
        :param decode: boolean, should it decode to UTF-8 (see utils.py)
        :return: dictionary of meta-data that includes the full text
        :raises AttributeError: if a table or table-link xpath lookup hits an
        object without the expected method
        :raises Exception: if a table or table-link xpath lookup fails for
        any other reason
        """

        self.open_html()
        self.parse_html()

        removed_content = None
        # Pre-bind so the debug message below cannot hit an unbound name when
        # no introduction xpath is configured.
        xpath = None

        # Remove anything before introduction
        for xpath in META_CONTENT[self.meta_name]['introduction']:
            try:
                tmp = self.parsed_html.xpath(xpath)
                if tmp:
                    removed_content = tmp[0]  # TODO(rca): only first elem?
                    break

            except Exception:
                # Best effort: report the failure and try the next xpath.
                logger.error(traceback.format_exc())

        if removed_content is None:
            logger.debug(
                'Could not find intro for {0} (last xpath: {1})'.format(
                    self.dict_item['bibcode'], xpath))
        else:
            first_position_index = removed_content.getparent().index(
                removed_content)

            # NOTE(review): the index is the intro's position within its
            # parent, yet it slices the intro's own children - confirm that
            # this is the intended set of nodes to drop.
            for element_tree_node in \
                    removed_content.getchildren()[:first_position_index]:

                element_tree_node.getparent().remove(element_tree_node)

        # Remove the references
        for xpath in META_CONTENT[self.meta_name]['references']:
            removed_content = None
            try:
                removed_content = self.parsed_html.xpath(xpath)[0]
                html_ul_element = removed_content.getnext()
                html_ul_element.getparent().remove(html_ul_element)
                removed_content.getparent().remove(removed_content)
                break

            except Exception:
                logger.debug('Could not find references for {0} (last xpath: '
                             '{1})'.format(self.dict_item['bibcode'], xpath))

        # Insert tables from external files
        self.collate_tables()
        for table_name, table_root_node in self.dictionary_of_tables.items():

            table_node_to_insert = None
            # Reset per table so that an empty 'table_links' configuration
            # means "nothing to replace" instead of an unbound name.
            table_nodes_in_file_source = []
            logger.debug(
                'Attempting to find table contents: {0}'.format(table_name))

            for xpath in META_CONTENT[self.meta_name]['table']:

                try:
                    table_node_to_insert = \
                        table_root_node.xpath(xpath)[0].getparent()
                    break

                except AttributeError:
                    raise AttributeError('You used an incorrect method')

                except Exception:
                    raise Exception(
                        'Could not find table content for {0} (last '
                        'xpath: {1})'.format(table_name, xpath))

            logger.debug(
                'Attempting to find table links: {0}'.format(table_name))

            for xpath in META_CONTENT[self.meta_name]['table_links']:
                try:
                    logger.debug(self.parsed_html)
                    table_nodes_in_file_source = self.parsed_html.xpath(
                        xpath.replace('TABLE_NAME', table_name))
                    break

                except AttributeError:
                    raise AttributeError('You used an incorrect method',
                                         traceback.format_exc(), table_name,
                                         self.parsed_html)

                except Exception:
                    raise Exception('Could not find table links for'
                                    ' {0} (last xpath: {1})'.format(
                                        table_name,
                                        xpath.replace('TABLE_NAME',
                                                      table_name)))

            logger.debug(
                'Attempting to replace table at table links: {0}'.format(
                    table_name))

            if table_nodes_in_file_source:
                # The first link is swapped for the table content; every
                # other link to the same table is dropped.
                parent_node_of_table_link = \
                    table_nodes_in_file_source[0].getparent()

                parent_node_of_table_link.replace(
                    table_nodes_in_file_source[0], table_node_to_insert)

                for remaining_node in table_nodes_in_file_source[1:]:
                    remaining_node.getparent().remove(remaining_node)

        # NOTE(review): the result of the 'head' lookup is discarded and all
        # errors are swallowed, so this step has no visible effect here; it is
        # kept as is in case the lookup is relied upon elsewhere.
        try:
            for xpath in META_CONTENT[self.meta_name]['head']:
                try:
                    self.parsed_html.xpath(xpath)
                    break

                except Exception:
                    continue

        except Exception:
            pass

        # Keep only text nodes that hold something other than whitespace.
        string_of_all_html = " ".join([
            individual_element_tree_node
            for individual_element_tree_node in self.parsed_html.itertext()
            if individual_element_tree_node
            and not individual_element_tree_node.isspace()
        ])

        string_of_all_html = TextCleaner(text=string_of_all_html).run(
            translate=translate, decode=decode, normalise=True)

        meta_out = {'fulltext': string_of_all_html}

        return meta_out
Ejemplo n.º 9
0
                logger.exception("Grobid request exception")
            else:
                if response.status_code == 200:
                    logger.debug(
                        "Successful response from grobid server (%d bytes)",
                        len(response.content))
                    logger.debug("Successful response from grobid server: %s",
                                 response.content)
                    grobid_xml = response.text
                else:
                    logger.error("Grobid service response error (code %s): %s",
                                 response.status_code, response.text)
        else:
            logger.debug("Grobid service not defined")
        grobid_xml = TextCleaner(text=grobid_xml).run(translate=translate,
                                                      decode=decode,
                                                      normalise=True)

        return {
            'fulltext': grobid_xml,
        }


# Maps each supported file format key to the extractor class that handles it

EXTRACTOR_FACTORY = {
    "xml": StandardExtractorXML,
    "html": StandardExtractorHTML,
    "txt": StandardExtractorBasicText,
    "ocr": StandardExtractorBasicText,
    "elsevier": StandardElsevierExtractorXML,