Example #1
0
def test_extract_text():
    """Check text extraction on XML obtained for PMC ID 4322985.

    The XML comes from ``pmc_client.get_xml``; the text extracted from it
    must not be None, must contain a known phrase, and must pass the
    ``unicode_strs`` check.
    """
    article_xml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(article_xml)
    assert extracted is not None
    expected_phrase = 'RAS VS BRAF ONCOGENES AND TARGETED THERAPIES'
    assert expected_phrase in extracted
    assert unicode_strs(extracted)
Example #2
0
    def preprocess_nxml_file(self, filename, pmid, extra_annotations):
        """Preprocess an NXML file for use with the ISI reader.

        Preprocessing is done by extracting plain text from NXML and then
        creating a text file with one sentence per line.

        Parameters
        ----------
        filename : str
            Filename (more specifically the file path) of an nxml file to
            process
        pmid : str
            The PMID from which it comes, or None if not specified
        extra_annotations : dict
            Extra annotations to be added to each statement, possibly including
            metadata about the source (annotations with the key "interaction"
            will be overridden)
        """
        # Decode explicitly as UTF-8 (the XML default) instead of relying on
        # the platform's locale encoding, which varies between systems and can
        # fail or garble non-ASCII characters.
        with open(filename, 'r', encoding='utf-8') as fh:
            txt_content = extract_text(fh.read())

        # We need to remove some common LaTeX commands from the converted text
        # or the reader will get confused. The two-argument form is stripped
        # first so that the one-argument pattern does not leave a dangling
        # second brace group behind.
        cmd1 = r'[^ \{\}]+\{[^\{\}]+\}\{[^\{\}]+\}'
        cmd2 = r'[^ \{\}]+\{[^\{\}]+\}'
        txt_content = re.sub(cmd1, '', txt_content)
        txt_content = re.sub(cmd2, '', txt_content)

        # Preprocess text extracted from nxml
        self.preprocess_plain_text_string(txt_content, pmid, extra_annotations)
Example #3
0
    def prep_input(self, content_iter):
        """Copy each usable content item into the input directory.

        NXML content is first replaced by plain-text content built from its
        extracted text. Items for which ``self._check_content`` reports an
        issue are logged and skipped; every other item is copied into
        ``self.input_dir`` and counted in ``self.num_input``.

        Parameters
        ----------
        content_iter : iterable
            Items exposing ``is_format``, ``get_text``, ``get_id`` and
            ``copy_to`` (presumably Content objects).
        """
        logger.info('Prepping input.')
        for item in content_iter:
            # NXML is swapped for a 'txt' Content holding the extracted raw
            # text, identified by the stringified ID of the original item.
            if item.is_format('nxml'):
                raw_text = extract_text(item.get_text())
                item = Content.from_string(str(item.get_id()), 'txt', raw_text)

            problem = self._check_content(item.get_text())
            if problem is None:
                saved_path = item.copy_to(self.input_dir)
                self.num_input += 1
                logger.debug('%s saved for reading by Eidos.' % saved_path)
            else:
                logger.warning('Skipping %s due to: %s' %
                               (item.get_id(), problem))
Example #4
0
def test_extract_text2():
    """Check text extraction on a small inline document with nested <p> tags.

    The expected output has the outer paragraph's trailing text ('a') first,
    followed by the inner paragraph's text, each terminated by a newline.
    """
    nested_xml = '<article><body><p><p>some text</p>a</p></body></article>'
    extracted = pmc_client.extract_text(nested_xml)
    expected = 'a\nsome text\n'
    assert extracted == expected
    assert unicode_strs(extracted)
Example #5
0
def test_extract_text():
    """Check that text can be extracted from the XML for PMC ID 4322985.

    The extracted text must not be None and must pass the ``unicode_strs``
    check.
    """
    article_xml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(article_xml)
    assert extracted is not None
    assert unicode_strs(extracted)
Example #6
0
def test_extract_text():
    """Extract text from the XML of PMC ID 4322985 and sanity-check it.

    Passes when the extraction result is not None and satisfies
    ``unicode_strs``.
    """
    # Fetch and extract in a single expression; the XML itself is not reused.
    body_text = pmc_client.extract_text(pmc_client.get_xml('4322985'))
    assert body_text is not None
    assert unicode_strs(body_text)