def test_extract_text():
    """Check text extraction on the XML obtained for PMC ID 4322985.

    The extracted text must not be None, must contain a known phrase, and
    must pass the ``unicode_strs`` check.
    """
    article_xml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(article_xml)
    expected_phrase = 'RAS VS BRAF ONCOGENES AND TARGETED THERAPIES'
    assert extracted is not None
    assert expected_phrase in extracted
    assert unicode_strs(extracted)
def preprocess_nxml_file(self, filename, pmid, extra_annotations):
    """Preprocess an NXML file for use with the ISI reader.

    Preprocessing is done by extracting plain text from NXML and then
    creating a text file with one sentence per line.

    Parameters
    ----------
    filename : str
        Filename (more specifically the file path) of an nxml file to
        process
    pmid : str
        The PMID from which it comes, or None if not specified
    extra_annotations : dict
        Extra annotations to be added to each statement, possibly including
        metadata about the source (annotations with the key "interaction"
        will be overridden)
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default, which can fail or mis-decode non-ASCII
    # characters on systems whose default encoding is not UTF-8.
    with open(filename, 'r', encoding='utf-8') as fh:
        txt_content = extract_text(fh.read())

    # We need to remove some common LaTeX commands from the converted text
    # or the reader will get confused. The two-argument form is removed
    # first so that the one-argument pattern does not consume only part
    # of it.
    cmd1 = r'[^ \{\}]+\{[^\{\}]+\}\{[^\{\}]+\}'
    cmd2 = r'[^ \{\}]+\{[^\{\}]+\}'
    txt_content = re.sub(cmd1, '', txt_content)
    txt_content = re.sub(cmd2, '', txt_content)

    # Preprocess the text extracted from the NXML
    self.preprocess_plain_text_string(txt_content, pmid, extra_annotations)
def prep_input(self, content_iter):
    """Copy each acceptable content item into the reader's input directory.

    Items in NXML format are first converted to plain-text content. Every
    item is then passed through ``self._check_content``; items with a
    reported issue are skipped with a warning, the others are copied to
    ``self.input_dir`` and counted in ``self.num_input``.

    Parameters
    ----------
    content_iter : iterable
        Content objects to prepare for reading.
    """
    logger.info('Prepping input.')
    for item in content_iter:
        # NXML content is replaced by a new text content object holding
        # the raw text extracted from it.
        if item.is_format('nxml'):
            raw_text = extract_text(item.get_text())
            item = Content.from_string(str(item.get_id()), 'txt', raw_text)

        problem = self._check_content(item.get_text())
        if problem is None:
            saved_path = item.copy_to(self.input_dir)
            self.num_input += 1
            logger.debug('%s saved for reading by Eidos.' % saved_path)
        else:
            logger.warning('Skipping %s due to: %s'
                           % (item.get_id(), problem))
def test_extract_text2():
    """Check the exact text extracted from a paragraph nested in another."""
    nested_xml = '<article><body><p><p>some text</p>a</p></body></article>'
    extracted = pmc_client.extract_text(nested_xml)
    expected = 'a\nsome text\n'
    assert extracted == expected
    assert unicode_strs(extracted)
def test_extract_text():
    """Check text extraction on the XML obtained for PMC ID 4322985.

    The extracted text must not be None and must pass the ``unicode_strs``
    check.
    """
    article_xml = pmc_client.get_xml('4322985')
    extracted = pmc_client.extract_text(article_xml)
    assert extracted is not None
    assert unicode_strs(extracted)