def test_paragraphs2(self):
    """Check that unclosed <p> tags still yield one Paragraph per block of text."""
    document = HtmlReader().parse('<p>First para<p>Second Para')
    # Two <p> openers without closing tags are expected to give two elements.
    self.assertEqual(len(document.elements), 2)
    self.assertEqual(document.elements[0].text, 'First para')
    self.assertEqual(document.elements[1].text, 'Second Para')
    for element in document.elements:
        self.assertIsInstance(element, Paragraph)
def test_linebreak2(self):
    """Check that a <br/> between two spans produces two separate Paragraphs."""
    document = HtmlReader().parse('<span>First line</span><br/><span>Second line</span>')
    # The text on each side of the <br/> is expected to land in its own element.
    self.assertEqual(len(document.elements), 2)
    self.assertEqual(document.elements[0].text, 'First line')
    self.assertEqual(document.elements[1].text, 'Second line')
    for element in document.elements:
        self.assertIsInstance(element, Paragraph)
def read_html_paper(paper_path):
    """Open an HTML paper and parse it into a chemdataextractor Document.

    Args:
        paper_path: Path of the HTML file; it is opened in binary mode.

    Returns:
        The object returned by ``Document.from_file`` when called with an
        ``HtmlReader`` as the only reader.
    """
    # The context manager closes the handle even if parsing raises; the
    # previous version left the file open.
    # NOTE(review): assumes Document.from_file consumes the whole stream
    # before returning (no lazy reads after the file is closed) -- confirm.
    with open(paper_path, 'rb') as f:
        return Document.from_file(f, readers=[HtmlReader()])
def _paragraph_sentences(doc, indices):
    """Collect the items yielded by every Paragraph element of doc at the given indices."""
    sentences = []
    for i in indices:
        element = doc.elements[i]
        # Non-paragraph elements are skipped.
        if isinstance(element, chemdataextractor.doc.text.Paragraph):
            # Iterating a Paragraph presumably yields its sentences -- verify
            # against the chemdataextractor API.
            sentences.extend(element)
    return sentences


def extract_sentences(paper_path, para_yes):
    """Split a paper's paragraph sentences into synthesis / non-synthesis lists.

    Args:
        paper_path: Path of the HTML paper; it is opened in binary mode.
        para_yes: Document element numbers of the paragraphs manually
            identified as containing synthesis information.

    Returns:
        A tuple ``(sen_yes_arr, sen_no_arr)``: the sentences taken from the
        Paragraph elements listed in ``para_yes``, and the sentences taken
        from every other Paragraph element of the document.
    """
    # Close the file handle deterministically; the previous version leaked it.
    # NOTE(review): assumes Document.from_file reads the stream fully before
    # returning -- confirm.
    with open(paper_path, 'rb') as f:
        doc = Document.from_file(f, readers=[HtmlReader()])

    # Every element index that was not flagged as a synthesis paragraph.
    elem_all = np.arange(0, len(doc))
    para_no = np.delete(elem_all, para_yes)

    sen_no_arr = _paragraph_sentences(doc, para_no)
    sen_yes_arr = _paragraph_sentences(doc, para_yes)
    return sen_yes_arr, sen_no_arr