def test_format_html_tokens(self):
    """Check ``format_html_tokens`` against a small hand-written fixture.

    The fixture mixes word tokens with ``<p>``, ``<a>`` and ``<h1>`` tag
    tokens. The expected output keeps every word in order, contains no
    tag tokens, and has newline tokens around the paragraph/header
    boundaries.
    """
    html_tokens = [
        "<p>", "this", "is", "a", "test", "</p>",
        "<a>", "link", "</a>",
        "text",
        "<h1>", "header", "</h1>",
    ]
    formatted = [
        "this", "is", "a", "test", "\n",
        "\n", "link",
        "text",
        "\n", "header", "\n",
    ]

    self.assertListEqual(format_html_tokens(html_tokens), formatted)
def extract_article(self, document):
    """Extract the article text from the raw page contents.

    The page is parsed with ``html.document_fromstring`` and passed
    through ``clean_html``, then split into tokens. Every token is scored
    with ``self.scoring.score`` and the tokens plus their scores are
    handed to ``extract_maximum_subsequence`` to select the article part.
    The selected tokens are formatted, tidied up and joined by
    ``create_text``.

    :param document: the page contents accepted by
        ``html.document_fromstring``.
    :return: whatever ``create_text`` produces for the selected tokens.
    """
    cleaned_tree = clean_html(html.document_fromstring(document))
    page_tokens = tokenize_html(cleaned_tree)
    token_scores = [self.scoring.score(token) for token in page_tokens]

    article_tokens = extract_maximum_subsequence(page_tokens, token_scores)
    formatted_tokens = format_html_tokens(article_tokens)

    # Drop the single space that directly follows a newline inside a token.
    tidied_tokens = [
        re.sub(r"\n ", "\n", token, flags=re.UNICODE)
        for token in formatted_tokens
    ]
    return create_text(tidied_tokens)