def test_create_text(self):
    """Check create_text against a fixed list of word and newline tokens."""
    # NOTE(review): the expected literal is reproduced exactly as it appears
    # in this view of the file; if the triple-quoted string originally
    # spanned several lines, restore those line breaks -- TODO confirm.
    expected_text = """this is a test link text header """
    token_stream = [
        "this", "is", "a", "test", "\n", "\n",
        "link", "text", "\n",
        "header", "\n", "\n",
    ]
    self.assertEqual(create_text(token_stream), expected_text)
def extract_article(self, document):
    """Return the article text extracted from the page contents.

    The document is parsed with ``html.document_fromstring``, passed
    through ``clean_html`` and ``tokenize_html``, and every token is scored
    with ``self.scoring.score``.  ``extract_maximum_subsequence`` then
    selects terms from the tokens and their scores; the selection is run
    through ``format_html_tokens`` and handed to ``create_text``.

    :param document: page contents to parse.
    :return: whatever ``create_text`` produces for the selected terms.
    """
    parsed_page = html.document_fromstring(document)
    page_tokens = tokenize_html(clean_html(parsed_page))

    # One score per token, in token order.
    token_scores = []
    for token in page_tokens:
        token_scores.append(self.scoring.score(token))

    selected = extract_maximum_subsequence(page_tokens, token_scores)
    formatted = format_html_tokens(selected)

    # Drop the single space that directly follows a newline in each term.
    normalized = [
        re.sub(r"\n ", "\n", piece, flags=re.UNICODE) for piece in formatted
    ]
    return create_text(normalized)