def test_format_html_tokens(self):
        # Feed a token stream containing <p>, <a> and <h1> tag tokens to
        # format_html_tokens and compare against the expected list, in which
        # the tag tokens are gone and "\n" tokens appear between the words.
        html_tokens = [
            "<p>", "this", "is", "a", "test", "</p>",
            "<a>", "link", "</a>", "text",
            "<h1>", "header", "</h1>",
        ]

        wanted = [
            "this", "is", "a", "test",
            "\n", "\n",
            "link", "text",
            "\n",
            "header",
            "\n",
        ]

        actual = format_html_tokens(html_tokens)

        self.assertListEqual(actual, wanted)
Example #2
0
    def extract_article(self, document):
        """Return the article text extracted from the given page contents.

        The document is parsed with ``html.document_fromstring`` and passed
        through ``clean_html``, then tokenized. Every token is scored with
        ``self.scoring.score``; ``extract_maximum_subsequence`` picks the
        tokens to keep from the tokens and their scores. The kept tokens are
        run through ``format_html_tokens`` and finally joined by
        ``create_text``.
        """
        parsed_page = html.document_fromstring(document)
        cleaned_page = clean_html(parsed_page)
        tokens = tokenize_html(cleaned_page)

        # One score per token, in token order.
        token_scores = [self.scoring.score(token) for token in tokens]

        selected = extract_maximum_subsequence(tokens, token_scores)
        formatted = format_html_tokens(selected)

        # Drop the space that directly follows a newline inside each term.
        normalized = [
            re.sub(r"\n ", "\n", piece, flags=re.UNICODE)
            for piece in formatted
        ]

        return create_text(normalized)