def test_create_text(self):
        """Check the text that create_text builds from a flat token list.

        The fixture mixes word tokens with "\\n" tokens; the expected string
        has the words of a line separated by single spaces and each "\\n"
        token rendered as a line break with no space around it.
        """
        tokens = [
            "this", "is", "a", "test", "\n", "\n",
            "link", "text",
            "\n", "header", "\n", "\n",
        ]

        # Spelled out line by line so the blank lines and the trailing
        # newlines are visible instead of hidden inside a triple-quoted block.
        expected_result = (
            "this is a test\n"
            "\n"
            "link text\n"
            "header\n"
            "\n"
        )

        self.assertEqual(create_text(tokens), expected_result)
# Example #2
# 0
    def extract_article(self, document):
        """Return the article text extracted from an HTML page.

        The page is parsed with ``html.document_fromstring``, passed through
        ``clean_html`` and tokenized. Every token is scored with
        ``self.scoring.score``; ``extract_maximum_subsequence`` then selects
        the tokens to keep from the token/score lists, and the selection is
        formatted and joined into the returned text by ``create_text``.

        :param document: page contents, handed to ``html.document_fromstring``.
        :return: whatever ``create_text`` produces for the selected tokens.
        """
        parsed_page = html.document_fromstring(document)
        page_tokens = tokenize_html(clean_html(parsed_page))

        token_scores = [self.scoring.score(token) for token in page_tokens]

        selected = extract_maximum_subsequence(page_tokens, token_scores)
        formatted = format_html_tokens(selected)

        # Drop the single space that directly follows a newline inside a term.
        newline_space = re.compile(r"\n ", flags=re.UNICODE)
        cleaned = [newline_space.sub("\n", piece) for piece in formatted]

        return create_text(cleaned)