Example #1
0
 def __query_to_tokens(self, query):
     """Tokenise the search *query* by running it through the page token lexer.

     The query string is wrapped in a throwaway Page so the same TokenLexer
     used for document content can be reused unchanged.
     """
     query_page = Page()
     query_page.content = query
     return [token for token in TokenLexer(query_page).tokens()]
Example #2
0
    def test_tokens_AllExpectedTokensAreReturned(self):
        """A punctuation-heavy mixed-case sentence yields only the kept words."""
        sample = Page()
        sample.content = "yEaH,PYthON?!,. is   not  . so,!? bad, as i thought:."

        expected = ['yeah', 'python', 'not', 'bad', 'i', 'thought']
        produced = [token for token in TokenLexer(sample).tokens()]

        self.assertListEqual(produced, expected)
Example #3
0
    def test_str_outputStringIsAsExpected(self):
        """str(page) renders the title between two rule lines, then the content."""
        rule = '---------------------------------------------------------------------------'
        page = Page()
        page.title = 'D01'
        page.content = 'Bla Bla Blub'

        # Output uses the platform line separator, so join with os.linesep
        # rather than hard-coding '\n'.
        expected = os.linesep.join([rule, 'D01', rule, 'Bla Bla Blub'])

        self.assertEqual(expected, str(page))
Example #4
0
    def parse(self, text, url):
        """Build a Page from the raw HTML *text* fetched from *url*.

        Extracts the title and body text (with anchor tags stripped), records
        the source URL, and collects the page's outgoing links resolved to
        absolute URLs, with duplicates removed.
        """
        dom = self.parseDocument(text)

        page = Page()
        page.title = self.get_text_from_element('title')
        page.content = self.remove_a_tags(self.get_text_from_element('body'))
        page.url = url

        # Resolve every href relative to the page's own URL.
        page.out_links = [
            URLUtils.join_relurl_to_absurl(url, anchor['href'])
            for anchor in dom.select('a[href]')
        ]
        page.out_links = ListUtil.to_list_without_duplicated_entries(page.out_links)

        return page