def __query_to_tokens(self, query):
    """ Split the query into tokens """
    # Wrap the raw query string in a Page so the same TokenLexer used for
    # crawled documents tokenizes queries identically.
    page = Page()
    page.content = query
    lexer = TokenLexer(page)
    return list(lexer.tokens())
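# A hedged illustration of how __query_to_tokens could back a public search
# method. The inverted index (token -> set of document ids) and the any-match
# semantics below are assumptions for illustration; only the tokenization
# step comes from the helper above.
def search_index(index, tokens):
    """Return ids of documents containing at least one query token."""
    results = set()
    for token in tokens:
        results |= index.get(token, set())
    return results

# Usage: search_index({'python': {1, 3}, 'bad': {3}}, ['python', 'bad']) -> {1, 3}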
def test_tokens_AllExpectedTokensAreReturned(self):
    page = Page()
    page.content = "yEaH,PYthON?!,. is not . so,!? bad, as i thought:."
    lexer = TokenLexer(page)
    tokens = list(lexer.tokens())
    self.assertListEqual(tokens, [
        'yeah', 'python', 'not', 'bad', 'i', 'thought'
    ])
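# A minimal sketch of the tokenization the test above implies: lowercase the
# content, split on non-alphanumeric runs, and drop stop words. The stop-word
# set below ({'is', 'so', 'as'}) is an assumption inferred from the expected
# output; the real TokenLexer may use a larger list and different splitting.
import re

class TokenLexerSketch:
    STOP_WORDS = {'is', 'so', 'as'}  # assumed subset, inferred from the test

    def __init__(self, page):
        self.page = page

    def tokens(self):
        # Any run of characters outside [a-z0-9] acts as a delimiter.
        for word in re.split(r'[^a-z0-9]+', self.page.content.lower()):
            if word and word not in self.STOP_WORDS:
                yield word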
def test_str_outputStringIsAsExpected(self):
    test_page = Page()
    test_page.title = 'D01'
    test_page.content = 'Bla Bla Blub'
    expected_output = os.linesep.join([
        '---------------------------------------------------------------------------',
        'D01',
        '---------------------------------------------------------------------------',
        'Bla Bla Blub'
    ])
    self.assertEqual(expected_output, str(test_page))
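# A minimal sketch of the Page.__str__ the test above expects: the title is
# framed by dashed separator lines, followed by the content, joined with
# os.linesep. Inferred from expected_output; the original Page class may
# implement it differently.
import os

class PageSketch:
    SEPARATOR = '-' * 75  # length assumed to match the test's dashed line

    def __init__(self):
        self.title = ''
        self.content = ''

    def __str__(self):
        return os.linesep.join([self.SEPARATOR, self.title,
                                self.SEPARATOR, self.content])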
def parse(self, text, url):
    dom = self.parseDocument(text)
    page = Page()
    page.title = self.get_text_from_element('title')
    # Body text with <a> tags removed before indexing.
    page.content = self.remove_a_tags(self.get_text_from_element('body'))
    page.url = url

    def read_link(link):
        # Resolve each href (possibly relative) against the page's own URL.
        return URLUtils.join_relurl_to_absurl(url, link['href'])

    page.out_links = [read_link(link) for link in dom.select('a[href]')]
    page.out_links = ListUtil.to_list_without_duplicated_entries(page.out_links)
    return page
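# A minimal sketch of what URLUtils.join_relurl_to_absurl likely does:
# resolve a possibly relative href against the page URL. The real helper may
# differ; urllib.parse.urljoin covers the common cases.
from urllib.parse import urljoin

def join_relurl_to_absurl(base_url, href):
    # urljoin('http://example.com/a/', 'b.html') -> 'http://example.com/a/b.html'
    # Absolute hrefs pass through unchanged.
    return urljoin(base_url, href)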