def parse(self, text, url):
    """Build a ``Page`` from raw document text and the URL it came from.

    Args:
        text: Document source handed to ``self.parseDocument``.
        url: Address of the document; stored on the page and used as the
            base when resolving each link's ``href``.

    Returns:
        A ``Page`` with ``title``, ``content``, ``url`` and ``out_links``
        populated.
    """
    document = self.parseDocument(text)

    result = Page()
    # NOTE(review): the text lookups below do not receive ``document``;
    # presumably they read state set up by ``parseDocument`` -- confirm.
    result.title = self.get_text_from_element('title')
    body_text = self.get_text_from_element('body')
    result.content = self.remove_a_tags(body_text)
    result.url = url

    # Only anchors that carry an href are selected, so the key lookup on
    # each matched element is expected to succeed.
    absolute_links = [
        URLUtils.join_relurl_to_absurl(url, anchor['href'])
        for anchor in document.select('a[href]')
    ]
    result.out_links = ListUtil.to_list_without_duplicated_entries(
        absolute_links
    )
    return result