Example #1
0
    def parse(self, text, url):
        """Build a ``Page`` describing the document contained in ``text``.

        Args:
            text: Markup handed to ``self.parseDocument``.
            url: Address stored on the page; also the base passed to
                ``URLUtils.join_relurl_to_absurl`` for every link ``href``.

        Returns:
            A ``Page`` whose ``title``, ``content``, ``url`` and ``out_links``
            attributes have been filled in.
        """
        document = self.parseDocument(text)

        result = Page()
        # NOTE(review): the element lookups below are not handed the parsed
        # document; presumably parseDocument keeps it on self -- confirm.
        result.title = self.get_text_from_element('title')
        body_text = self.get_text_from_element('body')
        result.content = self.remove_a_tags(body_text)
        result.url = url

        # Every element matched by the 'a[href]' selector contributes one
        # link, joined against the page's own url.
        result.out_links = [
            URLUtils.join_relurl_to_absurl(url, anchor['href'])
            for anchor in document.select('a[href]')
        ]
        # Second pass drops repeated entries via the project's list helper.
        result.out_links = ListUtil.to_list_without_duplicated_entries(result.out_links)

        return result