Beispiel #1
0
 def page_error(self, input_url, pages):
     """Record a failed download of *input_url* as an error page in *pages*.

     The error page is stored under the local URL derived from
     ``self.output_url`` so later lookups by local URL still resolve.

     :param input_url: the remote URL that could not be downloaded
     :param pages: dict mapping local URL -> DocumentPage, mutated in place
     """
     local_url = get_local_url(self.output_url,
             get_url_without_hash(input_url))
     self.logger.info(
         'This page could not be downloaded: {0} in {1}'.format(
             input_url, local_url))
     # Bug fix: the original passed the *builtin* ``input`` (a function
     # object) instead of the failed URL string.
     error_page = DocumentPage(input_url, None, [])
     pages[local_url] = error_page
Beispiel #2
0
 def page_error(self, input_url, pages):
     """Record a failed download of *input_url* as an error page in *pages*.

     The error page is stored under the local URL derived from
     ``self.output_url`` so later lookups by local URL still resolve.

     :param input_url: the remote URL that could not be downloaded
     :param pages: dict mapping local URL -> DocumentPage, mutated in place
     """
     local_url = get_local_url(self.output_url,
                               get_url_without_hash(input_url))
     self.logger.info(
         'This page could not be downloaded: {0} in {1}'.format(
             input_url, local_url))
     # Bug fix: the original passed the *builtin* ``input`` (a function
     # object) instead of the failed URL string.
     error_page = DocumentPage(input_url, None, [])
     pages[local_url] = error_page
Beispiel #3
0
    def process_page(self, url):
        """Copy *url* locally, parse its HTML, and build a DocumentPage.

        :param url: remote page URL to process
        :returns: DocumentPage holding the remote URL, the local copy's
            URL, and the links extracted from the page
        """
        self.logger.info("Processing page: " + url)
        local_url = self.make_copy(get_url_without_hash(url))

        # Fix: close the handle even if read() raises — the original
        # leaked the open connection on a read error.
        local_page = urllib2.urlopen(local_url)
        try:
            content = local_page.read()
        finally:
            local_page.close()

        # Detect the encoding from the raw bytes before parsing.
        parser = etree.HTMLParser(encoding=get_encoding(content))
        tree = etree.fromstring(content, parser)

        links = self.process_page_links(tree, local_url, url)
        # Image processing mutates the tree / state only; no return value used.
        self.process_page_imgs(tree, url)

        page = DocumentPage(url, local_url, links)

        return page
Beispiel #4
0
 def process_page_links(self, tree, local_url, url):
     """Collect a DocumentLink for every anchor with an href in *tree*.

     Each href is resolved against *url*, stripped of its fragment, and
     mapped to a sanitized local URL under ``self.output_url``.

     :param tree: parsed HTML tree to scan for link tags
     :param local_url: local copy's URL (kept for interface compatibility)
     :param url: base URL used to resolve relative hrefs
     :returns: list of DocumentLink objects
     """
     links = []
     for link_tag in self.links(tree):
         # Anchors without an href attribute are skipped.
         # (Removed the dead ``href = ''`` default and the redundant
         # ``else: continue`` from the original.)
         href = link_tag.attrib.get('href')
         if href is None:
             continue
         link_url = get_url_without_hash(urlparse.urljoin(url, href))
         local_url_to = get_local_url(self.output_url, link_url)
         local_url_to = get_sanitized_url(local_url_to)
         links.append(DocumentLink(link_url, local_url_to))
     return links
Beispiel #5
0
    def process_page(self, url):
        """Copy *url* locally, parse its HTML, and build a DocumentPage.

        :param url: remote page URL to process
        :returns: DocumentPage holding the remote URL, the local copy's
            URL, and the links extracted from the page
        """
        self.logger.info("Processing page: " + url)
        local_url = self.make_copy(get_url_without_hash(url))

        # Fix: close the handle even if read() raises — the original
        # leaked the open connection on a read error.
        local_page = urllib2.urlopen(local_url)
        try:
            content = local_page.read()
        finally:
            local_page.close()

        # Detect the encoding from the raw bytes before parsing.
        parser = etree.HTMLParser(encoding=get_encoding(content))
        tree = etree.fromstring(content, parser)

        links = self.process_page_links(tree, local_url, url)
        # Image processing mutates the tree / state only; no return value used.
        self.process_page_imgs(tree, url)

        page = DocumentPage(url, local_url, links)

        return page
Beispiel #6
0
 def process_page_links(self, tree, local_url, url):
     """Collect a DocumentLink for every anchor with an href in *tree*.

     Each href is resolved against *url*, stripped of its fragment, and
     mapped to a sanitized local URL under ``self.output_url``.

     :param tree: parsed HTML tree to scan for link tags
     :param local_url: local copy's URL (kept for interface compatibility)
     :param url: base URL used to resolve relative hrefs
     :returns: list of DocumentLink objects
     """
     links = []
     for link_tag in self.links(tree):
         # Anchors without an href attribute are skipped.
         # (Removed the dead ``href = ''`` default and the redundant
         # ``else: continue`` from the original.)
         href = link_tag.attrib.get('href')
         if href is None:
             continue
         link_url = get_url_without_hash(urlparse.urljoin(url, href))
         local_url_to = get_local_url(self.output_url, link_url)
         local_url_to = get_sanitized_url(local_url_to)
         links.append(DocumentLink(link_url, local_url_to))
     return links
Beispiel #7
0
 def get_url_without_hash(self):
     """get_url_without_hash strips the fragment and leaves the rest intact."""
     # Bug fix: the expected host must match the input host ('yo.com');
     # the original expected 'you.com', so these assertions could never pass.
     self.assertEqual(uu.get_url_without_hash("http://www.yo.com/foo#bar"), "http://www.yo.com/foo")
     self.assertEqual(uu.get_url_without_hash("http://www.yo.com/foo"), "http://www.yo.com/foo")
Beispiel #8
0
 def get_url_without_hash(self):
     """get_url_without_hash strips the fragment and leaves the rest intact."""
     # Bug fix: the expected host must match the input host ('yo.com');
     # the original expected 'you.com', so these assertions could never pass.
     self.assertEqual(uu.get_url_without_hash('http://www.yo.com/foo#bar'),
                      'http://www.yo.com/foo')
     self.assertEqual(uu.get_url_without_hash('http://www.yo.com/foo'),
                      'http://www.yo.com/foo')