def page_error(self, input_url, pages):
    """Record a placeholder entry for a page that failed to download.

    Maps the page's computed local URL to an empty DocumentPage (no local
    copy, no links) in *pages*, and logs the failure.

    :param input_url: the remote URL that could not be fetched.
    :param pages: dict of local_url -> DocumentPage, mutated in place.
    """
    local_url = get_local_url(self.output_url, get_url_without_hash(input_url))
    self.logger.info(
        'This page could not be downloaded: {0} in {1}'.format(
            input_url, local_url))
    # Bug fix: the original passed the builtin `input` (a function object)
    # instead of the failing URL to DocumentPage.
    error_page = DocumentPage(input_url, None, [])
    pages[local_url] = error_page
def process_page(self, url):
    """Download a page to a local copy, parse it, and build its DocumentPage.

    Makes a local copy of *url* (hash fragment stripped), reads it back,
    parses the HTML, extracts link and image information, and returns a
    DocumentPage describing the result.

    :param url: the remote page URL to process.
    :returns: a DocumentPage for *url* with its local path and links.
    """
    self.logger.info("Processing page: " + url)
    local_url = self.make_copy(get_url_without_hash(url))
    local_page = urllib2.urlopen(local_url)
    # try/finally so the handle is closed even if read() raises
    # (the original leaked the connection on a read error).
    try:
        content = local_page.read()
    finally:
        local_page.close()
    # Detect the encoding from the raw bytes so lxml decodes correctly.
    parser = etree.HTMLParser(encoding=get_encoding(content))
    tree = etree.fromstring(content, parser)
    links = self.process_page_links(tree, local_url, url)
    self.process_page_imgs(tree, url)
    page = DocumentPage(url, local_url, links)
    return page
def process_page_links(self, tree, local_url, url):
    """Collect a DocumentLink for every anchor with an href in *tree*.

    Each href is resolved against *url*, its hash fragment stripped, and a
    sanitized local target path is computed relative to self.output_url.

    :param tree: parsed HTML tree of the page.
    :param local_url: local path of the page (kept for interface parity,
        not used here).
    :param url: the page's remote URL, base for resolving relative hrefs.
    :returns: list of DocumentLink objects.
    """
    links = []
    for link_tag in self.links(tree):
        attributes = link_tag.attrib
        # Anchors without an href cannot be followed; skip them.
        # (Replaces the original's dead `href = ''` initializer and the
        # redundant trailing `else: continue`.)
        if 'href' not in attributes:
            continue
        href = attributes['href']
        link_url = get_url_without_hash(urlparse.urljoin(url, href))
        local_url_to = get_local_url(self.output_url, link_url)
        local_url_to = get_sanitized_url(local_url_to)
        links.append(DocumentLink(link_url, local_url_to))
    return links
def get_url_without_hash(self):
    """get_url_without_hash strips the '#fragment' and leaves the rest intact."""
    # Bug fix: the expected values previously read 'www.you.com' while the
    # input host was 'www.yo.com', so both assertions could never pass.
    self.assertEqual(uu.get_url_without_hash("http://www.yo.com/foo#bar"),
                     "http://www.yo.com/foo")
    self.assertEqual(uu.get_url_without_hash("http://www.yo.com/foo"),
                     "http://www.yo.com/foo")
def get_url_without_hash(self):
    """get_url_without_hash strips the '#fragment' and leaves the rest intact."""
    # Bug fix: the expected values previously read 'www.you.com' while the
    # input host was 'www.yo.com', so both assertions could never pass.
    self.assertEqual(uu.get_url_without_hash('http://www.yo.com/foo#bar'),
                     'http://www.yo.com/foo')
    self.assertEqual(uu.get_url_without_hash('http://www.yo.com/foo'),
                     'http://www.yo.com/foo')