def handle_html(self, response):
    htmlpage = htmlpage_from_response(response)
    items, link_regions = self.extract_items(htmlpage)
    for item in items:
        yield item
    for request in self._process_link_regions(htmlpage, link_regions):
        yield request
def test_start_urls(self): specs = { "type": "pagination", "value": None, "start_urls": ['http://www.spam.com/?p=1', 'http://www.eggs.com/?page=0'] } lextractor = create_linkextractor_from_specs(specs) html = """ <a href="http://www.spam.com/?p=100">Click here 1</a> <a href="http://www.spam.com/?p=200">Click here 2</a> <a href="http://www.spam.com/?p=300">Click here 3</a> """ html_page = htmlpage_from_response( UTF8HtmlResponse(url='http://www.example.com/', body=html)) links = list(lextractor.links_to_follow(html_page)) links = sorted(links, key=lambda link: link.url) self.assertEqual(len(links), 3) self.assertEqual(links[0].url, "http://www.spam.com/?p=100") self.assertEqual(links[1].url, "http://www.spam.com/?p=200") self.assertEqual(links[2].url, "http://www.spam.com/?p=300") self.assertEqual(links[0].text, 'Click here 1') self.assertEqual(links[1].text, 'Click here 2') self.assertEqual(links[2].text, 'Click here 3')
def _extract_links(self, response_or_htmlpage): """Extract links to follow from an html page This uses `iterlinks` to read the links in the page. """ htmlpage = htmlpage_from_response(response_or_htmlpage) if \ isinstance(response_or_htmlpage, HtmlResponse) else response_or_htmlpage return iterlinks(htmlpage)
def _extract_links(self, response_or_htmlpage): """Extract links to follow from an html page This uses `iterlinks` to read the links in the page. """ if isinstance(response_or_htmlpage, HtmlResponse): response_or_htmlpage = htmlpage_from_response(response_or_htmlpage) return iterlinks(response_or_htmlpage)
def test_simple(self): specs = {"type": "pagination", "value": None} lextractor = create_linkextractor_from_specs(specs) html_page = htmlpage_from_response(HtmlResponse(url="http://www.example.com/", body=html)) html_page.headers["n_items"] = 1 links = list(lextractor.links_to_follow(html_page)) self.assertEqual(len(links), 1) self.assertEqual(links[0].url, "http://www.example.com/path") self.assertEqual(links[0].text, "Click here")
def test_simple(self): specs = {"type": "pagination", "value": None} lextractor = create_linkextractor_from_specs(specs) html_page = htmlpage_from_response( HtmlResponse(url='http://www.example.com/', body=html)) html_page.headers['n_items'] = 1 links = list(lextractor.links_to_follow(html_page)) self.assertEqual(len(links), 1) self.assertEqual(links[0].url, 'http://www.example.com/path') self.assertEqual(links[0].text, 'Click here')
def handle_html(self, response, seen=None):
    htmlpage = htmlpage_from_response(response)
    items, link_regions = self.extract_items(htmlpage)
    htmlpage.headers['n_items'] = len(items)
    try:
        response.meta['n_items'] = len(items)
    except AttributeError:
        pass  # response not tied to any request
    for item in items:
        yield item
    for request in self._process_link_regions(htmlpage, link_regions):
        yield request
def parse(self, response):
    self.crawled_all_pages += 1
    # This condition is here because even if we stopped adding new requests, we
    # might still end up with more requests done in total than self.urls_limit.
    # Why? Because we stop once self.urls_limit is reached in terms of _crawled_
    # URLs, not in terms of URLs added to the queue. This ensures we always crawl
    # _at least_ self.urls_limit URLs, but in return we will most likely crawl
    # more than self.urls_limit, because new URLs are usually added before some
    # URLs already in the (full) queue have been fetched.
    if self.done():
        return
    if (self.crawled_pages % self.PRINT_STATS_EVERY_X_CRAWLED_PAGES) == 0:
        delta = time() - self.start_time
        print "Current crawl speed: ", self.crawled_pages, "urls crawled,", \
            delta, "seconds,", self.crawled_pages / delta, "pages/second"
    if self.links_rule_targeted.link_extractor.matches(response.url):
        print "page targeted", response.url
        self.crawled_pages += 1
        html_p = htmlpage_from_response(response)
        scraped_result = self.scraper.scrape_page(html_p)
        score = scraped_result[0]['score'][0]
        if self.score_field_text_negative_matches:
            for to_strip_off in self.score_field_text_negative_matches:
                score = score.replace(to_strip_off, '')
        print "\n===============================" * 2
        print "score=", score
        print "\n===============================" * 2
        item = (response.url, score, int(time()))
        self.save_to_db(item)
    if self.done():
        # wasting a little resources here because of ">" instead of ">="
        return
    # We do not scrape the links this time
    unique_new_links = set(
        l for l in self.links_rule.link_extractor.extract_links(response)
        if len(l.url) <= 255
        and TrendingSpider.extract_domain(l.url) == self.our_domain
    ) - self.urls_seen
    print "Got", len(unique_new_links), "new links"
    self.urls_seen |= unique_new_links
    return [Request(link.url) for link in unique_new_links]
def test_start_urls(self): specs = {"type": "pagination", "value": None, "start_urls": ['http://www.spam.com/?p=1', 'http://www.eggs.com/?page=0'] } lextractor = create_linkextractor_from_specs(specs) html = """ <a href="http://www.spam.com/?p=100">Click here 1</a> <a href="http://www.spam.com/?p=200">Click here 2</a> <a href="http://www.spam.com/?p=300">Click here 3</a> """ html_page = htmlpage_from_response( HtmlResponse(url='http://www.example.com/', body=html)) links = list(lextractor.links_to_follow(html_page)) links = sorted(links, key=lambda link: link.url) self.assertEqual(len(links), 3) self.assertEqual(links[0].url, "http://www.spam.com/?p=100") self.assertEqual(links[1].url, "http://www.spam.com/?p=200") self.assertEqual(links[2].url, "http://www.spam.com/?p=300") self.assertEqual(links[0].text, 'Click here 1') self.assertEqual(links[1].text, 'Click here 2') self.assertEqual(links[2].text, 'Click here 3')
def extract_html(response):
    """Extracts an html page from the response."""
    return htmlpage_from_response(response).body
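# Illustrative usage sketch (not part of the original snippets): shows how
# extract_html() above might be exercised with a hand-built Scrapy response.
# The URL and body below are made-up example values, and it is assumed that
# htmlpage_from_response() is importable where extract_html() is defined.
from scrapy.http import HtmlResponse

response = HtmlResponse(
    url='http://www.example.com/',
    body='<html><body><a href="/path">Click here</a></body></html>',
    encoding='utf-8',
)
print extract_html(response)  # prints the decoded page body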