Example #1
 def handle_html(self, response):
     htmlpage = htmlpage_from_response(response)
     items, link_regions = self.extract_items(htmlpage)
     for item in items:
         yield item
     for request in self._process_link_regions(htmlpage, link_regions):
         yield request
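For orientation, a hedged sketch (not taken from the project above) of how a handle_html-style generator callback is typically wired into a Scrapy spider; the spider class, start URL, and the stubbed handle_html body are illustrative only:

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["http://www.example.com/"]  # illustrative URL

    def parse(self, response):
        # Re-yield whatever the handle_html generator produces:
        # extracted items first, then follow-up requests.
        for item_or_request in self.handle_html(response):
            yield item_or_request

    def handle_html(self, response):
        # Stub standing in for the real method shown in Example #1:
        # it yields item dicts and then Requests for links to follow.
        yield {"url": response.url}
        yield scrapy.Request(response.urljoin("/next"), callback=self.parse)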
Example #3
 def test_start_urls(self):
     specs = {
         "type": "pagination",
         "value": None,
         "start_urls": ['http://www.spam.com/?p=1',
                        'http://www.eggs.com/?page=0']
     }
     lextractor = create_linkextractor_from_specs(specs)
     html = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     html_page = htmlpage_from_response(
         UTF8HtmlResponse(url='http://www.example.com/', body=html))
     links = list(lextractor.links_to_follow(html_page))
     links = sorted(links, key=lambda link: link.url)
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, "http://www.spam.com/?p=100")
     self.assertEqual(links[1].url, "http://www.spam.com/?p=200")
     self.assertEqual(links[2].url, "http://www.spam.com/?p=300")
     self.assertEqual(links[0].text, 'Click here 1')
     self.assertEqual(links[1].text, 'Click here 2')
     self.assertEqual(links[2].text, 'Click here 3')
Example #4
    def _extract_links(self, response_or_htmlpage):
        """Extract links to follow from an html page

        This uses `iterlinks` to read the links in the page.
        """
        htmlpage = htmlpage_from_response(response_or_htmlpage) if \
                    isinstance(response_or_htmlpage, HtmlResponse) else response_or_htmlpage
        return iterlinks(htmlpage)
Example #5
    def _extract_links(self, response_or_htmlpage):
        """Extract links to follow from an html page

        This uses `iterlinks` to read the links in the page.
        """
        if isinstance(response_or_htmlpage, HtmlResponse):
            response_or_htmlpage = htmlpage_from_response(response_or_htmlpage)
        return iterlinks(response_or_htmlpage)
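Both variants above convert a Scrapy HtmlResponse into the page object that iterlinks expects. As a rough, hedged sketch of what a htmlpage_from_response-style helper does (assuming scrapely's HtmlPage is the target type, as in slybot-era code; the helper name below is made up):

from scrapy.http import HtmlResponse
from scrapely.htmlpage import HtmlPage  # assumption: scrapely provides the page type used here


def htmlpage_from_response_sketch(response):
    # Roughly what such a helper does: carry the URL, headers and decoded
    # body of the Scrapy response over into an HtmlPage.
    return HtmlPage(url=response.url,
                    headers=response.headers,
                    body=response.text,
                    encoding=response.encoding)


response = HtmlResponse(url="http://www.example.com/",
                        body=b'<a href="/next">Next</a>',
                        encoding="utf-8")
page = htmlpage_from_response_sketch(response)
print(page.url, len(page.body))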
Example #6
 def test_simple(self):
     specs = {"type": "pagination", "value": None}
     lextractor = create_linkextractor_from_specs(specs)
     html_page = htmlpage_from_response(HtmlResponse(url="http://www.example.com/", body=html))
     html_page.headers["n_items"] = 1
     links = list(lextractor.links_to_follow(html_page))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, "http://www.example.com/path")
     self.assertEqual(links[0].text, "Click here")
Example #7
 def test_simple(self):
     specs = {"type": "pagination", "value": None}
     lextractor = create_linkextractor_from_specs(specs)
     html_page = htmlpage_from_response(
         HtmlResponse(url='http://www.example.com/', body=html))
     html_page.headers['n_items'] = 1
     links = list(lextractor.links_to_follow(html_page))
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0].url, 'http://www.example.com/path')
     self.assertEqual(links[0].text, 'Click here')
Example #8
 def handle_html(self, response, seen=None):
     htmlpage = htmlpage_from_response(response)
     items, link_regions = self.extract_items(htmlpage)
     htmlpage.headers['n_items'] = len(items)
     try:
         response.meta['n_items'] = len(items)
     except AttributeError:
         pass  # response not tied to any request
     for item in items:
         yield item
     for request in self._process_link_regions(htmlpage, link_regions):
         yield request
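The try/except above is needed because Scrapy's Response.meta is only available when the response is tied to a request; a response constructed by hand raises AttributeError. A small hedged sketch of that behaviour (URLs and values are illustrative):

from scrapy.http import HtmlResponse, Request

# A response built by hand has no originating request, so .meta raises AttributeError.
bare = HtmlResponse(url="http://www.example.com/", body=b"<html></html>", encoding="utf-8")
try:
    bare.meta
except AttributeError:
    print("response not tied to any request")

# A response tied to a request exposes that request's meta dict.
req = Request("http://www.example.com/", meta={"n_items": 0})
tied = HtmlResponse(url="http://www.example.com/", body=b"<html></html>",
                    encoding="utf-8", request=req)
print(tied.meta["n_items"])  # 0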
Example #10
    def parse(self, response):
        self.crawled_all_pages += 1
        # This check is here because even after we stop adding new requests, we may still crawl more
        # pages in total than self.urls_limit. Why? Because we stop once self.urls_limit URLs have been
        # _crawled_, not once that many URLs have been added to the queue. This guarantees we always crawl
        # at least self.urls_limit URLs, but in return we will usually crawl more, since new URLs are added
        # before some URLs already sitting in the (already full) queue have been fetched.
        if self.done():
            return
        if (self.crawled_pages % self.PRINT_STATS_EVERY_X_CRAWLED_PAGES) == 0:
            delta = time()-self.start_time
            print "Current crawl speed: ", self.crawled_pages, "urls crawled,", delta, "seconds,", self.crawled_pages / delta, "pages/second"
        if self.links_rule_targeted.link_extractor.matches(response.url):
            print "page targeted", response.url
            self.crawled_pages += 1
            html_p = htmlpage_from_response(response)
            scraped_result = self.scraper.scrape_page(html_p)
            score = scraped_result[0]['score'][0]
            if self.score_field_text_negative_matches:
                for to_strip_off in self.score_field_text_negative_matches:
                    score = score.replace(to_strip_off, '')
            print "\n===============================" * 2
            print "score=", score
            print "\n===============================" * 2
            item = (
                response.url,
                score,
                int(time())
            )
            self.save_to_db(item)
        if self.done():  # slightly wasteful here because of ">" instead of ">="
            return  # we do not scrape the links this time
        unique_new_links = set(
            [
                l for l in self.links_rule.link_extractor.extract_links(response) 
                if len(l.url) <= 255 and TrendingSpider.extract_domain(l.url) == self.our_domain
            ]) - self.urls_seen

        print "Got", len(unique_new_links), "new links"
        self.urls_seen |= unique_new_links
        return [Request(link.url) for link in unique_new_links]
Example #11
 def test_start_urls(self):
     specs = {"type": "pagination",
              "value": None,
              "start_urls": ['http://www.spam.com/?p=1',
                             'http://www.eggs.com/?page=0']
     }
     lextractor = create_linkextractor_from_specs(specs)
     html = """
     <a href="http://www.spam.com/?p=100">Click here 1</a>
     <a href="http://www.spam.com/?p=200">Click here 2</a>
     <a href="http://www.spam.com/?p=300">Click here 3</a>
     """
     html_page = htmlpage_from_response(
         HtmlResponse(url='http://www.example.com/', body=html))
     links = list(lextractor.links_to_follow(html_page))
     links = sorted(links, key=lambda link: link.url)
     self.assertEqual(len(links), 3)
     self.assertEqual(links[0].url, "http://www.spam.com/?p=100")
     self.assertEqual(links[1].url, "http://www.spam.com/?p=200")
     self.assertEqual(links[2].url, "http://www.spam.com/?p=300")
     self.assertEqual(links[0].text, 'Click here 1')
     self.assertEqual(links[1].text, 'Click here 2')
     self.assertEqual(links[2].text, 'Click here 3')
Example #12
def extract_html(response):
    """Extracts an html page from the response.
    """
    return htmlpage_from_response(response).body
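A hedged usage sketch, assuming the extract_html above and its htmlpage_from_response dependency are importable in the same module (the response here is built by hand for illustration):

from scrapy.http import HtmlResponse

response = HtmlResponse(url="http://www.example.com/",
                        body=b"<html><body>Hello</body></html>",
                        encoding="utf-8")
print(extract_html(response))  # the decoded page body as a unicode string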