Exemple #1
0
    def parse_links(self, response):
        """Yield one profile request per ranked reviewer row, then paginate.

        Every ``<tr>`` whose ``id`` contains "reviewer" is inspected for a
        profile link (first anchor of the ``img`` cell) and a rank written
        like ``#1,234`` in the ``crNum`` cell.  Rows missing either value are
        skipped.  For the remaining rows a fresh ``ReviewItem`` holding the
        integer rank is attached to the request ``meta`` under ``'item'`` and
        the request is routed to ``self.parse_email``.

        After the rows are processed ``self.page`` is incremented and, while
        it does not exceed 1000, a request built from ``self.PAGINATE_URL``
        is yielded with ``self.parse_without_captcha`` as its callback.
        """
        for row in response.xpath('//tr[contains(@id, "reviewer")]'):
            hrefs = row.xpath('./td[@class="img"]/a[1]/@href').extract()
            # Raw string: \s and \d are regex escapes, not string escapes.
            ranks = row.xpath('./td[@class="crNum"]/text()').re(
                r'#\s?(\d+,?\d{0,})')

            # Indexing an empty href list would raise inside this generator
            # and abort it before the pagination request below is yielded.
            if not ranks or not hrefs:
                continue

            item = ReviewItem()
            # Drop the thousands separator before converting ("1,234" -> 1234).
            item['rank'] = int(ranks[0].replace(',', ''))
            yield Request(hrefs[0], meta={'item': item},
                          callback=self.parse_email)

        self.page += 1
        if self.page <= 1000:
            yield Request(self.PAGINATE_URL.format(page=self.page),
                          callback=self.parse_without_captcha)
Exemple #2
0
    def parse_links(self, response):
        """Yield one profile request per ranked reviewer row, then follow "Next".

        Every ``<tr>`` whose ``id`` contains "reviewer" is inspected for a
        profile link (first anchor of the ``img`` cell) and a rank written
        like ``#1,234`` in the ``crNum`` cell.  Rows missing either value are
        skipped.  For the remaining rows a fresh ``ReviewItem`` holding the
        integer rank is attached to the request ``meta`` under ``'item'``;
        the extracted href is prefixed with ``http://www.amazon.co.uk`` and
        the request is routed to ``self.parse_email``.

        Finally, if the page has an anchor whose text contains "Next", its
        href is requested with ``self.parse_without_captcha`` as callback.
        """
        for row in response.xpath('//tr[contains(@id, "reviewer")]'):
            hrefs = row.xpath('./td[@class="img"]/a[1]/@href').extract()
            # Raw string: \s and \d are regex escapes, not string escapes.
            ranks = row.xpath('./td[@class="crNum"]/text()').re(
                r'#\s?(\d+,?\d{0,})')

            # Indexing an empty href list would raise inside this generator
            # and abort it before the "Next" page request below is yielded.
            if not ranks or not hrefs:
                continue

            item = ReviewItem()
            # Drop the thousands separator before converting ("1,234" -> 1234).
            item['rank'] = int(ranks[0].replace(',', ''))
            yield Request('http://www.amazon.co.uk' + hrefs[0],
                          meta={'item': item},
                          callback=self.parse_email)

        next_page = response.xpath(
            '//a[contains(text(),"Next")]/@href').extract()
        if next_page:
            # NOTE(review): unlike the profile links above, this href is used
            # as-is with no host prefix; confirm the site emits an absolute
            # URL here.
            yield Request(next_page[0], callback=self.parse_without_captcha)