def parse_links(self, response):
    """Yield a profile request per reviewer row, then the next listing page.

    Every ``<tr>`` whose ``id`` contains "reviewer" produces a ``ReviewItem``
    holding the reviewer's rank. The item is passed along in
    ``meta['item']`` of a request for the row's profile link, which is
    handled by ``self.parse_email``.

    Once all rows are processed, ``self.page`` is incremented and, while it
    is still <= 1000, the listing page built from ``self.PAGINATE_URL`` is
    requested with ``self.parse_without_captcha`` as the callback.

    Rows without a profile link are skipped. Indexing the empty result used
    to raise ``IndexError``, which ended this generator before the remaining
    rows and the pagination request were produced.
    """
    rows = response.xpath('//tr[contains(@id, "reviewer")]')
    for row in rows:
        hrefs = row.xpath('./td[@class="img"]/a[1]/@href').extract()
        if not hrefs:
            # No profile link in this row: nothing to follow.
            continue

        item = ReviewItem()
        # Raw string: ``\s`` and ``\d`` are regex escapes, not string escapes.
        rank = row.xpath('./td[@class="crNum"]/text()').re(
            r'#\s?(\d+,?\d{0,})')
        if rank:
            # Drop the thousands separator, e.g. "1,234" -> 1234.
            rank = int(rank[0].replace(',', ''))
        # When the cell does not match, the empty match list is stored as-is.
        item['rank'] = rank

        yield Request(hrefs[0], meta={'item': item},
                      callback=self.parse_email)

    self.page += 1
    if self.page <= 1000:
        yield Request(self.PAGINATE_URL.format(page=self.page),
                      callback=self.parse_without_captcha)
def parse_links(self, response):
    """Yield a profile request per reviewer row, then follow the "Next" link.

    Every ``<tr>`` whose ``id`` contains "reviewer" produces a ``ReviewItem``
    holding the reviewer's rank. The item is passed along in
    ``meta['item']`` of a request for the row's profile link (prefixed with
    ``http://www.amazon.co.uk``), which is handled by ``self.parse_email``.

    Once all rows are processed, the ``href`` of the first anchor whose text
    contains "Next" is requested, if there is one, with
    ``self.parse_without_captcha`` as the callback.

    Rows without a profile link are skipped. Indexing the empty result used
    to raise ``IndexError``, which ended this generator before the remaining
    rows and the pagination request were produced.
    """
    rows = response.xpath('//tr[contains(@id, "reviewer")]')
    for row in rows:
        hrefs = row.xpath('./td[@class="img"]/a[1]/@href').extract()
        if not hrefs:
            # No profile link in this row: nothing to follow.
            continue

        item = ReviewItem()
        # Raw string: ``\s`` and ``\d`` are regex escapes, not string escapes.
        rank = row.xpath('./td[@class="crNum"]/text()').re(
            r'#\s?(\d+,?\d{0,})')
        if rank:
            # Drop the thousands separator, e.g. "1,234" -> 1234.
            rank = int(rank[0].replace(',', ''))
        # When the cell does not match, the empty match list is stored as-is.
        item['rank'] = rank

        # The extracted href is appended to the site root as-is.
        yield Request('http://www.amazon.co.uk' + hrefs[0],
                      meta={'item': item}, callback=self.parse_email)

    next_page = response.xpath(
        '//a[contains(text(),"Next")]/@href').extract()
    if next_page:
        yield Request(next_page[0], callback=self.parse_without_captcha)