def get_offers(self, response):
    """Parse an offers listing page.

    For every offer row: copy the carried item, fill in seller/price/stars,
    and either submit the row's add-to-cart form (to learn real stock via
    ``add_product``) or, when the row has no form, yield the item directly.
    Finally follow the next results page, if any.
    """
    tree = get_etree(response)
    # The original product-page response, carried through meta; the
    # add-to-cart form is submitted against it, not the offers page.
    product_response = response.meta['item']['_response']

    for row in tree.xpath(offer_xpath['row']):
        offer_item = response.meta['item'].copy()
        offer_item['seller'] = row.xpath(offer_xpath['seller'])
        offer_item['price'] = row.xpath(offer_xpath['price'])
        offer_item['stars'] = row.xpath(offer_xpath['stars'])

        if not row.xpath(offer_xpath['form']):
            # No add-to-cart form on this row: emit the item as-is.
            logger.info('No add button for %s', response.url)
            yield load_from_dict(SpiderLoader(item=OfferItem()), offer_item)
            continue

        cart_request = FormRequest.from_response(
            product_response,
            formid="addToCart",
            formdata={
                'quantity': self.max_quantity,
                'offerListingID': row.xpath(offer_xpath['offering_id'])
            },
            callback=self.add_product)
        yield clone_from_response(
            cart_request, response,
            {'_loader': SpiderLoader(item=OfferItem())})

    next_url = tree.xpath(offer_xpath['next_page'])
    if next_url:
        yield clone_from_response(
            Request(next_url, self.get_offers),
            response,
            {'_response': product_response},
        )
def add_product(self, response):
    """Parse the add-to-cart confirmation page and extract the cart quantity.

    Retries the product request with a fresh cookie jar when the reported
    quantity is zero, and re-submits the same request when the last two
    quantity readings disagree.  Returns either a retry ``Request`` or the
    loaded item.

    NOTE(review): nesting reconstructed from collapsed source — the
    ``_previous_quantity`` logic must live in the ``else:`` clause because
    ``quantity_value`` is only bound when the ``try`` succeeds; confirm
    against the original file.
    """
    # Number of earlier zero-quantity retries for this product (0 on first try).
    retries = response.meta.get('_add_product_retries', 0)
    item = response.meta['item']
    if retries:
        logger.info('Retry %s for %s', retries, item['url'])
    loader = item['_loader']
    etree = get_etree(response)
    try:
        # The cart page shows quantity as e.g. " (3 items)".
        item['quantity'] = re.findall(
            r' \((\d+) items?\)',
            etree.xpath(product_xpath['quantity']))
        quantity_value = int(item['quantity'][0])
    except Exception:
        # Parse failure (no match / non-numeric): fall through and emit the
        # item without the quantity checks below.
        logger.exception("Can't parse quantity for %s", item['asin'])
    else:
        if not quantity_value:
            logger.warning(
                'Zero quantity found for %s, retry %s, payload:\n %s',
                item['url'], retries,
                pprint.pformat(
                    dict(urlparse.parse_qsl(response.request.body))))
            if retries != self.max_add_product_retries:
                # Re-fetch the product page with a fresh cookie jar and a
                # bumped retry counter.
                request = clone_from_response(
                    Request(self.url_template.format(item['asin']),
                            headers={'Referer': self.default_referer},
                            dont_filter=True),
                    response,
                )
                request.meta['_add_product_retries'] = retries + 1
                self.jar_cnt += 1
                request.meta['cookiejar'] = self.jar_cnt
                return request
        if response.meta['item']['_previous_quantity'][0]:
            # Existed item
            response.meta['item']['_previous_quantity'].append(
                quantity_value)
            if response.meta['item']['_previous_quantity'][
                    -1] != response.meta['item']['_previous_quantity'][-2]:
                # quantity changed or we got different results in 2 last requests
                logger.info('Quantity changes for %s : %s',
                            response.meta['item']['asin'],
                            response.meta['item']['_previous_quantity'])
                # Repeat the same request under a new cookie jar to
                # disambiguate the two readings.
                request = response.request.copy()
                request.meta['item']['_previous_quantity'] = response.meta[
                    'item']['_previous_quantity']
                self.jar_cnt += 1
                request.meta['cookiejar'] = self.jar_cnt
                return request  # Try again
            elif len(response.meta['item']['_previous_quantity']) > 2:
                logger.debug('Quantity changes for %s : %s',
                             response.meta['item']['asin'],
                             response.meta['item']['_previous_quantity'])
    if self.save_html:
        # Dump the raw page for offline debugging.
        with open(
                'html/{}_{}.html'.format(item['asin'] + str(),
                                         self.timestamp), 'w') as f:
            f.write(response.body)
    return load_from_dict(loader, item)
def parse(self, response):
    """Parse a product page.

    Extracts the ASIN and the configured item fields, then either submits
    the add-to-cart form (to learn stock via ``add_product``) and follows
    the offers listing, or yields the item directly when there is no add
    button.  Optionally also schedules every not-yet-crawled variation ASIN.
    """
    etree = get_etree(response)
    item = response.meta['item']
    item['asin'] = re.findall(r'/dp/(\w+)', response.url)[0]
    item['url'] = response.url
    for k, xpath in item_xpath.items():
        item[k] = etree.xpath(xpath)
    if etree.xpath(site_xpath['add_to_card']):
        # Replay every hidden input of the cart form, overriding quantity.
        formdata = {
            field.name: field.value
            for field in etree.xpath(".//form[@id='addToCart']/input")
        }
        formdata.update({'quantity': self.max_quantity})
        yield clone_from_response(
            FormRequest.from_response(
                response,
                formid="addToCart",
                formdata=formdata,
                callback=self.add_product,
                dont_filter=True,
            ), response, {'_loader': SpiderLoader()})
        url = etree.xpath(site_xpath['link'])
        if url and self.scrape_offers:
            # Carry the product-page response so the offers parser can
            # submit the cart form against it.
            yield clone_from_response(Request(url, self.get_offers), response,
                                      {'_response': response})
    else:
        logger.info('No add button for %s', response.url)
        yield load_from_dict(SpiderLoader(), item)
    if self.scrape_variations:
        # BUG FIX: '-' binds tighter than '|', so previously asins_done was
        # subtracted only from the link_asins set and already-crawled ASINs
        # from the drop-down/twister sources were scheduled again.  The
        # union must be parenthesized before subtracting asins_done.
        asins = ({asin.split(',')[-1]
                  for asin in etree.xpath(site_xpath['drop_down_asins'])}
                 | {asin.split('/')[2]
                    for asin in etree.xpath(site_xpath['twister_asins'])}
                 | set(etree.xpath(site_xpath['link_asins']))) \
            - self.asins_done
        for asin in asins:
            yield clone_from_response(
                Request(self.url_template.format(asin)), response)
            self.asins_done.add(asin)
def parse(self, response):
    """Parse a search results page.

    Determines the search rank of ``item['asin']`` among the result links.
    On the first page it also extracts the total results count (retrying
    with a fresh cookie jar when the count looks bogus).  Follows pagination
    until the ASIN is found or ``self.pages_limit`` is reached, then returns
    the item with ``rank`` and ``page`` filled in.
    """
    etree = get_etree(response)
    if etree.xpath(".//*[@id='noResultsTitle']"):
        logger.info('No results for %s', response.url)
        return
    item = response.meta['item']
    if 'results_count' not in item:
        text = etree.xpath(xpath['results_count'])
        try:
            # "over N results" is an estimate, not a real count — treat it
            # like a parse failure and retry.
            if 'over' in text:
                raise ValueError
            # BUG FIX: raw string — '\d' in a plain literal is an invalid
            # escape sequence (DeprecationWarning / SyntaxWarning).
            numbers = re.findall(r'\d+', text.replace(',', ''))
            item['results_count'] = numbers[-1]
        except (IndexError, ValueError):
            logger.info('No real results "%s" for "%s" "%s"', text,
                        item['asin'], item['keyword'])
            # Retry the same request under a fresh cookie jar.
            rq = response.request.copy()
            self.cookiejar += 1
            rq.meta['cookiejar'] = self.cookiejar
            return rq
    search_rank = '0'
    for url in etree.xpath(xpath['links']):
        if item['asin'] in url:
            try:
                search_rank = re.findall(r'&sr=\d*?-(\d*)&', url)[0]
            except IndexError:
                pass
            break
    next_page = etree.xpath(xpath['pagination'])
    logger.debug('Next page: "%s"', next_page)
    try:
        # BUG FIX: parse_qsl must be fed the query string, not the whole
        # URL — previously the 'page' key was never found, the KeyError
        # made `more` False, and pages_limit was never actually compared.
        more = bool(
            search_rank == '0' and next_page and not int(
                dict(urlparse.parse_qsl(urlparse.urlparse(next_page).query))
                ['page']) == self.pages_limit)
    except KeyError:
        more = False
    if more:
        return clone_from_response(
            Request(next_page.split('&spIA=')[0], dont_filter=True),
            response,
        )
    item['rank'] = search_rank
    try:
        # BUG FIX: same parse_qsl-on-full-URL bug — item['page'] always fell
        # back to 1 because the 'page' key was never found in the raw URL.
        item['page'] = int(
            dict(urlparse.parse_qsl(urlparse.urlparse(response.url).query))
            ['page'])
    except KeyError:
        item['page'] = 1
    return item  # I see no reason for Items here
def process_captcha(url):
    """Download the captcha image at *url*, solve it, and re-submit the
    original form with the answer.

    NOTE(review): Twisted inline-deferred style generator (``yield`` waits
    on the engine download) — presumably wrapped in ``defer.inlineCallbacks``
    by the caller; confirm.  Relies on ``spider``, ``response`` and
    ``retries`` from the enclosing scope.
    """
    # Fetch the captcha image directly through the engine, bypassing the
    # scheduler/dupe filter.
    image = yield spider.crawler.engine.download(
        Request(url, dont_filter=True), spider)
    # binarized() presumably OCRs the raw image bytes into the answer text
    # — TODO confirm its contract.
    answer = binarized(image.body)
    logger.debug('Captcha answer is: %s', answer)
    request = clone_from_response(
        FormRequest.from_response(
            response,
            formdata={'field-keywords': answer},
            # Route the retry back to whatever callback originally hit the
            # captcha wall.
            callback=response.request.callback,
            dont_filter=True,
        ),
        response,
    )
    # Keep the retry count and raw image/answer around for debugging and
    # for an eventual give-up threshold.
    request.meta.update({
        '_captcha_retries': retries + 1,
        '_captcha_img': image.body,
        '_captcha_answer': answer
    })
    try:
        # Inject the solved request straight into the engine.
        spider.crawler.engine.crawl(request, spider)
    except Exception as e:
        logger.error(e)