예제 #1
0
 def get_offers(self, response):
     """Parse one page of the offer listing.

     For every offer row: yield an add-to-cart FormRequest when the row
     has an add form, otherwise yield the bare offer item.  After the
     rows, follow the next offer page (once per page).
     """
     etree = get_etree(response)
     # The original product-page response, carried through the whole
     # pagination chain so FormRequest.from_response can locate the
     # addToCart form on it.
     base_response = response.meta['item']['_response']
     for offer in etree.xpath(offer_xpath['row']):
         item = response.meta['item'].copy()
         item['seller'] = offer.xpath(offer_xpath['seller'])
         item['price'] = offer.xpath(offer_xpath['price'])
         item['stars'] = offer.xpath(offer_xpath['stars'])
         if offer.xpath(offer_xpath['form']):
             yield clone_from_response(
                 FormRequest.from_response(
                     base_response,
                     formid="addToCart",
                     formdata={
                         'quantity':
                         self.max_quantity,
                         'offerListingID':
                         offer.xpath(offer_xpath['offering_id'])
                     },
                     callback=self.add_product), response,
                 {'_loader': SpiderLoader(item=OfferItem())})
         else:
             logger.info('No add button for %s', response.url)
             yield load_from_dict(SpiderLoader(item=OfferItem()), item)
     # FIX: pagination used to live inside the offer loop, yielding the
     # same next-page request once per offer (relying on the dupe filter
     # to drop the copies) and never firing on a page with zero offers.
     # Follow the next page exactly once per parsed page instead.
     next_page = etree.xpath(offer_xpath['next_page'])
     if next_page:
         yield clone_from_response(
             Request(next_page, self.get_offers),
             response,
             {'_response': base_response},
         )
예제 #2
0
 def add_product(self, response):
     """Parse the add-to-cart result page and extract the available
     quantity for the current item.

     Retries with a fresh cookie jar when the result is suspect:
     - zero quantity (up to ``self.max_add_product_retries`` times);
     - a quantity that differs from the previous attempt's value.

     Returns a new Request on retry, or the loaded item on success.
     """
     retries = response.meta.get('_add_product_retries', 0)
     item = response.meta['item']
     if retries:
         logger.info('Retry %s for %s', retries, item['url'])
     loader = item['_loader']
     etree = get_etree(response)
     try:
         # Quantity appears as e.g. "... (5 items)"; keep the raw
         # findall result on the item, parse the int locally.
         item['quantity'] = re.findall(
             r' \((\d+) items?\)', etree.xpath(product_xpath['quantity']))
         quantity_value = int(item['quantity'][0])
     except Exception:
         # IndexError (no match) or ValueError (bad int) both land here;
         # nothing is returned, the response is dropped.
         logger.exception("Can't parse quantity for %s", item['asin'])
     else:
         if not quantity_value:
             logger.warning(
                 'Zero quantity found for %s, retry %s, payload:\n %s',
                 item['url'], retries,
                 pprint.pformat(
                     dict(urlparse.parse_qsl(response.request.body))))
             if retries != self.max_add_product_retries:
                 request = clone_from_response(
                     Request(self.url_template.format(item['asin']),
                             headers={'Referer': self.default_referer},
                             dont_filter=True),
                     response,
                 )
                 request.meta['_add_product_retries'] = retries + 1
                 # Fresh cookie jar so the retry starts a clean session.
                 self.jar_cnt += 1
                 request.meta['cookiejar'] = self.jar_cnt
                 return request
         if response.meta['item']['_previous_quantity'][0]:  # Existed item
             response.meta['item']['_previous_quantity'].append(
                 quantity_value)
             if response.meta['item']['_previous_quantity'][
                     -1] != response.meta['item']['_previous_quantity'][-2]:
                 # quantity changed or we got different results in 2 last requests
                 logger.info('Quantity changes for %s : %s',
                             response.meta['item']['asin'],
                             response.meta['item']['_previous_quantity'])
                 request = response.request.copy()
                 request.meta['item']['_previous_quantity'] = response.meta[
                     'item']['_previous_quantity']
                 self.jar_cnt += 1
                 request.meta['cookiejar'] = self.jar_cnt
                 return request  # Try again
             elif len(response.meta['item']['_previous_quantity']) > 2:
                 logger.debug('Quantity changes for %s : %s',
                              response.meta['item']['asin'],
                              response.meta['item']['_previous_quantity'])
         if self.save_html:
             # FIX: dropped the no-op "+ str()" (concatenating an empty
             # string) from the filename expression.
             # NOTE(review): response.body is bytes on Python 3 / modern
             # Scrapy -- 'w' mode assumes a str body; confirm runtime.
             with open(
                     'html/{}_{}.html'.format(item['asin'],
                                              self.timestamp), 'w') as f:
                 f.write(response.body)
         return load_from_dict(loader, item)
예제 #3
0
 def parse(self, response):
     """Parse a product page: extract item fields, add the product to the
     cart to learn its quantity, optionally follow the offers page and
     variation ASINs.
     """
     etree = get_etree(response)
     item = response.meta['item']
     item['asin'] = re.findall(r'/dp/(\w+)', response.url)[0]
     item['url'] = response.url
     for k, xpath in item_xpath.items():
         item[k] = etree.xpath(xpath)
     if etree.xpath(site_xpath['add_to_card']):
         # Pre-collect the form's own hidden inputs, then force the
         # quantity we want.
         formdata = {
             field.name: field.value
             for field in etree.xpath(".//form[@id='addToCart']/input")
         }
         formdata.update({'quantity': self.max_quantity})
         yield clone_from_response(
             FormRequest.from_response(
                 response,
                 formid="addToCart",
                 formdata=formdata,
                 callback=self.add_product,
                 dont_filter=True,
             ), response, {'_loader': SpiderLoader()})
         url = etree.xpath(site_xpath['link'])
         if url and self.scrape_offers:
             yield clone_from_response(Request(url, self.get_offers),
                                       response, {'_response': response})
     else:
         logger.info('No add button for %s', response.url)
         yield load_from_dict(SpiderLoader(), item)
     if self.scrape_variations:
         # FIX: '-' binds tighter than '|', so the original expression
         # subtracted asins_done only from the link_asins set and
         # re-yielded already-done ASINs found via the drop-down or
         # twister widgets. Parenthesize the union before subtracting.
         asins = ({asin.split(',')[-1] for asin in etree.xpath(site_xpath['drop_down_asins'])}
                  | {asin.split('/')[2] for asin in etree.xpath(site_xpath['twister_asins'])}
                  | set(etree.xpath(site_xpath['link_asins']))) \
                 - self.asins_done
         for asin in asins:
             yield clone_from_response(
                 Request(self.url_template.format(asin)), response)
             self.asins_done.add(asin)
예제 #4
0
 def parse(self, response):
     """Parse a search-results page: record the total results count and
     the search rank of the target ASIN, paginating (up to
     ``self.pages_limit``) until the ASIN is found.

     Returns a retry Request on a suspect results count, a next-page
     Request while still searching, or the finished item dict.
     """
     etree = get_etree(response)
     if etree.xpath(".//*[@id='noResultsTitle']"):
         logger.info('No results for %s', response.url)
         return
     item = response.meta['item']
     if 'results_count' not in item:
         text = etree.xpath(xpath['results_count'])
         try:
             # "over 1,000 results" is an estimate, not a real count --
             # treat it like a parse failure and retry.
             if 'over' in text:
                 raise ValueError
             numbers = re.findall(r'\d+', text.replace(',', ''))
             item['results_count'] = numbers[-1]
         except (IndexError, ValueError):
             logger.info('No real results "%s" for "%s" "%s"', text,
                         item['asin'], item['keyword'])
             rq = response.request.copy()
             # Fresh cookie jar for the retry.
             self.cookiejar += 1
             rq.meta['cookiejar'] = self.cookiejar
             return rq
     search_rank = '0'
     for url in etree.xpath(xpath['links']):
         if item['asin'] in url:
             try:
                 search_rank = re.findall(r'&sr=\d*?-(\d*)&', url)[0]
             except IndexError:
                 pass
             break
     next_page = etree.xpath(xpath['pagination'])
     logger.debug('Next page: "%s"', next_page)
     try:
         # FIX: parse_qsl must be fed the query string, not the whole
         # URL -- with a full URL the first parameter's key is polluted
         # with the scheme/path prefix ("https://.../s?page" instead of
         # "page") and the lookup spuriously raises KeyError.
         more = bool(
             search_rank == '0' and next_page
             and int(dict(urlparse.parse_qsl(
                 urlparse.urlparse(next_page).query))['page'])
             != self.pages_limit)
     except KeyError:
         more = False
     if more:
         return clone_from_response(
             Request(next_page.split('&spIA=')[0], dont_filter=True),
             response,
         )
     item['rank'] = search_rank
     try:
         item['page'] = int(dict(urlparse.parse_qsl(
             urlparse.urlparse(response.url).query))['page'])
     except KeyError:
         item['page'] = 1
     return item  # I see no reason for Items here
예제 #5
0
 def process_captcha(url):
     """Download the captcha image at *url*, solve it and re-submit the
     original page's form with the answer.

     NOTE(review): ``spider``, ``response`` and ``retries`` are not
     parameters -- they come from an enclosing scope not visible in this
     chunk. The ``yield`` on engine.download suggests an inline-requests
     style coroutine; confirm against the decorator/caller.
     """
     # Fetch the captcha image through the engine; dont_filter because
     # captcha image URLs may repeat across attempts.
     image = yield spider.crawler.engine.download(
         Request(url, dont_filter=True), spider)
     # binarized() turns the image bytes into the captcha answer string.
     answer = binarized(image.body)
     logger.debug('Captcha answer is: %s', answer)
     # Re-issue the captcha form with the solved answer, keeping the
     # original request's callback so processing resumes where it left off.
     request = clone_from_response(
         FormRequest.from_response(
             response,
             formdata={'field-keywords': answer},
             callback=response.request.callback,
             dont_filter=True,
         ),
         response,
     )
     # Track the retry count and keep the image/answer pair around for
     # later debugging of wrong answers.
     request.meta.update({
         '_captcha_retries': retries + 1,
         '_captcha_img': image.body,
         '_captcha_answer': answer
     })
     try:
         # Schedule directly on the engine; errors are logged, not raised,
         # so a failed reschedule does not kill the coroutine.
         spider.crawler.engine.crawl(request, spider)
     except Exception as e:
         logger.error(e)