def __init__(self, captcha_retries='10', *args, **kwargs):
    super(AmazonProductsSpider, self).__init__(*args, **kwargs)
    self.captcha_retries = int(captcha_retries)
    self.mtp_class = Amazon_marketplace(self)
    self._cbw = CaptchaBreakerWrapper()
def __init__(self, *args, **kwargs):
    # For some reason amazon fails to scrape most data
    # when you turn off variants
    self.ignore_variant_data = False

    self.product_url = kwargs['product_url']

    # See https://bugzilla.contentanalyticsinc.com/show_bug.cgi?id=3313#c0
    self.num_pages = int(kwargs.get('num_pages', 1))

    # # variants are switched off by default, see Bugzilla 3982#c11
    # self.scrape_variants_with_extra_requests = False
    # if 'scrape_variants_with_extra_requests' in kwargs:
    #     scrape_variants_with_extra_requests = \
    #         kwargs['scrape_variants_with_extra_requests']
    #     if scrape_variants_with_extra_requests in \
    #             (1, '1', 'true', 'True', True):
    #         self.scrape_variants_with_extra_requests = True

    # Default price currency
    self.price_currency = 'USD'
    self.price_currency_view = '$'

    # Locale
    self.locale = 'en-US'

    self.mtp_class = Amazon_marketplace(self)
    self._cbw = CaptchaBreakerWrapper()

    # # backup when total matches cannot be scraped
    # self.total_items_scraped = 0
    # # self.ranking_override = 0
    self.total_matches_re = r'of\s([\d\,]+)\s'

    super(AmazonShelfPagesSpider, self).__init__(*args, **kwargs)
    self._setup_class_compatibility()
    # self.remaining = self.quantity

    # For goldbox deals
    self.deal_response_json_list = []
    self.deal_product_url_list = []
    self.sorted_goldbox_deals_ids = []
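# A minimal sketch of the kwargs contract the shelf-pages __init__ above
# expects (toy values; the node URL is a placeholder): 'product_url' is
# mandatory (KeyError otherwise) and 'num_pages' may arrive as a string
# from the command line, hence the int() cast.
kwargs = {'product_url': 'http://www.amazon.com/b?node=1234567011',
          'num_pages': '3'}
print(int(kwargs.get('num_pages', 1)))  # -> 3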
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []
    handle_httpstatus_list = [404]
    MAX_RETRIES = 3
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')
    page = 1

    def __init__(self, url_formatter=None, client_url=None, file_name=None,
                 product_asins=None, captcha_retries='10', *args, **kwargs):
        self.SEARCH_URL = client_url
        super(AmazonSpider, self).__init__(*args, **kwargs)
        if file_name:
            self.file_name = file_name
        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter
        product_asins = json.loads(product_asins)
        self.product_asins = product_asins['asins']
        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider, so it is
        overridden and "disabled" by making it raise an exception
        unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        meta = {'asins': self.product_asins}
        yield Request(self.SEARCH_URL, meta=meta)

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        item = AmazonspiderItem()
        if response.status == 404:
            item['error_message'] = '404 Invalid URL'
            return item
        item['client_url'] = response.url
        meta = response.meta.copy()
        meta['item'] = item
        reviews_url = response.xpath(
            '//a[@class="a-link-normal"]/span[contains(text(),'
            ' "Reviews")]/../@href[contains(.,"member-reviews")]'
        ).extract()
        if reviews_url:
            reviews_url = 'http://www.amazon.com' + reviews_url[0]
            return Request(reviews_url, meta=meta, callback=self.parse_reviews)
        else:
            item['error_message'] = 'Amazon blocked, try again'
            return item

    def parse_reviews(self, response):
        self.log('PARSE REVIEWS', level=DEBUG)
        products_asins = response.meta.get('asins')
        item = response.meta.get('item')
        review_asins = response.xpath(
            '//table[@class="small"]/tr/td/b/a/@href').re('dp/(.*)/ref')
        find_asins = []
        for asin in review_asins:
            if asin in products_asins:
                find_asins.append(asin)
        if 'asins' not in item.keys():
            item['asins'] = find_asins
        else:
            item['asins'].extend(find_asins)
        self.page += 1
        next_page_url = response.xpath(
            '//td[@class="small"]/b/a[contains(@href,"page='
            + str(self.page) + '")]/@href').extract()
        if next_page_url:
            next_page_url = 'http://www.amazon.com' + next_page_url[0]
            meta = response.meta.copy()
            meta['item'] = item
            yield Request(next_page_url, meta=meta,
                          callback=self.parse_reviews)
        else:
            yield item

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try), level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback, dont_filter=True,
                meta=meta)
        return result
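# A minimal launch sketch for the reviews spider above, assuming Scrapy's
# CrawlerProcess API; the profile URL and ASINs are placeholders, not real
# data. product_asins must be a JSON string with an 'asins' key, exactly as
# __init__ expects.
import json
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': AmazonSpider.user_agent})
process.crawl(
    AmazonSpider,
    client_url='http://www.amazon.com/gp/cdp/member-reviews/EXAMPLE',
    product_asins=json.dumps({'asins': ['B00EXAMPLE0', 'B00EXAMPLE1']}),
)
process.start()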
class AmazonProductsSpider(BaseProductsSpider):
    name = 'amazon_products'
    allowed_domains = ["amazon.com"]

    SEARCH_URL = "http://www.amazon.com/s/?field-keywords={search_term}"

    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)
        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = super(AmazonProductsSpider, self).parse(response)
        return result

    def parse_product(self, response):
        prod = response.meta['product']

        if not self._has_captcha(response):
            self._populate_from_js(response, prod)
            self._populate_from_html(response, prod)
            cond_set_value(prod, 'locale', 'en-US')  # Default locale.
            result = prod
        elif response.meta.get('captcha_solve_try', 0) >= self.captcha_retries:
            self.log("Giving up on trying to solve the captcha challenge after"
                     " %s tries for: %s" % (self.captcha_retries, prod['url']),
                     level=WARNING)
            result = None
        else:
            result = self._handle_captcha(response, self.parse_product)
        return result

    def _populate_from_html(self, response, product):
        cond_set(product, 'brand', response.css('#brand ::text').extract())
        cond_set(
            product,
            'price',
            response.css('#priceblock_ourprice ::text').extract(),
        )
        cond_set(
            product,
            'description',
            response.css('.productDescriptionWrapper').extract(),
        )
        cond_set(
            product,
            'image_url',
            response.css(
                '#imgTagWrapperId > img ::attr(data-old-hires)').extract()
        )
        cond_set(
            product, 'title', response.css('#productTitle ::text').extract())

        # Some data is in a list (ul element).
        model = None
        for li in response.css('td.bucket > .content > ul > li'):
            raw_keys = li.xpath('b/text()').extract()
            if not raw_keys:
                # This is something else, ignore.
                continue
            key = raw_keys[0].strip(' :').upper()
            if key == 'UPC':
                # Some products have several UPCs. The first one is used.
                raw_upc = li.xpath('text()').extract()[0]
                cond_set(
                    product,
                    'upc',
                    raw_upc.strip().split(' '),
                    conv=int
                )
            elif (key == 'ASIN' and model is None) \
                    or key == 'ITEM MODEL NUMBER':
                model = li.xpath('text()').extract()
        cond_set(product, 'model', model, conv=string.strip)

    def _populate_from_js(self, response, product):
        # Images are not always on the same spot...
        img_jsons = response.css(
            '#landingImage ::attr(data-a-dynamic-image)').extract()
        if img_jsons:
            img_data = json.loads(img_jsons[0])
            cond_set_value(
                product,
                'image_url',
                max(img_data.items(), key=lambda (_, size): size[0]),
                conv=lambda (url, _): url)

    def _scrape_total_matches(self, response):
        # Where this value appears is a little weird and changes a bit so we
        # need two alternatives to capture it consistently.
        if response.css('#noResultsTitle'):
            return 0
        # The first possible place is where it normally is in a fully
        # rendered page.
        values = response.css('#resultCount > span ::text').re(
            '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
        if not values:
            # Otherwise, it appears within a comment.
            values = response.css(
                '#result-count-only-next'
            ).xpath(
                'comment()'
            ).re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+'
            )
        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            self.log(
                "Failed to parse total number of matches for: %s"
                % response.url,
                level=ERROR
            )
            total_matches = None
        return total_matches

    def _scrape_product_links(self, response):
        links = response.css('.prod > h3 > a ::attr(href)').extract()
        if not links:
            self.log("Found no product links.", WARNING)
        for link in links:
            yield link, SiteProductItem()

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        # FIXME: This is untested.
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        product = response.meta['product']
        self.log("Captcha challenge for %s (try %d)."
                 % (product['url'], captcha_solve_try),
                 level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    product['url'], captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, product['url']),
                level=INFO
            )
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback)
            result.meta['captcha_solve_try'] = captcha_solve_try + 1
            result.meta['product'] = product

        return result
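# A self-contained check of the result-count pattern used in
# _scrape_total_matches above (the sample strings are made up; real page
# wording may differ):
import re

for sample in ('1-16 of 1,234 Results',
               'Showing 1 - 24 of 2,345,678 Results'):
    m = re.search('\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults', sample)
    print(int(m.group(1).replace(',', '')))  # -> 1234, then 2345678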
class UrlServiceSpider(Spider):
    name = "url_service"
    allowed_domains = []
    start_urls = []

    def __init__(self, limit='100', service_url=None, captcha_retries='10',
                 *args, **kwargs):
        super(UrlServiceSpider, self).__init__(*args, **kwargs)

        if service_url is None:
            raise AssertionError("Service URL is not optional.")

        self.limit = limit
        self.captcha_retries = int(captcha_retries)
        self.service_url = service_url
        self._cbw = CaptchaBreakerWrapper()

        queue_url = urlparse.urljoin(
            self.service_url, 'get_queued_urls/?limit=%d&block=%d') \
            % (int(limit), 0)
        self.log("Fetching URLs with '%s'." % queue_url, level=DEBUG)
        self.start_urls.append(queue_url)

    def parse(self, response):
        for crawl_data in json.loads(response.body):
            self.log("From URL Service: %s" % crawl_data, DEBUG)
            url = crawl_data['url']
            req = Request(url, callback=self.parse_target,
                          errback=self.parse_target_err)
            req.meta['crawl_data'] = crawl_data
            req.meta['start_time'] = time.clock()
            yield req

    def parse_target(self, response):
        if not self._has_captcha(response.body):
            result = self._parse_target(response)
        elif response.meta.get('captcha_solve_try', 0) >= self.captcha_retries:
            # We already tried to solve the captcha, give up.
            result = RequestErrorItem(
                base_url=self.service_url,
                id=response.meta['crawl_data']['id'],
                http_code=response.status,
                error_string="Failed to solve captcha.")
        else:
            result = self._handle_captcha(response)
        return result

    def _parse_target(self, response):
        crawl_data = response.meta['crawl_data']

        body = None
        if hasattr(response, 'body_as_unicode'):
            body = response.body_as_unicode().encode('utf-8')
        else:
            body = response.body  # Probably binary or incorrect Content-Type.

        item = PageItem(
            base_url=self.service_url,
            total_time=time.clock() - response.meta['start_time'],
            id=crawl_data['id'],
            url=crawl_data['url'],
            imported_data_id=crawl_data['imported_data_id'],
            category_id=crawl_data['category_id'],
            body=body)
        return item

    def _handle_captcha(self, response):
        crawl_data = response.meta['crawl_data']
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)

        self.log("Captcha challenge for %s (try %d)."
                 % (crawl_data.get('url'), captcha_solve_try),
                 level=INFO)

        forms = Selector(response).xpath('//form')
        assert len(forms) == 1, "More than one form found."
        hidden_value1 = forms[0].xpath(
            '//input[@name="amzn"]/@value').extract()[0]
        hidden_value2 = forms[0].xpath(
            '//input[@name="amzn-r"]/@value').extract()[0]
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha values: (%s) (%s) (%s)"
                 % (hidden_value1, hidden_value2, captcha_img), level=DEBUG)

        captcha = self._solve_captcha(captcha_img)

        if captcha is None:
            err_msg = "Failed to guess captcha for '%s' (id: %s, try: %d)." % (
                crawl_data.get('url'), crawl_data.get('id'),
                captcha_solve_try)
            self.log(err_msg, level=ERROR)
            result = RequestErrorItem(
                base_url=self.service_url,
                id=crawl_data['id'],
                http_code=response.status,
                error_string=err_msg)
        else:
            self.log("Submitting captcha '%s' for '%s' (try %d)."
                     % (captcha, captcha_img, captcha_solve_try),
                     level=INFO)
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={
                    'field-keywords': captcha,
                },
                callback=self.parse_target,
                errback=self.parse_target_err)
            result.meta['captcha_solve_try'] = captcha_solve_try + 1
            result.meta['crawl_data'] = response.meta['crawl_data']
            result.meta['start_time'] = response.meta['start_time']

        return result

    def parse_target_err(self, failure):
        url_id = failure.request.meta['crawl_data']['id']
        error_string = failure.getErrorMessage()
        if isinstance(failure.value, HttpError):
            status = failure.value.response.status
        else:
            status = 0
            self.log("Unhandled failure type '%s'. Will continue."
                     % type(failure.value), level=ERROR)
        item = RequestErrorItem(
            base_url=self.service_url,
            id=url_id,
            http_code=status,
            error_string=error_string)
        return item

    def _has_captcha(self, body):
        return '.images-amazon.com/captcha/' in body

    def _solve_captcha(self, captcha_url):
        return self._cbw.solve_captcha(captcha_url)
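# How __init__ above assembles the queue URL, in isolation (the service
# endpoint is a made-up example):
import urlparse  # Python 2 stdlib, as used by the spider

service_url = 'http://url-service.example.com/'
queue_url = urlparse.urljoin(
    service_url, 'get_queued_urls/?limit=%d&block=%d') % (100, 0)
print(queue_url)
# -> http://url-service.example.com/get_queued_urls/?limit=100&block=0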
class AmazonBaseClass(Spider):

    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonBaseClass, self).__init__(*args, **kwargs)
        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_links(self, response):
        """
        Handles parsing of a top reviewers page.
        :param response:
        :return: ReviewItem's with Rank
        """
        raise NotImplementedError

    def parse_profile(self, response):
        """
        Handles parsing of a reviewer profile page.
        :param response:
        :return: ReviewItem's with Email, Name and Country
        """
        raise NotImplementedError

    def parse_without_captcha(self, response):
        if not self._has_captcha(response):
            res = self.parse_links(response)
            for i in res:
                yield i
        else:
            result = self._handle_captcha(response,
                                          self.parse_without_captcha)
            yield result

    def parse_email(self, response):
        if not self._has_captcha(response):
            result = self.parse_profile(response)
            if result:
                return result
        else:
            result = self._handle_captcha(response, self.parse_email)
            return result

    # Captcha handling functions.
    def _has_captcha(self, response):
        return 'images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try), level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log("Failed to guess captcha for '%s' (try: %d)."
                     % (url, captcha_solve_try), level=ERROR)
            result = None
        else:
            self.log("On try %d, submitting captcha '%s' for '%s'."
                     % (captcha_solve_try, captcha, url), level=INFO)
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback, dont_filter=True,
                meta=meta)
        return result
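# A minimal subclass sketch: AmazonBaseClass only supplies the captcha
# plumbing above and leaves parse_links/parse_profile abstract. The spider
# name, start URL and XPaths here are hypothetical placeholders, not the
# real top-reviewers markup.
import urlparse

from scrapy.http import Request


class TopReviewersExampleSpider(AmazonBaseClass):
    name = 'top_reviewers_example'
    allowed_domains = ['amazon.com']
    start_urls = ['http://www.amazon.com/review/top-reviewers']

    def parse_links(self, response):
        # Runs once any captcha is out of the way (via parse_without_captcha);
        # yields follow-up requests, or ReviewItem's carrying Rank.
        for href in response.xpath(
                '//a[contains(@href, "/profile/")]/@href').extract():
            yield Request(urlparse.urljoin(response.url, href),
                          callback=self.parse_email)

    def parse_profile(self, response):
        # Would build a ReviewItem with Email, Name and Country here.
        return None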
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []
    SEARCH_URL = 'http://www.amazon.com/s/ref=sr_as_oo?' \
        'rh=i%3Aaps%2Ck%3A{search_term}&keywords={search_term}'

    MAX_RETRIES = 3

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) Gecko'
                  '/20100101 Firefox/35.0')

    USER_AGENTS = {
        'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                   'Gecko/20100101 Firefox/35.0',
        'desktop': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                   'Gecko/20100101 Firefox/35.0',
        'iphone_ipad': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_6 '
                       'like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) '
                       'Version/7.0 Mobile/11B651 Safari/9537.53',
        'android_phone': 'Mozilla/5.0 (Android; Mobile; rv:35.0) '
                         'Gecko/35.0 Firefox/35.0',
        'android_pad': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '
                       'Gecko/35.0 Firefox/35.0',
        'android': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '
                   'Gecko/35.0 Firefox/35.0',
    }

    def __init__(self, url_formatter=None, quantity=None, page=None,
                 searchterms_str=None, searchterms_fn=None,
                 site_name=None, product_url=None, user_agent=None,
                 captcha_retries='10', *args, **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS.keys():
            self.log("User agent type not available or not set."
                     " The default user agent will be used.", INFO)
            user_agent = 'default'
        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.", INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            with open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log("Created for %s with %d search terms."
                 % (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider, so it is
        overridden and "disabled" by making it raise an exception
        unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                meta={'search_term': st, 'remaining': self.quantity},
            )

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        if self._search_page_error(response):
            remaining = response.meta['remaining']
            search_term = response.meta['search_term']
            self.log("For search term '%s' with %d items remaining,"
                     " failed to retrieve search page: %s"
                     % (search_term, remaining, response.request.url),
                     WARNING)
        else:
            prods_count = -1  # Also used after the loop.
            for prods_count, request_or_prod in enumerate(
                    self._get_products(response)):
                yield request_or_prod
            prods_count += 1  # Fix counter.

            request = self._get_next_products_page(response, prods_count)
            if request is not None:
                yield request

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        total_matches = response.meta.get('total_matches')

        prods = self._scrape_product_links(response)

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log(
                            "Failed to parse total matches for %s"
                            % response.url, ERROR)

        for i, prod_item in enumerate(islice(prods, 0, remaining)):
            prod_item['keyword'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['rank'] = (i + 1) + (self.quantity - remaining)
            yield prod_item

    def _get_next_products_page(self, response, prods_found):
        page_number = int(response.meta.get('page_number', 1))
        link_page_attempt = response.meta.get('link_page_attempt', 1)

        result = None
        if prods_found is not None:
            # This was a real product listing page.
            if page_number < self.page:
                remaining = response.meta['remaining']
                remaining -= prods_found
                next_page = self._scrape_next_results_page_link(response)
                if next_page is None:
                    pass
                else:
                    url = urlparse.urljoin(response.url, next_page)
                    new_meta = dict(response.meta)
                    new_meta['remaining'] = remaining
                    new_meta['page_number'] = page_number + 1
                    result = Request(url, self.parse, meta=new_meta,
                                     priority=1)
        elif link_page_attempt > self.MAX_RETRIES:
            self.log(
                "Giving up on results page after %d attempts: %s" % (
                    link_page_attempt, response.request.url),
                ERROR
            )
        else:
            self.log(
                "Will retry to get results page (attempt %d): %s" % (
                    link_page_attempt, response.request.url),
                WARNING
            )
            # Found no product links. Probably a transient error, let's retry.
            new_meta = response.meta.copy()
            new_meta['link_page_attempt'] = link_page_attempt + 1
            result = response.request.replace(
                meta=new_meta, cookies={}, dont_filter=True)
        return result

    def _scrape_total_matches(self, response):
        if response.css('#noResultsTitle'):
            return 0

        values = response.css('#s-result-count ::text').re(
            '([0-9,]+)\s[Rr]esults for')
        if not values:
            values = response.css('#resultCount > span ::text').re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
        if not values:
            values = response.css(
                '#result-count-only-next'
            ).xpath(
                'comment()'
            ).re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+'
            )

        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            if not self.is_nothing_found(response):
                self.log(
                    "Failed to parse total number of matches for: %s"
                    % response.url,
                    level=ERROR
                )
            total_matches = None
        return total_matches

    def _scrape_product_links(self, response):
        products = response.xpath('//li[@class="s-result-item"]')
        for pr in products:
            if pr.xpath('.//h5[contains(@class, '
                        '"s-sponsored-list-header")]'):
                continue
            product = ProductItem()
            cond_set(product, 'title', pr.xpath('.//h2/../@title').extract())
            cond_set(product, 'product_image',
                     pr.xpath('.//img[@alt="Product Details"]/@src')
                     .extract())
            cond_set(product, 'brand', pr.xpath(
                './/div[@class="a-fixed-left-grid-col a-col-right"]'
                '/div/div/span[2]/text()').extract())
            cond_set(product, 'price', pr.xpath(
                './/span[contains(@class,"s-price")]/text()'
            ).extract())
            cond_set(product, 'asin', pr.xpath('@data-asin').extract())
            if pr.xpath('.//i[contains(@class, "a-icon-prime")]'):
                cond_set_value(product, 'prime', True)
            else:
                cond_set_value(product, 'prime', False)
            cond_set(product, 'shipping_price', pr.xpath(
                './/span[contains(@class,"s-price")]/'
                'following::span[2]/text()').re('(\d+.?\d+) shipping'))
            new = pr.xpath('.//a[contains(text(),"new")]/span/text()')
            if new:
                cond_set(product, 'new_price', new.extract())
                cond_set(product, 'new_offers', new[1].re('\d+'))
            used = pr.xpath('.//a[contains(text(),"used")]/span/text()')
            if used:
                cond_set(product, 'used_price', used.extract())
                cond_set(product, 'used_offers', used[1].re('\d+'))
            cond_set(product, 'rating', pr.xpath(
                './/span[contains(@name,"' + product['asin'] + '")]'
                '/span/a/i/span').re('(\d+.?\d+)'))
            cond_set(product, 'number_of_reviews', pr.xpath(
                './/span[contains(@name,"' + product['asin'] + '")]/'
                'following::a[1]/text()').re('([\d+,?]+\d+)'))
            cond_set(product, 'category', pr.xpath(
                './/span[contains(@class,"a-text-bold")]/text()'
            ).re('(.*):'))
            number_of_items = pr.xpath(
                './/span[contains(@class,"a-text-bold")]/../text()'
            ).re('([\d+,?]+\d+)')
            if number_of_items:
                cond_set_value(product, 'number_of_items',
                               number_of_items[0])
            # product['url'] = pr.xpath('.//h2/../@href')[0].extract()
            # cond_set(product, 'url', pr.xpath('.//h2/../@href').extract())
            yield product

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    def is_nothing_found(self, response):
        txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract()
        txt = ''.join(txt)
        return 'did not match any products' in txt

    def _search_page_error(self, response):
        body = response.body_as_unicode()
        return "Your search" in body \
            and "did not match any products." in body

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try), level=INFO)

        captcha = self._solve_captcha(response)

        if captcha is None:
            self.log(
                "Failed to guess captcha for '%s' (try: %d)." % (
                    url, captcha_solve_try),
                level=ERROR
            )
            result = None
        else:
            self.log(
                "On try %d, submitting captcha '%s' for '%s'." % (
                    captcha_solve_try, captcha, url),
                level=INFO
            )
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback, dont_filter=True,
                meta=meta)
        return result
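# The rank assigned in _get_products above, checked with toy numbers: with
# quantity=100 and 24 products already consumed on earlier pages
# (remaining=76), the first product on the current page ranks 25.
quantity, remaining = 100, 76
for i in range(3):
    print((i + 1) + (quantity - remaining))  # -> 25, 26, 27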
class AmazonSpider(BaseSpider): name = "amazon" allowed_domains = ["amazon.com"] start_urls = [ "http://www.amazon.com/gp/site-directory/ref=sa_menu_top_fullstore" ] def __init__(self, outfile=None, test_category=None): self.outfile = outfile # if this is set, only crawl this category (level 2/1 category name). used for testing self.test_category = test_category # if test category is set and no output file was specified, set the name of outfile to a special "test" name if self.test_category and not self.outfile: self.outfile = "amazon_categories_test.jl" # level that is considered to contain departments self.DEPARTMENT_LEVEL = 2 # hardcoded toplevel categories (level 1 and 2) urls to replace/supplement some of the ones found on the sitemap above (point to the same category, but have different page content. they were found manually) # reason: they provide more info regarding product count than the ones found on the sitemap # keys are categories names as found in the sitemap, values are URLs associated with them, that will replace/supplement the links found on the sitemap self.EXTRA_TOPLEVEL_CATEGORIES_URLS = { "Baby" : "http://www.amazon.com/s/ref=lp_166835011_ex_n_1?rh=n%3A165796011&bbn=165796011&ie=UTF8&qid=1393338541", \ "Electronics & Computers" : "http://www.amazon.com/s/ref=lp_172659_ex_n_1?rh=n%3A172282&bbn=172282&ie=UTF8&qid=1393338741", \ "Home, Garden & Tools" : "http://www.amazon.com/s/ref=lp_284507_ex_n_1?rh=n%3A1055398&bbn=1055398&ie=UTF8&qid=1393338782",\ "Kindle E-readers & Books" : "http://www.amazon.com/s/ref=lp_154606011_ex_n_1?rh=n%3A133140011&bbn=133140011&ie=UTF8&qid=1395704970", \ "Apps & Games" : "http://www.amazon.com/b/ref=sd_allcat_fire_apps_games?ie=UTF8&node=3427287011", \ "Movies & TV" : "http://www.amazon.com/action-adventure-dvd-bluray/b/ref=MoviesHPBB_Genres_Action?ie=UTF8&node=2650363011&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-left-2&pf_rd_r=0GAWFEZ3EXP8PEYCM6X3&pf_rd_t=101&pf_rd_p=1753817742&pf_rd_i=2625373011", \ "All Beauty" : "http://www.amazon.com/s/ref=lp_11059031_ex_n_1?rh=n%3A3760911&bbn=3760911&ie=UTF8&qid=1395793680",\ "Health, Household & Baby Care" : "http://www.amazon.com/s/ref=lp_6183682011_ex_n_1?rh=n%3A3760901&bbn=3760901&ie=UTF8&qid=1395822180", \ "Tires & Wheels" : "http://www.amazon.com/s/ref=lp_353609011_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A15706571&bbn=15706571&ie=UTF8&qid=1395824546", \ "Motorcycle & Powersports" : "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181%2Cn%3A%2115690151%2Cn%3A346333011&bbn=346333011&ie=UTF8&qid=1395824599", \ "Automotive & Industrial" : "http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A15684181&bbn=15684181" # this is partial - "Automotive and industrial" also contains the "Industrial & Scientific" cats which can be found in the sitemap } # flag indicating whether to compute overall product counts in pipelines phase for this spider. 
# if on, 'catid' and 'parent_catid' fields need to be implemented self.compute_nrproducts = True # counter for department id, will be used to autoincrement department id self.department_count = 0 # counter for category id self.catid = 0 # level to stop crawling (don't extract subcategories below this level) self.LEVEL_BARRIER = -2 # maximum number of retries when presented with captcha form self.MAX_CAPTCHA_RETRY = 10 # dictionarties associating department names with other attributes - to use for setting parent category info for level 1 categories # associates department names with their ids self.departments_ids = {} # associates department names with their urls (will be available only for extra_categories) self.department_urls = {} # associate department names with their category ids self.departments_cat_ids = {} # captcha breaker self.CB = CaptchaBreakerWrapper() # solve the captcha on this page and redirect back to method that sent us here (callback) def solve_captcha_and_redirect(self, response, callback): hxs = HtmlXPathSelector(response) # solve captcha captcha_text = None image = hxs.select(".//img/@src").extract() if image: captcha_text = self.CB.solve_captcha(image[0]) # value to use if there was an exception if not captcha_text: captcha_text = '' # create a FormRequest to this same URL, with everything needed in meta # items, cookies and search_urls not changed from previous response so no need to set them again # redirect to initial URL #return [FormRequest.from_response(response, callback = callback, formdata={'field-keywords' : captcha_text})] meta = response.meta # decrease count for retry times left. if not set yet, this is first attempt, set it to MAX_CAPTCHA_RETRY response.meta['retry_count'] = response.meta[ 'retry_count'] - 1 if 'retry_count' in response.meta else self.MAX_CAPTCHA_RETRY return FormRequest.from_response( response, callback=callback, formdata={'field-keywords': captcha_text}, meta=meta) # test if page is form containing captcha def has_captcha(self, body): return '.images-amazon.com/captcha/' in body # check if 2 catgory names are the same # does some normalization of the names and compares the words in them # to be used for identifying EXTRA_TOPLEVEL_CATEGORIES_URLS when they occur in the sitemap def is_same_name(self, name1, name2): # eliminate non-word characters name1 = re.sub("[^a-zA-Z]", " ", name1).lower() name2 = re.sub("[^a-zA-Z]", " ", name2).lower() name1_words = name1.split() name2_words = name2.split() return set(name1_words) == set(name2_words) # find key in dict using is_same_name as equality function (return key from dict where is_same_name returns true for given target_key) def find_matching_key(self, target_key, dictionary): for key in dictionary: if self.is_same_name(target_key, key): return key return None # start parsing of top level categories extracted from sitemap; pass them to parseCategory def parse(self, response): hxs = HtmlXPathSelector(response) # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it if self.has_captcha( response.body) and ('retry_count' not in response.meta or response.meta['retry_count'] > 0): yield self.solve_captcha_and_redirect( response, self.parse ) # meta of response will contain number of retries left if set return links_level1 = hxs.select("//div[@id='siteDirectory']//table//a") titles_level1 = hxs.select("//div//table//h2") # add level 1 categories to items # first one is a special category ("Unlimited Instant Videos"), add it separately special_item = CategoryItem() 
special_item['text'] = titles_level1[0].select('text()').extract()[0] special_item['level'] = 2 special_item['special'] = 1 special_item['department_text'] = special_item['text'] special_item['department_id'] = self.department_count self.department_count += 1 special_item['catid'] = self.catid self.catid += 1 self.departments_ids[ special_item['text']] = special_item['department_id'] self.departments_cat_ids[special_item['text']] = special_item['catid'] #yield special_item # if test category is set, and this is not it, ignore if not self.test_category or special_item['text'] == self.test_category: yield special_item # the rest of the titles are not special for title in titles_level1[1:]: item = CategoryItem() item['text'] = title.select('text()').extract()[0] item['level'] = 2 item['department_text'] = item['text'] item['department_id'] = self.department_count self.department_count += 1 item['catid'] = self.catid self.catid += 1 self.departments_ids[item['text']] = item['department_id'] self.departments_cat_ids[item['text']] = item['catid'] # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, add info from that url extra_category = self.find_matching_key( item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS) if extra_category: item['url'] = self.EXTRA_TOPLEVEL_CATEGORIES_URLS[ extra_category] item['department_url'] = item['url'] self.department_urls[item['text']] = item['url'] # if self.test_category is set, only send request if this is the test category if self.test_category and item['text'] != self.test_category: continue # parse this category further yield Request(item['url'], callback=self.parseCategory, meta={'item': item}) else: # if test category is set and this is not it, ignore if self.test_category and item['text'] != self.test_category: continue yield item # add level 1 categories to items for link in links_level1: item = CategoryItem() item['text'] = link.select('text()').extract()[0] root_url = "http://www.amazon.com" item['url'] = root_url + link.select('@href').extract()[0] item['level'] = 1 parent = link.select( "parent::node()/parent::node()/preceding-sibling::node()") parent_text = parent.select('text()').extract() # category should have a parent (its department) and that parent should have been extracted earlier (above) and put in the ids dictionary, necessary for getting the department id assert parent_text assert parent_text[0] in self.departments_ids if parent_text: item['parent_text'] = parent_text[0] item['department_text'] = item['parent_text'] item['department_id'] = self.departments_ids[ item['department_text']] item['parent_catid'] = self.departments_cat_ids[ item['department_text']] item['catid'] = self.catid self.catid += 1 # get department url from department_urls, will be availble only for extra_categories if item['department_text'] in self.department_urls: assert self.find_matching_key( item['department_text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS) item['department_url'] = self.department_urls[ item['department_text']] item['parent_url'] = item['department_url'] #TODO: leave this or not? # Don't crawl subcategories of departments twice. If this is a department with url (extra_category), then we will crawl its subcategories. 
So ignore them here #continue # if its parent is the special category, mark this one as special too if (item['parent_text'] == special_item['text']): item['special'] = 1 special = True else: special = False # department_id = self.department_count # self.department_count += 1 # item['department_text'] = item['text'] # item['department_url'] = item['url'] # item['department_id'] = department_id # if self.test_category is set, only send request if this is the test category if self.test_category and item['text'] != self.test_category: continue yield Request(item['url'], callback=self.parseCategory, meta={'item': item}) # parse category and return item corresponding to it (for categories where URL available - level 2 and lower) def parseCategory(self, response): # if we are getting blocked by captcha, solve and redirect back here # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it if self.has_captcha( response.body) and ('retry_count' not in response.meta or response.meta['retry_count'] > 0): yield self.solve_captcha_and_redirect( response, self.parseCategory ) # meta of response will contain number of retries left if set return hxs = HtmlXPathSelector(response) # extract additional info for received parent and return it item = response.meta['item'] # extract product count if available and not already extracted (in extract_itemcount_and_subcategories, from menu of the left, without crawling the actual url) if 'nr_products' not in item: prod_count_holder = hxs.select( "//h2[@class='resultCount']/span/text()").extract() if prod_count_holder: prod_count = prod_count_holder[0] # extract number # for paged results: Showing ... out of ... Results m = re.match(".*\s*of\s+([0-9,]+)\s+Results\s*", prod_count) # for one page results: Showing ... Result(s) if not m: m = re.match(".*\s+([0-9,]+)\s+Results?\s*", prod_count) if m: item['nr_products'] = int(re.sub(",", "", m.group(1))) # extract description if available # only extracts descriptions that contain a h2. is that good? 
desc_holders = hxs.select( "//div[@class='unified_widget rcmBody'][descendant::h2][last()]") # select the one among these with the most text #TODO: another idea: check if the holder has a h2 item if desc_holders: maxsize = 0 max_desc_holder = desc_holders[0] for desc_holder in desc_holders: size = len(" ".join(desc_holder.select(".//text()").extract())) if size > maxsize: maxsize = size max_desc_holder = desc_holder desc_holder = max_desc_holder desc_title = desc_holder.select("h2/text()").extract() if desc_title: item['description_title'] = desc_title[0].strip() description_texts = desc_holder.select( ".//text()[not(ancestor::h2)]").extract() # if the list is not empty and contains at least one non-whitespace item # if there is a description title or the description body is large enough size_threshold = 50 if (description_texts and reduce(lambda x, y: x or y, [line.strip() for line in description_texts])): # and \ #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)): # replace all whitespace with one space, strip, and remove empty texts; then join them item['description_text'] = " ".join([ re.sub("\s+", " ", description_text.strip()) for description_text in description_texts if description_text.strip() ]) tokenized = Utils.normalize_text(item['description_text']) item['description_wc'] = len(tokenized) if desc_title: (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq( item['description_title'], item['description_text']) else: item['description_wc'] = 0 else: item['description_wc'] = 0 # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, and no product count was found, add info from that url extra_category = self.find_matching_key( item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS) # crawl lower level categories if item['level'] > self.LEVEL_BARRIER: if extra_category: # collect number of products from this alternate URL # this will also extract subcategories and their count yield Request( self.EXTRA_TOPLEVEL_CATEGORIES_URLS[extra_category], callback=self.extractSubcategories, meta={'item': item}) else: # extract subcategories and their count for category even if not in extra_... yield Request(item['url'], callback=self.extractSubcategories, meta={'item': item}) else: yield item # extract and yield subcategories for a category # use menu on left side of the page on the category page # will mainly be used for categories in EXTRA_TOPLEVEL_CATEGORIES_URLS # after subcategories extracted, send them to parseCategory to extract description as well # Obs: it's not exhaustive. 
if page doesn't match what it expects, it gives up def extractSubcategories(self, response): # if there is a captcha to solve, and we haven't exhausted our retries, try to solve it if self.has_captcha( response.body) and ('retry_count' not in response.meta or response.meta['retry_count'] > 0): yield self.solve_captcha_and_redirect( response, self.extractSubcategories ) # meta of response will contain number of retries left if set return hxs = HtmlXPathSelector(response) # returned received item, then extract its subcategories parent_item = response.meta['item'] yield parent_item # extract subcategories, if level is above barrier # extract subcategories from first menu on the left, assume this is the subcategories menu if parent_item['level'] > self.LEVEL_BARRIER: # check if it should be treated as a special category (exceptions to usual page structure); then extract the subcategories with the appropriate method if self.isSpecialCategoryMenu(parent_item): subcategories = self.extractSubcategoriesFromMenuSpecial( hxs, parent_item) # if no subcategories were found, try with the regular extraction as well (ex http://www.amazon.com/clothing-accessories-men-women-kids/b/ref=sd_allcat_apr/179-7724806-1781144?ie=UTF8&node=1036592) if not subcategories: subcategories = self.extractSubcategoriesFromMenu(hxs) else: subcategories = self.extractSubcategoriesFromMenu(hxs) for (subcategory_text, subcategory_url, subcategory_prodcount) in subcategories: item = CategoryItem() item['url'] = subcategory_url item['text'] = subcategory_text item['catid'] = self.catid self.catid += 1 if subcategory_prodcount: item['nr_products'] = int(subcategory_prodcount) item['parent_text'] = parent_item['text'] item['parent_url'] = parent_item['url'] item['parent_catid'] = parent_item['catid'] # considering departments to be level 2 categories (top level) - so every category must have a department text assert 'department_text' in parent_item if 'department_text' in parent_item: item['department_text'] = parent_item['department_text'] #item['department_url'] = parent_item['department_url'] item['department_id'] = parent_item['department_id'] # only level 2 categories in extra_categories have department_url if 'department_url' in parent_item: item['department_url'] = parent_item['department_url'] else: assert not self.find_matching_key( item['department_text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS) # else: # # the parent must be a level 2 category - so this will be considered department # assert parent_item['level'] == 2 # item['department_text'] = item['text'] # #item['department_url'] = item['url'] # item['department_id'] = self.department_count # self.department_count += 1 item['level'] = parent_item['level'] - 1 # # no description extracted # item['description_wc'] = 0 # send to parseCategory to extract description as well yield Request(item['url'], callback=self.parseCategory, meta={'item': item}) # given a page (selector for it), extract subcategories from menu on the left # return generator of tuples representing subcategories with (name, url, item count) def extractSubcategoriesFromMenu(self, hxs): # extract subcategories for regular page structure subcategories = hxs.select( "//h2[text()='Department']/following-sibling::ul[1]/li/a") # only try "Shop by Department" if there is no "Department", otherwise might cause problems when both are present. 
    # given a page (selector for it), extract subcategories from the menu
    # on the left.
    # returns a generator of tuples representing subcategories:
    # (name, url, item count)
    def extractSubcategoriesFromMenu(self, hxs):
        # extract subcategories for regular page structure
        subcategories = hxs.select(
            "//h2[text()='Department']/following-sibling::ul[1]/li/a")
        # only try "Shop by Department" if there is no "Department";
        # otherwise it might cause problems when both are present, e.g.
        # http://www.amazon.com/Watches-Mens-Womens-Kids-Accessories/b/ref=sd_allcat_watches/187-9021585-5419616?ie=UTF8&node=377110011
        if not subcategories:
            subcategories = hxs.select(
                "(//h2 | //h3)[text()='Shop by Department']"
                "/following-sibling::ul[1]/li/a")

        for subcategory in subcategories:
            # if we have a subcategory URL and a product count with the
            # expected format, extract them; otherwise move on.
            # there is an exception to this refinement link rule - then
            # extract info directly from the subcategory node, but only if
            # len(text) > 1 (otherwise we catch all the little arrows for
            # parent cats)
            if not subcategory.select("span[@class='refinementLink']"):
                if len(subcategory.select(".//text()").extract()[0]
                       .strip()) > 1:
                    # so it's not that little arrow thing
                    subcategory_text_holder = subcategory.select(
                        "text()[normalize-space()!='']").extract()
                    if subcategory_text_holder:
                        subcategory_text = \
                            subcategory_text_holder[0].strip()
                    else:
                        continue
                    subcategory_url_holder = subcategory.select(
                        "@href").extract()
                    if subcategory_url_holder:
                        subcategory_url = Utils.add_domain(
                            subcategory_url_holder[0],
                            "http://www.amazon.com")
                    else:
                        continue
                    subcategory_prodcount_holder = None
                else:
                    continue
            else:
                subcategory_url = Utils.add_domain(
                    subcategory.select("@href").extract()[0],
                    "http://www.amazon.com")
                subcategory_text = subcategory.select(
                    "span[@class='refinementLink']//text()"
                ).extract()[0].strip()
                # extract product count, clean it of commas and parentheses
                subcategory_prodcount_holder = subcategory.select(
                    "span[@class='narrowValue']/text()").extract()

            # if there's also a product count available in the menu,
            # extract it
            if subcategory_prodcount_holder:
                # note: the entity was misspelled ";nbsp&" here, so it was
                # never actually stripped - fixed
                subcategory_prodcount = subcategory_prodcount_holder[0]\
                    .replace("&nbsp;", " ").strip()
                m = re.match("\(([0-9,]+)\)", subcategory_prodcount)
                if m:
                    subcategory_prodcount = m.group(1).replace(",", "")
            else:
                subcategory_prodcount = None

            yield (subcategory_text, subcategory_url,
                   subcategory_prodcount)

    # extract subcategories from special category pages that do not
    # conform to the regular page structure:
    # check which category this is and dispatch to the specific method for
    # extracting its subcategories.
    # returns a generator of (name, url, item count) tuples, or None if
    # the category is not recognized
    def extractSubcategoriesFromMenuSpecial(self, hxs, category):
        cat_title = category['text']
        if cat_title in ["Team Sports", "All Sports & Outdoors"]:
            return self.extractSubcategoriesSports(hxs)
        if category['text'] == 'Accessories' and \
                ("Clothing" in category['parent_text']):
            return self.extractSubcategoriesAccessories(hxs)

    # extract subcategories for special category: Sports
    def extractSubcategoriesSports(self, hxs):
        subcategories = hxs.select(
            "//h3[text()='Shop by Sport']/following-sibling::ul[1]/li/a")
        for subcategory in subcategories:
            subcategory_name = subcategory.select("text()").extract()[0]
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")
            yield (subcategory_name, subcategory_url, None)

    # extract subcategories for special category: Accessories in Clothing
    def extractSubcategoriesAccessories(self, hxs):
        subcategories = hxs.select("//a[contains(text(),'Shop All')]")
        for subcategory in subcategories:
            # extract the words after "Shop All" - that is the
            # subcategory name
            subcategory_text_full = \
                subcategory.select("text()").extract()[0]
            m = re.match("Shop All (.*)", subcategory_text_full)
            subcategory_name = m.group(1).strip()
            subcategory_url = Utils.add_domain(
                subcategory.select("@href").extract()[0],
                "http://www.amazon.com")
            yield (subcategory_name, subcategory_url, None)

    # check if category is special, i.e. subcategories from its menu
    # should be extracted in a specific way.
    # TODO: replace these tests with tests based on the URL, more robust
    # (after figuring out which is the stable part of the url)
    def isSpecialCategoryMenu(self, category):
        # category names with special page structure whose subcategories
        # menu needs to be parsed specifically;
        # these are the titles found on the respective categories' pages
        SUBCATS_MENU_SPECIAL = ['Team Sports', 'All Sports & Outdoors']
        if category['text'] in SUBCATS_MENU_SPECIAL:
            return True
        if category['text'] == 'Accessories' and \
                ("Clothing" in category['parent_text']):
            #print "IS SPECIAL", category['url']
            return True
        return False
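# A minimal, standalone sketch (not part of the spider) of the product
# count cleanup performed in extractSubcategoriesFromMenu above; the
# helper name and the sample strings are hypothetical, for reference only.
def _demo_parse_prodcount(raw):
    # "(1,234)" -> "1234"; anything that does not match yields None
    raw = raw.replace("&nbsp;", " ").strip()
    m = re.match("\(([0-9,]+)\)", raw)
    return m.group(1).replace(",", "") if m else None

# _demo_parse_prodcount("(1,234)") == "1234"
# _demo_parse_prodcount("See more") is None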
class AmazonSpider(Spider):
    name = 'amazon'
    allowed_domains = ["amazon.com"]
    start_urls = []

    # SEARCH_URL = 'http://www.amazon.com/s/ref=sr_as_oo?' \
    #     'rh=i%3Aaps%2Ck%3A{search_term}&keywords={search_term}'
    SEARCH_URL = 'http://www.amazon.com/s/ref=nb_sb_noss_2' \
        '?url=search-alias%3Daps&field-keywords={search_term}'

    MAX_RETRIES = 3

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                  'Gecko/20100101 Firefox/35.0')

    USER_AGENTS = {
        'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                   'Gecko/20100101 Firefox/35.0',
        'desktop': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                   'Gecko/20100101 Firefox/35.0',
        'iphone_ipad': 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_6 '
                       'like Mac OS X) AppleWebKit/537.51.1 '
                       '(KHTML, like Gecko) '
                       'Version/7.0 Mobile/11B651 Safari/9537.53',
        'android_phone': 'Mozilla/5.0 (Android; Mobile; rv:35.0) '
                         'Gecko/35.0 Firefox/35.0',
        'android_pad': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '
                       'Gecko/35.0 Firefox/35.0',
        'android': 'Mozilla/5.0 (Android; Tablet; rv:35.0) '
                   'Gecko/35.0 Firefox/35.0',
    }

    def __init__(self,
                 url_formatter=None,
                 quantity=None,
                 page=None,
                 searchterms_str='laptop',
                 searchterms_fn=None,
                 site_name=None,
                 product_url=None,
                 user_agent=None,
                 captcha_retries='10',
                 *args, **kwargs):
        if user_agent is None or user_agent not in self.USER_AGENTS.keys():
            self.log("Unavailable user agent type, or none was set."
                     " The default user agent will be used.", INFO)
            user_agent = 'default'
        if user_agent:
            self.user_agent = self.USER_AGENTS[user_agent]
            self.user_agent_key = user_agent

        super(AmazonSpider, self).__init__(*args, **kwargs)

        if site_name is None:
            assert len(self.allowed_domains) == 1, \
                "A single allowed domain is required" \
                " to auto-detect site name."
            self.site_name = self.allowed_domains[0]
        else:
            self.site_name = site_name

        if url_formatter is None:
            self.url_formatter = string.Formatter()
        else:
            self.url_formatter = url_formatter

        if quantity is None:
            self.log("No quantity specified. Will retrieve all products.",
                     INFO)
            import sys
            self.quantity = sys.maxint
        else:
            self.quantity = int(quantity)

        if page is None:
            self.log("No page specified. Will retrieve all products.",
                     INFO)
            import sys
            self.page = sys.maxint
        else:
            self.page = int(page)

        self.product_url = product_url

        self.searchterms = []
        if searchterms_str is not None:
            self.searchterms = searchterms_str.decode('utf-8').split(',')
        elif searchterms_fn is not None:
            # the builtin open() takes no encoding argument on Python 2;
            # io.open does - fixed
            import io
            with io.open(searchterms_fn, encoding='utf-8') as f:
                self.searchterms = f.readlines()
        else:
            self.log("No search terms provided!", ERROR)

        self.log("Created for %s with %d search terms."
                 % (self.site_name, len(self.searchterms)), INFO)

        self.captcha_retries = int(captcha_retries)
        self._cbw = CaptchaBreakerWrapper()

    def make_requests_from_url(self, _):
        """This method does not apply to this type of spider, so it is
        overridden and "disabled" by making it raise an exception
        unconditionally.
        """
        raise AssertionError("Need a search term.")

    def start_requests(self):
        """Generate Requests from the SEARCH_URL and the search terms."""
        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                ),
                meta={'search_term': st, 'remaining': self.quantity},
            )

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = self.parse_without_captcha(response)
        return result

    def parse_without_captcha(self, response):
        if self._search_page_error(response):
            remaining = response.meta['remaining']
            search_term = response.meta['search_term']
            self.log("For search term '%s' with %d items remaining,"
                     " failed to retrieve search page: %s"
                     % (search_term, remaining, response.request.url),
                     WARNING)
        else:
            prods_count = -1  # Also used after the loop.
            for prods_count, request_or_prod in enumerate(
                    self._get_products(response)):
                yield request_or_prod
            prods_count += 1  # Fix counter.

            request = self._get_next_products_page(response, prods_count)
            if request is not None:
                yield request

    def _get_products(self, response):
        remaining = response.meta['remaining']
        search_term = response.meta['search_term']
        total_matches = response.meta.get('total_matches')

        prods = self._scrape_product_links(response)

        if total_matches is None:
            total_matches = self._scrape_total_matches(response)
            if total_matches is not None:
                response.meta['total_matches'] = total_matches
                self.log("Found %d total matches." % total_matches, INFO)
            else:
                if hasattr(self, 'is_nothing_found'):
                    if not self.is_nothing_found(response):
                        self.log("Failed to parse total matches for %s"
                                 % response.url, ERROR)

        for i, prod_item in enumerate(islice(prods, 0, remaining)):
            prod_item['keyword'] = search_term
            prod_item['total_matches'] = total_matches
            prod_item['rank'] = (i + 1) + (self.quantity - remaining)
            yield prod_item
new_meta = response.meta.copy() new_meta['link_page_attempt'] = link_page_attempt + 1 result = response.request.replace(meta=new_meta, cookies={}, dont_filter=True) return result def _scrape_total_matches(self, response): if response.css('#noResultsTitle'): return 0 values = response.css('#s-result-count ::text').re( '([0-9,]+)\s[Rr]esults for') if not values: values = response.css('#resultCount > span ::text').re( '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults') if not values: values = response.css('#result-count-only-next').xpath( 'comment()').re('\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+') if values: total_matches = int(values[0].replace(',', '')) else: if not self.is_nothing_found(response): self.log("Failed to parse total number of matches for: %s" % response.url, level=ERROR) total_matches = None return total_matches def _scrape_product_links(self, response): products = response.xpath('//li[@class="s-result-item"]') for pr in products: if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")] |' './/h5[contains(text(), "Sponsored")]'): continue product = ProductItem() cond_set(product, 'title', pr.xpath('.//h2/../@title').extract()) cond_set(product, 'product_image', pr.xpath('.//img[@alt="Product Details"]/@src').extract()) cond_set( product, 'brand', pr.xpath('.//div[@class="a-fixed-left-grid-col a-col-right"]' '/div/div/span[2]/text() |' './/div[@class="a-row a-spacing-mini"]/span[2]/text()' ).extract()) cond_set( product, 'price', pr.xpath( './/span[contains(@class,"s-price")]/text()').extract()) cond_set(product, 'asin', pr.xpath('@data-asin').extract()) if pr.xpath('.//i[contains(@class, "a-icon-prime")]'): cond_set_value(product, 'prime', True) else: cond_set_value(product, 'prime', False) cond_set( product, 'shipping_price', pr.xpath( './/span[contains(@class,"s-price")]/' 'following::span[2]/text()').re('(\d+.?\d+) shipping')) new = pr.xpath('.//a[contains(text(),"new")]/span/text()') if new: cond_set(product, 'new_price', new.extract()) cond_set(product, 'new_offers', new[1].re('\d+')) used = pr.xpath('.//a[contains(text(),"used")]/span/text()') if used: cond_set(product, 'used_price', used.extract()) cond_set(product, 'used_offers', used[1].re('\d+')) cond_set( product, 'rating', pr.xpath('.//span[contains(@name,"' + product['asin'] + '")]/span/a/i/span').re('(\d+.?\d+)')) cond_set( product, 'number_of_reviews', pr.xpath('.//span[contains(@name,"' + product['asin'] + '")]/' 'following::a[1]/text()').re('([\d+,?]+\d+)')) category = pr.xpath( './/span[contains(@class,"a-text-bold")]/text()').re('(.*):') if not category: category = response.xpath( '//div[@id="autoscoping-backlink"]/div/span/span/text()' ).extract() cond_set(product, 'category', category) number_of_items = pr.xpath( './/span[contains(@class,"a-text-bold")]/../text()').re( '([\d+,?]+\d+)') if number_of_items: cond_set_value(product, 'number_of_items', number_of_items[0]) else: cond_set_value(product, 'number_of_items', response.meta.get('total_matches')) product['all_brands'] = response.xpath( '//h2[text()="Brand"]/following::ul[1]/' 'li[@class="refinementImage"]/a/span/text()').extract() yield product def _scrape_next_results_page_link(self, response): next_pages = response.css('#pagnNextLink ::attr(href)').extract() next_page_url = None if len(next_pages) == 1: next_page_url = next_pages[0] elif len(next_pages) > 1: self.log("Found more than one 'next page' link.", ERROR) return next_page_url def is_nothing_found(self, response): txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract() txt = 
''.join(txt) return 'did not match any products' in txt def _search_page_error(self, response): body = response.body_as_unicode() return "Your search" in body \ and "did not match any products." in body # Captcha handling functions. def _has_captcha(self, response): return '.images-amazon.com/captcha/' in response.body_as_unicode() def _solve_captcha(self, response): forms = response.xpath('//form') assert len(forms) == 1, "More than one form found." captcha_img = forms[0].xpath( '//img[contains(@src, "/captcha/")]/@src').extract()[0] self.log("Extracted capcha url: %s" % captcha_img, level=DEBUG) return self._cbw.solve_captcha(captcha_img) def _handle_captcha(self, response, callback): captcha_solve_try = response.meta.get('captcha_solve_try', 0) url = response.url self.log("Captcha challenge for %s (try %d)." % (url, captcha_solve_try), level=INFO) captcha = self._solve_captcha(response) if captcha is None: self.log("Failed to guess captcha for '%s' (try: %d)." % (url, captcha_solve_try), level=ERROR) result = None else: self.log("On try %d, submitting captcha '%s' for '%s'." % (captcha_solve_try, captcha, url), level=INFO) meta = response.meta.copy() meta['captcha_solve_try'] = captcha_solve_try + 1 result = FormRequest.from_response( response, formname='', formdata={'field-keywords': captcha}, callback=callback, dont_filter=True, meta=meta) return result
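# A minimal, standalone sketch (not part of the spider) of the captcha
# retry bookkeeping used by _handle_captcha above: every resubmission
# carries an incremented 'captcha_solve_try' in request.meta, which a
# callback can compare against its retry budget. The helper name and the
# numbers in the usage comments are illustrative only.
def _demo_captcha_tries_left(meta, max_tries=10):
    return meta.get('captcha_solve_try', 0) < max_tries

# _demo_captcha_tries_left({}) -> True (nothing tried yet)
# _demo_captcha_tries_left({'captcha_solve_try': 10}) -> False (give up)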
    print str(e)

try:
    from captcha_solver import CaptchaBreakerWrapper
except Exception as e:
    print '!!!!!!!!Captcha breaker is not available due to: %s' % e

    class CaptchaBreakerWrapper(object):
        @staticmethod
        def solve_captcha(url):
            msg("CaptchaBreaker is not available for url: %s" % url,
                level=WARNING)
            return None

_cbw = CaptchaBreakerWrapper()


def _has_captcha(response):
    return '.images-amazon.com/captcha/' in response.content


def _solve_captcha(response):
    soup = BeautifulSoup(response.content, "html.parser")
    forms = soup.findAll(itemprop="image")
    assert len(forms) == 1, "More than one form found."
    captcha_img = forms[0]['src']
    return _cbw.solve_captcha(captcha_img)
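# Hypothetical usage of the module-level helpers above; these helpers read
# response.content (unlike the Scrapy versions), so a requests-style
# response object is assumed. Kept commented out so the module stays
# import-safe; the URL is illustrative only.
# import requests
# resp = requests.get('http://www.amazon.com/s?field-keywords=laptop')
# if _has_captcha(resp):
#     guess = _solve_captcha(resp)  # None whenever the breaker is missing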
class AmazonProductsSpider(AmazonTests, BaseProductsSpider):
    name = 'amazon_products'
    allowed_domains = ["amazon.com"]

    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:35.0) '
                  'Gecko/20100101 Firefox/35.0')

    SEARCH_URL = ('http://www.amazon.com/s/ref=nb_sb_noss_1'
                  '?url=search-alias%3Daps&field-keywords={search_term}')

    settings = AmazonValidatorSettings

    buyer_reviews_stars = ['one_star', 'two_star', 'three_star',
                           'four_star', 'five_star']

    def __init__(self, captcha_retries='10', *args, **kwargs):
        super(AmazonProductsSpider, self).__init__(*args, **kwargs)
        self.captcha_retries = int(captcha_retries)
        self.mtp_class = Amazon_marketplace(self)
        self._cbw = CaptchaBreakerWrapper()

    def parse(self, response):
        if self._has_captcha(response):
            result = self._handle_captcha(response, self.parse)
        else:
            result = super(AmazonProductsSpider, self).parse(response)
        return result

    def _get_products(self, response):
        result = super(AmazonProductsSpider, self)._get_products(response)
        for r in result:
            if isinstance(r, Request):
                r = r.replace(dont_filter=True)
            yield r

    def parse_product(self, response):
        prod = response.meta['product']
        prod['buyer_reviews'] = self._build_buyer_reviews(response)

        if not self._has_captcha(response):
            self._populate_from_js(response, prod)
            self._populate_from_html(response, prod)
            cond_set_value(prod, 'locale', 'en-US')  # Default locale.

            mkt_place_link = urlparse.urljoin(
                response.url,
                is_empty(response.xpath(
                    "//div[contains(@class, 'a-box-inner')]"
                    "//a[contains(@href, '/gp/offer-listing/')]/@href |"
                    "//div[@id='secondaryUsedAndNew']"
                    "//a[contains(@href, '/gp/offer-listing/')]/@href"
                ).extract()))

            new_meta = response.meta.copy()
            new_meta['product'] = prod
            if isinstance(prod["buyer_reviews"], Request):
                if mkt_place_link:
                    new_meta["mkt_place_link"] = mkt_place_link
                return prod["buyer_reviews"].replace(meta=new_meta,
                                                     dont_filter=True)
            if mkt_place_link:
                return Request(url=mkt_place_link,
                               callback=self.parse_marketplace,
                               meta=new_meta,
                               dont_filter=True)
            result = prod
        # note: this meta key is set as 'captcha_solve_try' in
        # _handle_captcha; it was misspelled 'captch_solve_try' here, so
        # the retry limit was never actually enforced - fixed
        elif response.meta.get('captcha_solve_try', 0) \
                >= self.captcha_retries:
            self.log("Giving up on trying to solve the captcha challenge"
                     " after %s tries for: %s"
                     % (self.captcha_retries, prod['url']), level=WARNING)
            result = None
        else:
            result = self._handle_captcha(response, self.parse_product)
        return result

    def _get_price(self, response, product):
        """Parses and sets the product price, with all possible variations.

        :param response: Scrapy's Response obj
        :param product: Scrapy's Item (dict, basically)
        :return: None
        """
        cond_set(
            product,
            'price',
            response.css('#priceblock_ourprice ::text'
                         ', #unqualifiedBuyBox .a-color-price ::text'
                         ', #priceblock_saleprice ::text'
                         ', #actualPriceValue ::text'
                         ', #buyNewSection .offer-price ::text').extract(),
        )
        if not product.get('price', None):
            cond_set(
                product,
                'price',
                response.xpath(
                    '//td/b[@class="priceLarge"]/text() |'
                    '//span[@class="olp-padding-right"]'
                    '/span[@class="a-color-price"]/text() |'
                    '//div[contains(@data-reftag,"atv_dp_bb_est_hd_movie")]'
                    '/button/text() |'
                    '//span[@id="priceblock_saleprice"]/text() |'
                    '//li[@class="swatchElement selected"]'
                    '//span[@class="a-color-price"]/text() |'
                    '//div[contains(@data-reftag,"atv_dp_bb_est_sd_movie")]'
                    '/button/text() |'
                    '//div[@id="mocaBBRegularPrice"]'
                    '/div/text()[normalize-space()]').extract())
        if product.get('price', None):
            if '$' not in product['price']:
                if 'FREE' in product['price'] or ' ' in product['price']:
                    product['price'] = Price(priceCurrency='USD',
                                             price='0.00')
                else:
                    self.log('Currency symbol not recognized: %s'
                             % response.url, level=ERROR)
            else:
                price = re.findall('[\d ,.]+\d', product['price'])
                price = re.sub('[, ]', '', price[0])
                product['price'] = Price(
                    priceCurrency='USD',
                    price=price.replace('$', '').strip().replace(',', ''))

    def populate_bestseller_rank(self, product, response):
        ranks = {
            ' > '.join(map(unicode.strip,
                           itm.css('.zg_hrsr_ladder a::text').extract())):
            int(re.sub('[ ,]', '',
                       itm.css('.zg_hrsr_rank::text').re('([\d, ]+)')[0]))
            for itm in response.css('.zg_hrsr_item')
        }
        prim = response.css('#SalesRank::text, #SalesRank .value'
                            '::text').re('#([\d ,]+) .*in (.+)\(')
        if prim:
            prim = {prim[1].strip(): int(re.sub('[ ,]', '', prim[0]))}
            ranks.update(prim)
        ranks = [{'category': k, 'rank': v} for k, v in ranks.iteritems()]
        cond_set_value(product, 'category', ranks)
        # parse department
        department = amazon_parse_department(ranks)
        if department is None:
            product['department'] = None
        else:
            product['department'], product['bestseller_rank'] \
                = department.items()[0]
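    # Illustration of the structures built above (values hypothetical):
    #   ranks = [{'category': 'Electronics > Accessories', 'rank': 123},
    #            {'category': 'Electronics', 'rank': 4567}]
    # amazon_parse_department(ranks) is then expected to pick the primary
    # department, e.g. {'Electronics': 4567}, whose single item becomes
    # (product['department'], product['bestseller_rank']).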
    def _populate_from_html(self, response, product):
        cond_set(product, 'brand', response.css('#brand ::text').extract())
        self._get_price(response, product)
        brand_name = is_empty(
            response.xpath('//a[@id="brand"]/text()').extract())
        cond_set(product, 'brand', brand_name)
        av = AmazonVariants()
        av.setupSC(response)
        product['variants'] = av._variants()
        brand_logo = is_empty(
            response.xpath('//a[@id="brand"]/@href').extract())
        if brand_logo:
            brand = brand_logo.split('/')[1]
            cond_set_value(product, 'brand', brand)
        self.mtp_class.get_price_from_main_response(response, product)

        spans = response.xpath('//span[@class="a-text-bold"]')
        for span in spans:
            text = is_empty(span.xpath('text()').extract())
            if text and 'Item model number:' in text:
                possible_model = span.xpath('../span/text()').extract()
                if len(possible_model) > 1:
                    model = possible_model[1]
                    cond_set_value(product, 'model', model)

        description = response.css('.productDescriptionWrapper').extract()
        if not description:
            iframe_content = re.findall(r'var iframeContent = "(.*)"',
                                        response.body)
            if iframe_content:
                res = iframe_content[0]
                f = re.findall(
                    'body%3E%0A%20%20(.*)'
                    '%0A%20%20%3C%2Fbody%3E%0A%3C%2Fhtml%3E%0A', res)
                if f:
                    desc = unquote(f[0])
                    description = [desc]
        if not description:
            description = response.xpath(
                '//div[@id="descriptionAndDetails"] |'
                '//div[@id="feature-bullets"] |'
                '//div[@id="ps-content"] |'
                '//div[@id="productDescription_feature_div"] |'
                '//div[contains(@class, "dv-simple-synopsis")] |'
                '//div[@class="bucket"]/div[@class="content"]').extract()
        cond_set(product, 'description', description)

        image = response.css(
            '#imgTagWrapperId > img ::attr(data-old-hires)').extract()
        if not image:
            j = re.findall(r"'colorImages': { 'initial': (.*)},",
                           response.body)
            if not j:
                j = re.findall(r'colorImages = {"initial":(.*)}',
                               response.body)
            if j:
                try:
                    res = json.loads(j[0])
                    try:
                        image = res[0]['large']
                    except:
                        image = res[1]['large']
                    image = [image]
                except:
                    pass
        if not image:
            image = response.xpath(
                '//div[@class="main-image-inner-wrapper"]/img/@src |'
                '//div[@id="coverArt_feature_div"]//img/@src |'
                '//div[@id="img-canvas"]/img/@src |'
                '//div[@class="dp-meta-icon-container"]/img/@src |'
                '//input[@id="mocaGlamorImageUrl"]/@value |'
                '//div[@class="egcProdImageContainer"]'
                '/img[@class="egcDesignPreviewBG"]/@src |'
                '//img[@id="main-image"]/@src').extract()
        if len(image) > 0 and image[0]:
            if product.get('image_url'):
                product['image_url'] = image[0]
            else:
                cond_set(product, 'image_url', image)

        title = response.css('#productTitle ::text').extract()
        if not title:
            title = response.xpath(
                '//div[@class="buying"]/h1/span[@id="btAsinTitle"]/text() |'
                '//div[@id="title_feature_div"]/h1/text() |'
                '//div[@id="title_row"]/span/h1/text() |'
                '//h1[@id="aiv-content-title"]/text() |'
                '//div[@id="item_name"]/text()').extract()
        if not title:
            parts = response.xpath(
                '//div[@id="mnbaProductTitleAndYear"]/span/text()'
            ).extract()
            if parts:
                # join the title fragments into a single string
                title = [''.join(parts)]
        cond_set(product, 'title', title)

        # Some data is in a list (ul element).
        model = None
        for li in response.css('td.bucket > .content > ul > li'):
            raw_keys = li.xpath('b/text()').extract()
            if not raw_keys:
                # This is something else, ignore.
                continue
            key = raw_keys[0].strip(' :').upper()
            if key == 'UPC':
                # Some products have several UPCs.
                raw_upc = li.xpath('text()').extract()[0]
                cond_set_value(
                    product,
                    'upc',
                    raw_upc.strip().replace(' ', ';'),
                )
            elif (key == 'ASIN' and model is None) \
                    or key == 'ITEM MODEL NUMBER':
                model = li.xpath('text()').extract()
        cond_set(product, 'model', model, conv=string.strip)
        self.populate_bestseller_rank(product, response)

    def _populate_from_js(self, response, product):
        # Images are not always on the same spot...
        img_jsons = response.css(
            '#landingImage ::attr(data-a-dynamic-image)').extract()
        if img_jsons:
            img_data = json.loads(img_jsons[0])
            cond_set_value(product, 'image_url',
                           max(img_data.items(),
                               key=lambda (_, size): size[0]),
                           conv=lambda (url, _): url)

    def _get_rating_by_star_by_individual_request(self, response):
        product = response.meta['product']
        mkt_place_link = response.meta.get("mkt_place_link")
        current_star = response.meta['_current_star']
        current_star_int = [
            i + 1 for i, _star in enumerate(self.buyer_reviews_stars)
            if _star == current_star
        ][0]
        br = product.get('buyer_reviews')
        if br:
            rating_by_star = br.get('rating_by_star')
        else:
            if mkt_place_link:
                return self.mkt_request(mkt_place_link,
                                        {"product": product})
            return product
        if not rating_by_star:
            rating_by_star = {}

        num_of_reviews_for_star = re.search(
            r'Showing .+? of ([\d,\.]+) reviews', response.body)
        if num_of_reviews_for_star:
            num_of_reviews_for_star = num_of_reviews_for_star.group(1)
            num_of_reviews_for_star = num_of_reviews_for_star\
                .replace(',', '').replace('.', '')
            rating_by_star[str(current_star_int)] \
                = int(num_of_reviews_for_star)
        if str(current_star_int) not in rating_by_star.keys():
            rating_by_star[str(current_star_int)] = 0

        product['buyer_reviews']['rating_by_star'] = rating_by_star
        if len(product['buyer_reviews']['rating_by_star']) >= 5:
            product['buyer_reviews']['num_of_reviews'] \
                = int(product['buyer_reviews']['num_of_reviews'])
            product['buyer_reviews']['average_rating'] \
                = float(product['buyer_reviews']['average_rating'])
            # ok, we collected the counts for all stars - we can return
            # the product
            product['buyer_reviews'] = BuyerReviews(
                **product['buyer_reviews'])
            if mkt_place_link:
                return self.mkt_request(mkt_place_link,
                                        {"product": product})
            return product

    def _get_asin_from_url(self, url):
        match = re.search(r'/([A-Z0-9]{4,15})/', url)
        if match:
            return match.group(1)

    def _create_post_requests(self, response, asin):
        url = ('http://www.amazon.com/ss/customer-reviews/ajax/reviews/get/'
               'ref=cm_cr_pr_viewopt_sr')
        for star in self.buyer_reviews_stars:
            args = {
                'asin': asin,
                'filterByStar': star,
                'filterByKeyword': '',
                'formatType': 'all_formats',
                'pageNumber': '1',
                'pageSize': '10',
                'sortBy': 'helpful',
                'reftag': 'cm_cr_pr_viewopt_sr',
                'reviewerType': 'all_reviews',
                'scope': 'reviewsAjax0',
            }
            # copy the meta for every request: the original code mutated a
            # single shared response.meta dict, so by the time the
            # callbacks ran, every request saw the last star - fixed
            meta = response.meta.copy()
            meta['_current_star'] = star
            yield FormRequest(
                url=url,
                formdata=args,
                meta=meta,
                callback=self._get_rating_by_star_by_individual_request,
                dont_filter=True)

    def get_buyer_reviews_from_2nd_page(self, response):
        if self._has_captcha(response):
            return self._handle_captcha(
                response, self.get_buyer_reviews_from_2nd_page)
        product = response.meta["product"]
        buyer_reviews = {}
        product["buyer_reviews"] = {}
        buyer_reviews["num_of_reviews"] = is_empty(
            response.xpath('//span[contains(@class, "totalReviewCount")]'
                           '/text()').extract(), '').replace(",", "")
        if not buyer_reviews['num_of_reviews']:
            buyer_reviews['num_of_reviews'] = ZERO_REVIEWS_VALUE
        average = is_empty(
            response.xpath(
                '//div[contains(@class, "averageStarRatingNumerical")]'
                '//span/text()').extract(), "")
        buyer_reviews["average_rating"] = \
            average.replace('out of 5 stars', '')

        buyer_reviews["rating_by_star"] = {}
        buyer_reviews = self.get_rating_by_star(response, buyer_reviews)[0]
        #print('*' * 20, 'parsing buyer reviews from', response.url)

        if not buyer_reviews.get('rating_by_star'):
            response.meta['product']['buyer_reviews'] = buyer_reviews
            # if there is still no rating_by_star (probably the rating is
            # percent-based), request the per-star counts individually
            return self._create_post_requests(
                response, self._get_asin_from_url(response.url))
        #return

        product["buyer_reviews"] = BuyerReviews(**buyer_reviews)

        meta = {"product": product}
        mkt_place_link = response.meta.get("mkt_place_link", None)
        if mkt_place_link:
            return Request(url=mkt_place_link,
                           callback=self.parse_marketplace,
                           meta=meta,
                           dont_filter=True)
        return product

    def _build_buyer_reviews(self, response):
        buyer_reviews = {}
        total = response.xpath('string(//*[@id="summaryStars"])').re(
            FLOATING_POINT_RGEX)
        if not total:
            total = response.xpath(
                'string(//div[@id="acr"]/div[@class="txtsmall"]'
                '/div[contains(@class, "acrCount")])').re(
                    FLOATING_POINT_RGEX)
        if not total:
            return ZERO_REVIEWS_VALUE
        buyer_reviews['num_of_reviews'] = int(total[0].replace(',', ''))

        average = response.xpath('//*[@id="summaryStars"]/a/@title')
        if not average:
            average = response.xpath(
                '//div[@id="acr"]/div[@class="txtsmall"]'
                '/div[contains(@class, "acrRating")]/text()')
        average = average.extract()[0].replace('out of 5 stars', '')
        buyer_reviews['average_rating'] = float(average)

        buyer_reviews['rating_by_star'] = {}
        buyer_reviews, table = self.get_rating_by_star(response,
                                                       buyer_reviews)

        if not buyer_reviews.get('rating_by_star'):
            # scrape a new buyer-reviews request (that will lead to a new
            # page)
            buyer_rev_link = is_empty(response.xpath(
                '//div[@id="revSum"]//a[contains(text(), "See all")'
                ' or contains(text(), "See the customer review")'
                ' or contains(text(), "See both customer reviews")]/@href'
            ).extract())
            buyer_rev_req = Request(
                url=buyer_rev_link,
                callback=self.get_buyer_reviews_from_2nd_page)
            # now we can safely return a Request, because it'll be
            # re-crawled in the `parse_product` method
            return buyer_rev_req

        return BuyerReviews(**buyer_reviews)

    def get_rating_by_star(self, response, buyer_reviews):
        table = response.xpath('//table[@id="histogramTable"]'
                               '/tr[@class="a-histogram-row"]')
        if table:
            for tr in table:  # td[last()]//text()').re('\d+')
                rating = is_empty(
                    tr.xpath('string(.//td[1])').re(FLOATING_POINT_RGEX))
                number = is_empty(
                    tr.xpath('string(.//td[last()])').re(
                        FLOATING_POINT_RGEX))
                is_perc = is_empty(
                    tr.xpath('string(.//td[last()])').extract())
                if "%" in is_perc:
                    break
                if number:
                    buyer_reviews['rating_by_star'][rating] = int(
                        number.replace(',', ''))
        else:
            table = response.xpath(
                '//div[@id="revH"]/div/div[contains(@class, "fl")]')
            for div in table:
                rating = div.xpath(
                    'string(.//div[contains(@class, "histoRating")])').re(
                        FLOATING_POINT_RGEX)[0]
                number = div.xpath(
                    'string(.//div[contains(@class, "histoCount")])').re(
                        FLOATING_POINT_RGEX)[0]
                buyer_reviews['rating_by_star'][rating] = int(
                    number.replace(',', ''))
        return buyer_reviews, table

    def _scrape_total_matches(self, response):
        # Where this value appears is a little weird and changes a bit, so
        # we need several alternatives to capture it consistently.
        if response.css('#noResultsTitle'):
            return 0
        # Every result I saw is shown with this format:
        #   1-16 of 424,831 results for
        #   2 results for
        values = response.css('#s-result-count ::text').re(
            '([0-9,]+)\s[Rr]esults for')
        if not values:
            # The first possible place is where it normally is in a fully
            # rendered page.
            values = response.css('#resultCount > span ::text').re(
                '\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults')
        if not values:
            # Otherwise, it appears within a comment.
            values = response.css('#result-count-only-next').xpath(
                'comment()').re('\s+of\s+(\d+(,\d\d\d)*)\s+[Rr]esults\s+')
        if values:
            total_matches = int(values[0].replace(',', ''))
        else:
            if not self.is_nothing_found(response):
                self.log("Failed to parse total number of matches for: %s"
                         % response.url, level=ERROR)
            total_matches = None
        return total_matches

    def _scrape_results_per_page(self, response):
        num = response.xpath('//*[@id="s-result-count"]/text()').re(
            '1-(\d+) of')
        if num:
            return int(num[0])
        else:
            num = response.xpath('//*[@id="s-result-count"]/text()').re(
                '(\d+) results')
            if num:
                return int(num[0])
        return None

    def _scrape_product_links(self, response):
        lis = response.xpath("//div[@id='resultsCol']//ul//li |"
                             "//div[@id='mainResults']//ul//li"
                             "[contains(@id, 'result')] |"
                             "//div[@id='atfResults']//ul//li"
                             "[contains(@id, 'result')] |"
                             "//div[@id='mainResults']//div"
                             "[contains(@id, 'result')]")
        links = []
        last_idx = -1
        for li in lis:
            try:
                is_prime = li.xpath(
                    "*/descendant::i[contains(concat(' ',@class,' '),"
                    "' a-icon-prime ')] |"
                    ".//span[contains(@class, 'sprPrime')]")
                is_prime_pantry = li.xpath(
                    "*/descendant::i[contains(concat(' ',@class,' '),"
                    "' a-icon-prime-pantry ')]")
                data_asin = li.xpath('@id').extract()[0]
                idx = int(re.findall(r'\d+', data_asin)[0])
                if idx > last_idx:
                    link = li.xpath(
                        ".//a[contains(@class,'s-access-detail-page')]"
                        "/@href |"
                        ".//h3[@class='newaps']/a/@href").extract()[0]
                    if 'slredirect' in link:
                        link = urlparse.urljoin('http://amazon.com/', link)
                    links.append((link, is_prime, is_prime_pantry))
                else:
                    break
                last_idx = idx
            except IndexError:
                continue
        if len(links) < 1:
            self.log("Found no product links.", WARNING)
        for link, is_prime, is_prime_pantry in links:
            prime = None
            if is_prime:
                prime = 'Prime'
            if is_prime_pantry:
                prime = 'PrimePantry'
            yield link, SiteProductItem(prime=prime)

    def _scrape_next_results_page_link(self, response):
        next_pages = response.css('#pagnNextLink ::attr(href)').extract()
        next_page_url = None
        if len(next_pages) == 1:
            next_page_url = next_pages[0]
        elif len(next_pages) > 1:
            self.log("Found more than one 'next page' link.", ERROR)
        return next_page_url

    def _search_page_error(self, response):
        body = response.body_as_unicode()
        return "Your search" in body \
            and "did not match any products." in body

    # Captcha handling functions.
    def _has_captcha(self, response):
        return '.images-amazon.com/captcha/' in response.body_as_unicode()

    def _solve_captcha(self, response):
        forms = response.xpath('//form')
        assert len(forms) == 1, "More than one form found."
        captcha_img = forms[0].xpath(
            '//img[contains(@src, "/captcha/")]/@src').extract()[0]
        self.log("Extracted captcha url: %s" % captcha_img, level=DEBUG)
        return self._cbw.solve_captcha(captcha_img)

    def _handle_captcha(self, response, callback):
        # FIXME This is untested and wrong.
        captcha_solve_try = response.meta.get('captcha_solve_try', 0)
        url = response.url
        self.log("Captcha challenge for %s (try %d)."
                 % (url, captcha_solve_try), level=INFO)
        captcha = self._solve_captcha(response)
        if captcha is None:
            self.log("Failed to guess captcha for '%s' (try: %d)."
                     % (url, captcha_solve_try), level=ERROR)
            result = None
        else:
            self.log("On try %d, submitting captcha '%s' for '%s'."
                     % (captcha_solve_try, captcha, url), level=INFO)
            meta = response.meta.copy()
            meta['captcha_solve_try'] = captcha_solve_try + 1
            result = FormRequest.from_response(
                response,
                formname='',
                formdata={'field-keywords': captcha},
                callback=callback,
                dont_filter=True,
                meta=meta)
        return result

    def _parse_single_product(self, response):
        return self.parse_product(response)

    def parse_marketplace(self, response):
        response.meta["called_class"] = self
        response.meta["next_req"] = None
        return self.mtp_class.parse_marketplace(response)

    def exit_point(self, product, next_req):
        if next_req:
            # Request.replace returns a new request rather than mutating
            # in place; the original code discarded the result - fixed
            next_req = next_req.replace(meta={"product": product})
            return next_req
        return product

    def is_nothing_found(self, response):
        txt = response.xpath('//h1[@id="noResultsTitle"]/text()').extract()
        txt = ''.join(txt)
        return 'did not match any products' in txt

    def mkt_request(self, link, meta):
        return Request(url=link,
                       callback=self.parse_marketplace,
                       meta=meta,
                       dont_filter=True)
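# A standalone sketch (not part of the spider) of what _get_asin_from_url
# above extracts; the helper name and the sample URL are hypothetical.
def _demo_extract_asin(url):
    # captures the first path segment of 4-15 uppercase alphanumerics
    match = re.search(r'/([A-Z0-9]{4,15})/', url)
    return match.group(1) if match else None

# _demo_extract_asin('http://www.amazon.com/dp/B00EXAMPLE/ref=x')
#     == 'B00EXAMPLE'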