def parse_price(self, response):
    """Fill in a variant's price from the price AJAX response and store the row.

    Reads the partially built CSV row and the parent ASIN from
    ``response.meta`` (keys ``'row'`` and ``'parent_asin'``). When the
    response body contains the escaped price ``<span>`` marker, the first
    price found after the last marker is cleaned with
    ``converter.cleanup_price`` and written to index 5 of the row. The row is
    then stored in the module-level ``csv_dict`` under
    ``csv_dict[parent_asin][<row[1]>]`` whether or not a price was found.

    Raises:
        KeyError: if ``response.meta`` lacks ``'row'``/``'parent_asin'`` or
            ``csv_dict`` has no entry for the parent ASIN.
    """
    csv_row = response.meta['row']
    parent_asin = response.meta['parent_asin']

    # The markup in the body is backslash-escaped, hence the literal
    # backslashes in both the opening marker and the closing '<\/' tag.
    # NOTE(review): the body is searched with text literals, which assumes
    # ``response.body`` is a text string (Python 2 ``str``) -- confirm before
    # running under Python 3.
    price_marker = 'a-size-medium a-color-price\\">'
    body = response.body
    if price_marker in body:
        fragment = body.split(price_marker)[-1].split('<\\/')[0].replace('$', '')
        # Capture the whole amount, including thousands separators and cents.
        # Matching only r'\d+' would turn "12.99" into "12".
        amounts = re.findall(r'\d[\d,]*(?:\.\d+)?', fragment)
        if amounts:
            csv_row[5] = converter.cleanup_price(amounts[0])
        # No digits after the marker: leave the price slot untouched and
        # still record the row below instead of failing on an empty match.

    csv_dict[parent_asin][csv_row[1]] = csv_row
def parse(self, response):
    """Scrape a product page and yield one price request per variant.

    Steps, in order:

    1. Extract the product title, parent ASIN, optional list price and brand,
       and store the parent row in the module-level ``csv_dict`` under
       ``csv_dict[parent_asin]['Parent_Row']``.
    2. Parse the inline "twister" script into ``{asin: variant_text}`` and
       keep it on ``self.variant_dict``.
    3. Decode the ``data["colorImages"]`` JavaScript object to find the
       ``'large'`` image URLs for each colour.
    4. For every variant, build a ``'SKU'`` row (price slot at index 5 left
       empty, images appended after the eight fixed columns) and yield a
       ``Request`` for the price AJAX URL with ``self.parse_price`` as the
       callback. The row and the parent ASIN travel in ``request.meta``.

    Swatch prices are not scraped here; the per-variant price comes from the
    AJAX response handled by ``parse_price``.

    Raises:
        IndexError: if a mandatory page element (title, parent ASIN, brand
            link, twister script, image script, price URL script) is missing.
    """
    import logging

    sel = Selector(response)
    product_name = sel.xpath("//span[@id='productTitle']/text()").extract()[0]
    parent_asin = (
        sel.xpath("//div[@id='tell-a-friend']/@data-dest").extract()[0]
        .split('parentASIN=')[-1]
        .split('&')[0]
    )

    # The list price is optional: a missing element or an uncleanable value
    # yields an empty price rather than an error.
    price = ''
    price_text = sel.xpath("//span[@id='priceblock_ourprice']/text()").extract()
    if price_text:
        try:
            price = converter.cleanup_price(price_text[0].replace('$', ''))
        except Exception:
            # Best effort, as before: any cleanup failure means "no price".
            price = ''

    # Prefer the brand link text; fall back to the second path segment of
    # its href when the link has no text node.
    brand_text = sel.xpath("//a[@id='brand']/text()").extract()
    if brand_text:
        brand = brand_text[0]
    else:
        brand = sel.xpath("//a[@id='brand']/@href").extract()[0].split("/")[1]

    # Item assignment on the module-level dict needs no ``global`` statement.
    csv_dict[parent_asin] = {
        'Parent_Row': ['Product', parent_asin, brand, product_name, '', price],
    }

    size_script = sel.xpath(
        "//script[@language='JavaScript']"
        "[contains(text(),'window.isTwisterAUI = 1')]"
    ).extract()[0]
    color_script = sel.xpath(
        "//script[@type='text/javascript'][contains(text(),'customerImages')]"
    ).extract()[0]

    # Variants: {asin: raw text between '["' and ']' of its display data}.
    variant_dict = {}
    display_data = (
        size_script.split('dimensionValuesDisplayData')[-1]
        .split('"deviceType')[0]
        .split("hidePopover")[0]
    )
    for entry in re.findall(r'"(.*?)]', display_data):
        asin = entry.split('[')[0].replace(':{"', '').replace('":', '')
        variant_dict[asin] = entry.split('["')[-1]
    self.variant_dict = variant_dict

    # Images: JavaScript object literal keyed by colour name.
    color_images = demjson.decode(
        color_script.split('data["colorImages"] =')[-1]
        .split('data["heroImage"] = {};')[0]
        .rsplit(';', 1)[0]
    )

    price_url = (
        sel.xpath("//script[contains(text(),'immutableURL')]/text()").extract()[0]
        .split('immutableURLPrefix":"')[-1]
        .split('"')[0]
    )
    price_url = (
        'http://www.amazon.com' + price_url
        + '&psc=1&isFlushing=2&dpEnvironment=softlines&mType=full'
    )

    image_dict = {}  # colour -> list of large image URLs, computed once
    for asin, variants in variant_dict.items():
        parts = variants.split('"')
        if len(parts) < 2:
            # Without a quoted value there is no colour to read; skip this
            # variant instead of aborting the whole product.
            logging.getLogger(__name__).warning(
                "Skipping variant %s of %s: unparseable dimensions %r",
                asin, parent_asin, variants)
            continue
        color, size = parts[-2], parts[0]

        if color not in image_dict:
            images = []
            try:
                for image in color_images[color]:
                    images.append(image['large'])
            except (KeyError, TypeError, IndexError):
                # Colour without (complete) image data: keep whatever was
                # collected so far, possibly nothing.
                pass
            image_dict[color] = images

        # Eight fixed columns; index 5 is the price slot filled in by
        # parse_price. Image URLs follow.
        row = ['SKU', asin, '', color, size, '', '', ''] + image_dict[color]

        # Request the price AJAX fragment for this variant.
        url = price_url + '&asinList=%s&id=%s' % (asin, asin)
        yield Request(
            url,
            callback=self.parse_price,
            meta={'row': row, 'parent_asin': parent_asin},
        )
def parse_category(self, response):
    """Scrape a product page; alias of ``parse``.

    Yields exactly the requests that ``self.parse(response)`` yields. The
    name is kept so that ``callback=self.parse_category`` references keep
    working while the scraping logic lives in a single method.
    """
    # Explicit loop (rather than ``return self.parse(response)``) keeps this
    # a generator function, and works on Python 2 where ``yield from`` is
    # unavailable.
    for request in self.parse(response):
        yield request