def parse(self, response):
    """Scrape product tiles (website 23) and yield one detail Request per product.

    Prices are struck-through regular price (<s>) and current offer price,
    reduced to digits and a decimal point.
    """
    products = response.xpath(
        "//div[@class='product-list']/div/div[@class='product-details-container']"
    )
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            ".//h4/span[@itemprop='name']/text()").extract_first().strip()
        item['original_url'] = response.urljoin(
            prod.xpath("a/@href").extract_first())
        # Raw-string patterns: '\d'/'\.' in plain strings are invalid escapes.
        item['reg_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                ".//span[@itemprop='offers']/s/span[@itemprop='price']/text()"
            ).extract_first().strip())
        item['sale_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                ".//span[@itemprop='offers']/span[@itemprop='price']/text()"
            ).extract_first().strip())
        item['website_id'] = 23
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
def parse(self, response):
    """Scrape listing anchors (website 17) and page on via self.baseindex.

    Stops when a page yields no products; otherwise advances the paging
    offset by the batch size and requests the next page.
    """
    products = response.xpath(
        '//div[@class="product-list"]/ul/li//div[@class="info clearfix"]/a[contains(@class, "url")]'
    )
    if len(products) == 0:
        return
    # Advance the offset before yielding items; the next-page request below
    # uses the updated value.
    self.baseindex = self.baseindex + len(products)
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            'div[@class="title"]/text()').extract_first().strip()
        item['brand'] = prod.xpath(
            'div[@class="brand"]/text()').extract_first().strip()
        item['original_url'] = response.urljoin(
            prod.xpath('@href').extract_first())
        # Keep commas too: prices on this site may be comma-grouped.
        item['reg_price'] = re.sub(
            r'[^\d\.\,]', '',
            prod.xpath('.//span[@class="retail-price"]/text()'
                       ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.\,]', '',
            prod.xpath(
                './/span[@class="sale-price"]/span[@class="sale-price-low"]/text()'
            ).extract_first()).strip()
        item['website_id'] = 17
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    yield Request(self.nextpage_url % self.baseindex, callback=self.parse)
def parse(self, response):
    """Scrape listing pages (website 14) with two bootstrap steps.

    The first invocation switches the site currency (self.usd_set), the
    second requests page 1 (self.page_num); subsequent calls scrape products
    and keep paging while pages are non-empty.
    """
    if self.usd_set == 0:
        # One-off: set USD before reading any prices.
        self.usd_set = 1
        yield Request(self.setcurrency_url, callback=self.parse)
        return
    elif self.page_num == 0:
        self.page_num = 1
        yield Request(self.nextpage_url % self.page_num, callback=self.parse)
        return
    products = response.xpath(
        '//div[@id="products"]//div[@class="item first"]/div[@class="details"]/div[@class="details-content"]')
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath('h4/a/text()').extract_first().strip()
        item['original_url'] = response.urljoin(
            prod.xpath('h4/a/@href').extract_first())
        # Regular price is the class-less <li>; sale price carries class "now".
        item['reg_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath('ul[@class="price"]/li[not(@class)]/text()'
                       ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath('ul[@class="price"]/li[@class="now"]/text()'
                       ).extract_first()).strip()
        item['website_id'] = 14
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    if len(products) > 0:
        self.page_num = self.page_num + 1
        yield Request(self.nextpage_url % self.page_num, callback=self.parse)
def parse(self, response):
    """Page through Swarovski's JSON search endpoint (website 18).

    Fetches pages synchronously with `requests` (the Scrapy response is not
    used), 36 products per page, and stops on the first short page. Yields a
    detail Request per product.
    """
    # Pre-encoded search query; only the page number varies, appended below.
    base_url = ('http://www.swarovski.com/Web_US/en/json/json-result?SearchParameter='
                '%26%40QueryTerm%3D*%26CategoryUUIDLevelX%3DkTUKaSUCyn4AAAEnV9lToUKM'
                '%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%3DInYKaVgfvWsAAAFaO6s2M2Wp'
                '%26CategoryUUIDLevelX%252FkTUKaSUCyn4AAAEnV9lToUKM%252FInYKaVgfvWsAAAFaO6s2M2Wp%3DTxcKaVgfw6MAAAFaOqs2M2Wp'
                '%26%40Sort.FFSort%3D0%26%40Page%3D')
    page_num = 1
    while True:
        ajax_url = base_url + str(page_num) + '&PageSize=36&View=M'
        products = requests.get(ajax_url, headers={
            'X-Requested-With': 'XMLHttpRequest'
        }).json()['SearchResult']['Products']
        for prod in products:
            item = Product()
            item['Name'] = prod['Name']
            item['original_url'] = prod['DetailPage']
            item['reg_price'] = re.sub(r'[^\d\.]', '', prod['OldPrice'])
            item['sale_price'] = re.sub(r'[^\d\.]', '', prod['Price'])
            item['website_id'] = 18
            item['category_id'] = 4
            # Discount arrives as e.g. "-30%"; keep digits only.
            item['discount'] = re.sub(r'[^\d]', '', prod['PricePercent'])
            yield Request(item['original_url'], meta={'item': item},
                          callback=self.parse_detail)
        if len(products) < 36:
            break
        page_num = page_num + 1
def parse(self, response): products = json.loads(re.search('\"GRID_DATA\"\:(.*)\,[\s]*\"navigation', response.xpath("//script[contains(text(), 'GRID_DATA')]/text()").extract_first().encode("utf-8"), re.M|re.S|re.I).group(1) + '}')["_embedded"]["results"][0]["products"] # print len(products) offset = 0 while 1: for prod in products: item = Product() item['Name'] = prod["content"]['shortName'] item['original_url'] = prod['materials'][0]["_links"]["web:locale"]["href"] item['reg_price'] = re.sub('[^\d\.]', '', str(prod['priceRange']["msrp"]["min"])) item['sale_price'] = re.sub('[^\d\.]', '', str(prod['priceRange']["base"]["min"])) item['website_id'] = 22 item['category_id'] = 2 item['description'] = prod["content"]['categoryName'] item['original_image_url'] = ["http://underarmour.scene7.com/is/image/Underarmour/" + prod['materials'][0]["assets"][0]["image"] + "?template=v65GridLarge&$size=599,735&$wid=281&$hei=345&$extend=0,220,0,0"] item['image_urls'] = item['original_image_url'] yield item if len(products) < 60: break offset = offset + len(products) products = json.loads(re.search('\)\]\}\'\,(.*)\,[\s]*\"navigation', requests.get("https://www.underarmour.com/en-us/api/json-grid/outlet/womens/tops/g/6cl?s=&q=&p=&offset=%d&limit=60&stackId=other_grid_header&stackIdx=0&t[IsNewLoadMoreGrid]=0" % offset, headers={'X-Requested-With': "XMLHttpRequest"}).text.encode("utf-8"), re.M|re.S|re.I).group(1) + '}')["_embedded"]["results"][0]["products"] if len(products) == 0: break
def parse(self, response):
    """Scrape Anthropologie category tiles (website 5) and follow pagination.

    Yields a detail Request per tile; the next page comes from the
    aria-label="next page" anchor.
    """
    sel = Selector(response)
    for tile in sel.xpath('//div[@class="dom-category-browse"]/div[2]/div'):
        name = tile.xpath(
            'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/a/h3/span/text()'
        ).extract()
        reg_price = tile.xpath(
            'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/p[@class="c-product-tile__price c-product-tile__price--regular"]/span/span[@class="c-product-meta__original-price"]/text()'
        ).extract()
        sale_price = tile.xpath(
            'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/p[@class="c-product-tile__price c-product-tile__price--regular"]/span/span[@class="c-product-meta__current-price c-product-meta__current-price--sale"]/text()'
        ).extract()
        original_url = "https://www.anthropologie.com" + "".join(
            tile.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-details c-product-tile-details--regular"]/a/@href'
            ).extract()).strip()
        original_image_url = "".join(
            tile.xpath(
                'span[@itemprop="product"]/div[@class="c-product-tile-controls__link-wrap js-product-tile-controls__link-wrap"]/a/img/@src'
            ).extract()).strip()
        # Image src may be protocol-relative ("//images...").
        if 'https:' not in original_image_url:
            original_image_url = 'https:' + original_image_url
        item = Product()
        item['Name'] = "".join(name).strip()
        item['reg_price'] = "".join(reg_price).strip().replace('$', '').strip()
        item['sale_price'] = "".join(sale_price).strip().replace(
            '$', '').strip()
        item['brand'] = "Anthropologie"
        item['original_url'] = original_url
        item['website_id'] = 5
        item['category_id'] = 2
        item['original_image_url'] = [original_image_url]
        yield Request(original_url, meta={'item': item},
                      callback=self.each_detail)
    next_page = sel.xpath('//a[@aria-label="next page"]/@href').extract()
    if len(next_page) > 0:
        yield Request("https://www.anthropologie.com" + next_page[0],
                      callback=self.parse)
def parse(self, response):
    """Scrape product tiles (website 15) and page on by item offset (self.start_num)."""
    products = response.xpath('//div[@class="product-tile"]')
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            'div[@class="product-name"]/h2/a[@class="name-link"]/text()'
        ).extract_first().strip()
        item['original_url'] = response.urljoin(prod.xpath(
            'div[@class="product-name"]/h2/a[@class="name-link"]/@href'
        ).extract_first())
        item['reg_price'] = re.sub(r'[^\d\.]', '', prod.xpath(
            'div[@class="product-pricing"]//span[@class="price-standard"]/text()'
        ).extract_first().strip())
        item['sale_price'] = re.sub(r'[^\d\.]', '', prod.xpath(
            'div[@class="product-pricing"]//span[@class="price-sales"]/text()'
        ).extract_first().strip())
        item['website_id'] = 15
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    if len(products) > 0:
        # Advance the paging offset by the batch just consumed.
        self.start_num = self.start_num + len(products)
        yield Request(self.nextpage_url % self.start_num, callback=self.parse)
def parse(self, response):
    """Yield Product items from the spider's JSON feed (website 12).

    The feed at self.ajax_url is fetched synchronously with `requests`;
    the Scrapy response argument is not used.
    """
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    feed = requests.get(self.ajax_url, headers=headers).json()
    for entry in feed['Products']:
        item = Product()
        item['Name'] = entry['ModelName']
        item['original_url'] = entry['ProductUrl']
        item['reg_price'] = entry['MaxRegularPrice']
        item['sale_price'] = entry['MinSalePrice']
        item['website_id'] = 12
        item['category_id'] = 2
        item['original_image_url'] = [entry['ProductImageUrl']]
        item['image_urls'] = item['original_image_url']
        yield item
def parse(self, response):
    """Scrape listing items (website 9) and follow the 'next' pagination link."""
    sel = Selector(response)
    for prod in sel.xpath(
            '//div[@id="mainResults"]/ul/li[@class="item thumbnail-item"]'):
        item = Product()
        item['Name'] = prod.xpath(
            'ul[@class="feature-list"]/li[@class="fl-item title"]/a/text()'
        ).extract_first().strip()
        item['original_url'] = prod.xpath(
            'ul[@class="feature-list"]/li[@class="fl-item title"]/a/@href'
        ).extract_first().strip()
        item['reg_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                'ul[@class="feature-list"]/li[@class="fl-item price"]/a/span[@class="plp_product__strikeoutprice"]/b/text()'
            ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                'ul[@class="feature-list"]/li[@class="fl-item price"]/a/span[@class="fontSale plp_product_price"]/b/text()'
            ).extract_first()).strip()
        item['website_id'] = 9
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    try:
        nextpage_url = response.urljoin(sel.xpath(
            '//ul[@class="no-bullet paging"]/li[@class="next"]/a/@href'
        ).extract_first()).strip()
        if not nextpage_url:
            return
        yield Request(nextpage_url, callback=self.parse)
    except Exception:
        # No 'next' anchor (extract_first() -> None) means the last page.
        pass
def get_products(self, response):
    """Yield Product items from a JSON search-results response (website 19).

    Also records the batch size on self.products_num (presumably read by
    pagination logic elsewhere in the spider — not visible here).
    """
    products = json.loads(response.text)['response']['searchResults']
    self.products_num = len(products)
    for prod in products:
        item = Product()
        item['Name'] = prod['productName']
        item['original_url'] = prod['url']
        item['reg_price'] = re.sub(r'[^\d\.]', '', prod['listPrice'])
        item['sale_price'] = re.sub(r'[^\d\.]', '', prod['minPrice'])
        item['website_id'] = 19
        item['category_id'] = 3
        item['description'] = prod['productDescription']
        item['original_image_url'] = [
            prod['defaultImage']['productDetailMain']
        ]
        item['image_urls'] = item['original_image_url']
        yield item
def parse(self, response):
    """Scrape search-result tiles (website 13).

    Tiles without a "Regular Price" span get reg_price '0.0'; tiles without a
    "Sale Price" span fall back to the plain sales price (low end of a range).
    """
    products = response.xpath(
        '//ul[@id="search-result-items"]/li/div[@class="product-tile"]')
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            'div[@class="product-caption"]/div[@class="product-name"]/h2/a/text()'
        ).extract_first().strip()
        item['original_url'] = prod.xpath(
            'div[@class="product-caption"]/div[@class="product-name"]/h2/a/@href'
        ).extract_first().strip()
        try:
            item['reg_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    'div[@class="product-caption"]/div[@class="product-pricing"]//span[@title="Regular Price"]/text()'
                ).extract_first().strip())
        except Exception:
            item['reg_price'] = '0.0'
        try:
            item['sale_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    'div[@class="product-caption"]/div[@class="product-pricing"]//span[@title="Sale Price"]/text()'
                ).extract_first().strip())
        except Exception:
            # Ranges like "$10 - $20" keep the low end.
            item['sale_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    'div[@class="product-caption"]/div[@class="product-pricing"]/span[@class="product-sales-price"]/text()'
                ).extract_first().strip().split('-')[0])
        item['website_id'] = 13
        item['category_id'] = 2
        item['original_image_url'] = [
            prod.xpath(
                'div[@class="product-image"]/a/img[@class="product-image"]/@src'
            ).extract_first()
        ]
        item['image_urls'] = item['original_image_url']
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
def parse(self, response):
    """Scrape product tiles (website 16); missing reg/sale prices default to '0.0'."""
    products = response.xpath('//ul/li/div[@class="product-tile"]')
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            'h6[@class="product-name"]/a[@class="name-link"]/text()'
        ).extract_first().strip()
        item['original_url'] = prod.xpath(
            'h6[@class="product-name"]/a[@class="name-link"]/@href'
        ).extract_first().strip()
        try:
            item['reg_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    'div[@class="product-pricing"]/span[@title="Regular Price"]/text()'
                ).extract_first().strip())
        except Exception:
            item['reg_price'] = '0.0'
        try:
            item['sale_price'] = re.sub(
                r'[^\d\.]', '',
                prod.xpath(
                    'div[@class="product-pricing"]/span[@title="Sale Price"]/text()'
                ).extract_first().strip())
        except Exception:
            item['sale_price'] = '0.0'
        item['website_id'] = 16
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
def parse(self, response):
    """Scrape product-grid cards (website 11), including the sale-badge
    discount, and keep paging via self.page_num until a page is empty."""
    products = response.xpath(
        '//div[@id="product-grid"]//div[@class="product-tile"]//div[contains(@class, "innercard col")]'
    )
    if len(products) == 0:
        return
    # Offset for the next page = total items consumed so far.
    self.page_num = self.page_num + len(products)
    for prod in products:
        item = Product()
        item['Name'] = prod.xpath(
            'div/div[@class="product-info-inner-content clearfix with-badges"]/a/@data-productname'
        ).extract_first().strip()
        item['original_url'] = prod.xpath(
            'div/div[@class="product-info-inner-content clearfix with-badges"]/a/@href'
        ).extract_first().strip()
        item['reg_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                './/div[@class="price"]/span[@class="strike"]/span[@class="baseprice"]/text()'
            ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                './/div[@class="price"]/span[@class="salesprice discount-price"]/text()'
            ).extract_first()).strip()
        item['website_id'] = 11
        item['category_id'] = 2
        item['discount'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                'div[@class="badge sale"]/span[@class="badge-text"]/text()'
            ).extract_first()).strip()
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    yield Request(self.nextpage_url % self.page_num, callback=self.parse)
def parse(self, response):
    """Scrape grid cells (website 8) and follow the 'next' pagination link.

    The sale price is bare text directly under the cart span (not wrapped in
    its own element), hence the joined text() extraction.
    """
    sel = Selector(response)
    for prod in sel.xpath(
            '//div[@class="productlist"]/div[@class="productrows"]//div[contains(@class, "prodgrid")]'):
        item = Product()
        item['Name'] = prod.xpath(
            'span[@class="details"]/a/text()').extract_first().strip()
        item['original_url'] = response.urljoin(
            prod.xpath(
                'span[@class="details"]/a/@href').extract_first()).strip()
        item['reg_price'] = re.sub(
            r'[^\d\.]', '',
            prod.xpath(
                'span[@class="cart"]/span[@class="wasPrice price"]/strike/text()'
            ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.]', '',
            ''.join(prod.xpath('span[@class="cart"]/text()').extract())).strip()
        item['website_id'] = 8
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    try:
        nextpage_url = response.urljoin(sel.xpath(
            '//div[@class="pagination"]/div[@class="paginationlinks"]/ul/li[@class="next"]/a/@href'
        ).extract_first()).strip()
        if not nextpage_url:
            return
        yield Request(nextpage_url, callback=self.parse)
    except Exception:
        # No 'next' anchor on the last page.
        pass
def get_products(self, response):
    """Build items from a JSON listing (website 20) and fetch each description.

    Records the batch size on self.products_num (presumably read by the
    pagination logic elsewhere in the spider — not visible here).
    """
    products = json.loads(response.text)['Products']['List']
    self.products_num = len(products)
    for prod in products:
        item = Product()
        item['Name'] = prod['Description']
        item['original_url'] = response.urljoin(prod['ProductUrl'])
        item['reg_price'] = re.sub(r'[^\d\.]', '', prod['PriceDisplay'])
        item['sale_price'] = re.sub(r'[^\d\.]', '', prod['PriceSaleDisplay'])
        item['website_id'] = 20
        item['category_id'] = 2
        item['original_image_url'] = [prod['ImageMain']]
        item['image_urls'] = item['original_image_url']
        item['discount'] = re.sub(r'[^\d\.]', '', prod['PercentageOff'])
        item['brand'] = prod['DesignerName']
        yield Request(url=item['original_url'],
                      callback=self.get_description,
                      meta={'item': item})
def parse(self, response):
    """Scrape Endeca search-result rows (website 21) and follow the 'next' link."""
    products = response.xpath(
        "//div[@class='mainsite_record_listing']/div[@id='endeca_search_results']/ul/li[not(@class)]"
    )
    if len(products) == 0:
        return
    for prod in products:
        item = Product()
        # The product link is the anchor WITHOUT the tracking onmousedown
        # handler; its text may be split into several nodes, hence join().
        item['Name'] = ''.join(
            prod.xpath('a[not(@onmousedown)]/text()').extract()).strip()
        item['original_url'] = prod.xpath(
            'a[not(@onmousedown)]/@href').extract_first()
        item['reg_price'] = re.sub(
            r'[^\d\.\,]', '',
            prod.xpath("p[@class='product_price']/strike/b/text()"
                       ).extract_first()).strip()
        item['sale_price'] = re.sub(
            r'[^\d\.\,]', '',
            prod.xpath("p[@class='product_price']/em/b/text()"
                       ).extract_first()).strip()
        item['website_id'] = 21
        item['category_id'] = 2
        yield Request(item['original_url'], meta={'item': item},
                      callback=self.parse_detail)
    next_page = response.xpath(
        "//div[@class='endeca_pagination']/a[@class='next']/@href"
    ).extract_first()
    if next_page:
        # dont_filter: bypass Scrapy's duplicate-request filter for paging.
        yield Request(response.urljoin(next_page), callback=self.parse,
                      dont_filter=True)
def parse(self, response):
    """Page through Dillard's product JSON feed (website 10).

    Pages are fetched synchronously with `requests`; self.page_num serves as
    the feed's beginIndex and advances by the batch size until a page is empty.
    """
    while True:
        products = requests.get(self.ajax_url % self.page_num, headers={
            'X-Requested-With': 'XMLHttpRequest'
        }).json()['products']
        if not products:
            break
        for prod in products:
            item = Product()
            item['Name'] = prod['name']
            # Rebuild the canonical product URL from feed fields.
            item['original_url'] = 'http://www.dillards.com/p/' + prod[
                'nameForURL'] + '/' + prod['catentryId'] + '?di=' + prod[
                'fullImage'] + '&categoryId=410&facetCache=pageSize=96&beginIndex=%d&orderBy=1' % self.page_num
            item['reg_price'] = re.sub(r'[^\d\.]', '', prod['listMax'])
            item['sale_price'] = re.sub(r'[^\d\.]', '', prod['offerMin'])
            item['website_id'] = 10
            item['category_id'] = 2
            yield Request(item['original_url'], meta={'item': item},
                          callback=self.parse_detail)
        self.page_num = self.page_num + len(products)
def parse(self, response):
    # Scrape Macy's listing thumbnails (website 4), then compute the next
    # page URL by splicing an incremented page number into the current URL's
    # "Productsperpage/<page>,<size>" segment.
    sel = Selector(response)
    all_divs = sel.xpath('//ul[@id="thumbnails"]/li')
    for a in all_divs:
        Name = a.xpath(
            'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="shortDescription"]/a/text()'
        ).extract()
        description = ""
        reg_price = a.xpath(
            'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="prices"]/span[@class="colorway-price"][1]/span[@class="first-range "]/text()'
        ).extract()
        sale_price = a.xpath(
            'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="prices"]/span[@class="colorway-price"]/span[@class="first-range priceSale"]/text()'
        ).extract()
        website_id = 4
        brand = ""
        original_url = a.xpath(
            'div[@class="innerWrapper"]/div[@class="textWrapper"]/div[@class="shortDescription"]/a/@href'
        ).extract()
        original_url = "https://www.macys.com" + "".join(
            original_url).strip()
        original_image_url = ""
        category_id = 2
        item = Product()
        item['Name'] = "".join(Name).strip()
        # Price text presumably looks like "Reg. $99.00": keep the token after
        # the first space and drop '$'. On any other shape the bare except
        # leaves the raw joined value in place — TODO confirm against the site.
        try:
            reg_price = "".join(reg_price).strip().replace(
                '$', '').strip().split(' ')[1].strip()
        except:
            pass
        try:
            sale_price = "".join(sale_price).strip().replace(
                '$', '').strip().split(' ')[1].strip()
        except:
            pass
        item['reg_price'] = reg_price
        item['sale_price'] = sale_price
        item['original_url'] = original_url
        item['website_id'] = website_id
        item['category_id'] = category_id
        yield Request(original_url, meta={'item': item},
                      callback=self.each_detail)
    # Pagination: current page comes from the URL itself; the page count is
    # scraped from a raw "totalPageCount:" token in the response body.
    try:
        current_page_no = response.url.split(
            'Productsperpage/')[1].strip().split(',')[0].strip()
        max_page_no = response.body.split(
            'totalPageCount:')[1].strip().split(',')[0].strip()
        if int("".join(current_page_no).strip()) < int(
                "".join(max_page_no).strip()):
            current_page_no = int("".join(current_page_no).strip()) + 1
            temp_link = response.url.split('Productsperpage/')[0].strip(
            ) + "Productsperpage/" + str(
                current_page_no) + "," + response.url.split(
                    'Productsperpage/')[1].strip().split(',')[1].strip()
            yield Request(temp_link, callback=self.parse)
    except:
        # Any parsing failure (URL shape, missing token) silently ends paging.
        pass
def parse(self, response):
    # Scrape listing thumbnails (website 6), then build the next page URL by
    # splicing "/Pageindex/<n>" into the canonical URL before its "?id=" query.
    sel = Selector(response)
    all_divs = sel.xpath('//ul[@id="thumbnails"]/li')
    for a in all_divs:
        Name = a.xpath(
            'div/div[@class="shortDescription newProdDesc"]/div[@id="prodName"]/a/text()'
        ).extract()
        description = ""
        reg_price = a.xpath(
            'div/div[@class="prices"]/div[@class="priceSale colorwayBrowse"]/div/text()'
        ).extract()
        sale_price = a.xpath(
            'div/div[@class="prices"]/div[@class="priceSale colorwayBrowse"]/div/span[@class="priceSale"]/text()'
        ).extract()
        website_id = 6
        brand = a.xpath(
            'div/div[@class="shortDescription newProdDesc"]/div[@id="brandName"]/a/text()'
        ).extract()
        original_url = a.xpath(
            'div/div[@class="shortDescription newProdDesc"]/div[@id="prodName"]/a/@href'
        ).extract()
        original_url = "".join(original_url).strip()
        original_image_url = ""
        category_id = 2
        # Price text presumably looks like "Now $49.00": keep the token after
        # the first space and drop '$'. On any other shape the bare except
        # leaves the raw joined value in place — TODO confirm against the site.
        try:
            reg_price = "".join(reg_price).strip().replace(
                '$', '').strip().split(' ')[1].strip()
        except:
            pass
        try:
            sale_price = "".join(sale_price).strip().replace(
                '$', '').strip().split(' ')[1].strip()
        except:
            pass
        item = Product()
        item['Name'] = "".join(Name).strip()
        item['reg_price'] = reg_price
        item['sale_price'] = sale_price
        item['brand'] = "".join(brand).strip()
        item['original_url'] = original_url
        item['website_id'] = website_id
        item['category_id'] = category_id
        yield Request(original_url, meta={'item': item},
                      callback=self.each_detail)
    # Pagination: current page from a hidden <li>, max page from the last
    # option of the page-selector dropdown.
    try:
        next_page = sel.xpath('//link[@rel="canonical"]/@href').extract()
        current_page_no = sel.xpath(
            '//li[@class="currentPage displayNone"][1]/text()').extract()
        temp_page = sel.xpath(
            '//select[@id="paginationDdl"]/option/@value').extract()
        max_page_no = temp_page[len(temp_page) - 1]
        if int("".join(current_page_no).strip()) < int(
                "".join(max_page_no).strip()):
            current_page_no = int("".join(current_page_no).strip()) + 1
            temp_link = "".join(next_page).strip().split('?id')[0].strip(
            ) + "/Pageindex/" + str(current_page_no) + "?id=" + "".join(
                next_page).strip().split('?id=')[1].strip()
            yield Request(temp_link, callback=self.parse)
    except:
        # Missing pagination widgets (single page) silently end paging.
        pass
def parse(self, response):
    """Scrape Barneys Warehouse listing tiles (website 3) and follow
    &page= pagination driven by the hidden currentPageNumber input."""
    sel = Selector(response)
    for li in sel.xpath('//div[@id="atg_store_prodList"]/ul/li'):
        name = li.xpath(
            'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-name"]/a/text()'
        ).extract()
        reg_price = li.xpath(
            'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/span[@class="product-discounted-price"]/text()'
        ).extract()
        sale_price = li.xpath(
            'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/span[@class="product-sales-price"]/text()'
        ).extract()
        brand = li.xpath(
            'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="brand"]/a/text()'
        ).extract()
        original_url = "http://www.barneyswarehouse.com" + "".join(
            li.xpath(
                'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-name"]/a/@href'
            ).extract()).strip()
        # Discount is loose text (e.g. "40% Off") next to the price spans.
        discount = "".join(li.xpath(
            'div[@class="product-tile "]/div[@class="wrap-desc"]/div[@class="product-pricing"]/div[@class="product-standard-price"]/text()'
        ).extract()).strip().replace('Off', '').replace('%', '').strip()
        item = Product()
        item['Name'] = "".join(name).strip()
        item['reg_price'] = "".join(reg_price).strip().replace('$', '').strip()
        item['sale_price'] = "".join(sale_price).strip().replace(
            '$', '').strip()
        item['brand'] = "".join(brand).strip()
        item['original_url'] = original_url
        item['discount'] = discount
        item['website_id'] = 3
        item['category_id'] = 2
        yield Request(original_url, meta={'item': item},
                      callback=self.each_detail)
    # Hidden input carries current page in @value and last page in @max.
    current_page_no = sel.xpath(
        '//input[@id="currentPageNumber"][1]/@value').extract()
    max_page_no = sel.xpath(
        '//input[@id="currentPageNumber"][1]/@max').extract()
    try:
        if int("".join(current_page_no).strip()) < int(
                "".join(max_page_no).strip()):
            next_no = int("".join(current_page_no).strip()) + 1
            temp_link = response.url.split(
                '&page=')[0].strip() + "&page=" + str(next_no)
            yield Request(temp_link, callback=self.parse)
    except Exception:
        # Missing/non-numeric pagination inputs mean last page; stop quietly.
        pass