def parse_pages(self, response):
    """Parse a Shopbop listing page.

    Emits one parse_item request per product (with a pre-seeded BaseItem
    in meta) and, when a next-page link exists that differs from the
    category's entry URL, a parse_pages request for the following page.
    """
    sel = Selector(response)
    category = response.meta['category']
    product_type = response.meta['product_type']
    gender = response.meta['gender']
    category_url = response.meta['category_url']
    # Iterating directly is equivalent to the old redundant
    # `if len(...extract()) > 0` wrapper: an empty selector yields nothing.
    for item_link_li in sel.xpath('//li[contains(@class, "hproduct product")]'):
        item_link_uri = item_link_li.xpath('./div/a/@href').extract()[0]
        url = self.shopbop_base_url + item_link_uri
        baseItem = BaseItem()
        baseItem['type'] = 'base'
        baseItem['category'] = category
        baseItem['product_type'] = product_type
        baseItem['url'] = url
        baseItem['gender'] = gender
        baseItem['brand'] = item_link_li.xpath('.//div[@class="brand"]/text()').extract()[0]
        baseItem['title'] = item_link_li.xpath('.//div[@class="title"]/text()').extract()[0]
        baseItem['cover'] = item_link_li.xpath('.//img/@src').extract()[0]
        baseItem['list_price'] = handle_price(item_link_li.xpath('.//span[@class="retail-price"]/text()').extract()[0])
        baseItem['current_price'] = handle_price(item_link_li.xpath('.//span[@class="sale-price-low"]/text()').extract()[0])
        yield Request(url, callback=self.parse_item, meta={'baseItem': baseItem})
    # Follow pagination unless the next link loops back to the category's
    # own entry URL. NOTE(review): placed at function level — assumed
    # independent of whether products were found; confirm against site.
    next_page_link = sel.xpath('//span[@data-at="nextPage"]/@data-next-link').extract()
    if len(next_page_link) > 0 and (category_url[category] != next_page_link[0]):
        url = self.shopbop_base_url + next_page_link[0]
        yield Request(url, callback=self.parse_pages,
                      meta={'category': category, 'product_type': product_type,
                            'gender': gender, 'category_url': category_url})
def parse_list(self, response):
    """Parse a product listing page.

    Seeds a BaseItem per product and requests its detail page, then
    follows the "next" pagination link when present.
    """
    category = response.meta['category']
    product_type = response.meta['product_type']
    gender = response.meta['gender']
    sel = Selector(response)
    products = sel.xpath('//div[@class="row js-productWrapper"]/div/div[@class="productListItem"]/div')
    for product in products:
        item = BaseItem()
        item['from_site'] = self.name
        item['type'] = 'base'
        item['category'] = category
        item['product_type'] = product_type
        item['gender'] = gender
        item['url'] = self.base_url + product.xpath('./div[@class="productImg"]/a/@href').extract()[0]
        item['cover'] = product.xpath('./div[@class="productImg"]/a/img/@src').extract()[0]
        item['show_product_id'] = product.xpath("./@data-productid").extract()[0]
        item['title'] = product.xpath('./div[@class="productInfo"]/div[@class="productName"]/a/text()').extract()[0]
        item['brand'] = product.xpath('./div[@class="productInfo"]/div[@class="productBrand"]/a/text()').extract()[0]
        yield Request(item['url'], callback=self.parse_item, meta={"item": item})
    # Bug fix: the original selected the <a> element and appended its
    # serialized HTML to base_url, producing a garbage URL; extract the
    # link's @href instead.
    next_url = sel.xpath('//a[@class="js-filter-links iconCaret iconNext"]/@href')
    if len(next_url) > 0:
        next_url = self.base_url + next_url.extract()[0]
        yield Request(next_url, callback=self.parse_list,
                      meta={"category": category, "gender": gender, "product_type": product_type})
def parse(self, response):
    """Seed a base item for this URL and delegate to AsosSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'asos'
    seed['type'] = 'base'
    return AsosSpider().handle_parse_item(response, seed)
def parse_list(self, response):
    """Parse a lookfantastic search-results page.

    Emits one parse_item request per result, then follows pageNumber-based
    pagination while the pagination bar shows more pages.
    """
    sel = Selector(response)
    gender = response.meta['gender']
    category = response.meta['category']
    product_type = response.meta['product_type']
    # (Removed a long block of commented-out breadcrumb-parsing code.)
    for dom in sel.xpath(".//*[@id='divSearchResults']/div/div"):
        item = BaseItem()
        item['from_site'] = 'lookfantastic'
        item['type'] = 'base'
        item['gender'] = gender
        item['product_type'] = product_type
        item['category'] = category
        item['cover'] = handle_no_http(dom.xpath(".//img/@src").extract()[0])
        item['url'] = url = dom.xpath(".//img/parent::*/@href").extract()[0]
        yield Request(url, callback=self.parse_item, meta={"item": item})
    # re.split keeps the "&pageNumber=" delimiter because of the capture
    # group, so the parts are [base-url, delimiter, page-number] when present.
    currentPageArr = re.split(r'(\&pageNumber=)', response.url)
    currentPage = 1 if len(currentPageArr) < 3 else int(currentPageArr[2])
    page_count_texts = sel.xpath(
        ".//div[@class='pagination_pageNumbers']/a[last()]/text()").extract()
    # Guard: pages with no pagination bar used to raise IndexError here.
    if page_count_texts:
        pagesCount = int(page_count_texts[0])
        if currentPage < pagesCount:
            url = currentPageArr[0] + '&pageNumber=' + str(currentPage + 1)
            yield Request(url, callback=self.parse_list,
                          meta={'gender': gender, 'product_type': product_type,
                                'category': category})
def parse(self, response):
    """Seed a base item and delegate to LastcallSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'lastcall'
    seed['type'] = 'base'
    return LastcallSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item and delegate to NordstromSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'nordstrom'
    seed['type'] = 'base'
    return NordstromSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item for a Forzieri page and delegate to the shared
    item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'forzieri'
    item['url'] = response.url
    return self.handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item and delegate to LuisaviaromaSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'luisaviaroma'
    seed['type'] = 'base'
    return LuisaviaromaSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item and delegate to this spider's own item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'joesnewbalanceoutlet'
    seed['type'] = 'base'
    return self.handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item and delegate to KiplingSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'kipling'
    seed['type'] = 'base'
    return KiplingSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item and delegate to FarfetchSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'farfetch'
    seed['type'] = 'base'
    return FarfetchSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item and delegate to NetAPorterSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'netaporter'
    seed['type'] = 'base'
    return NetAPorterSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Parse a paginated JSON search response into product requests.

    The endpoint serves 40 products per page; one parse_item request is
    emitted per product, and the next page is scheduled (by rewriting
    the baseIndex query parameter) until total_pages is exhausted.

    Security/correctness fix: the original regex-replaced "true"/"false"
    in the raw body and eval()'d it — executing untrusted remote data,
    corrupting any product text containing those words, and failing on
    JSON null. json.loads handles all of that natively.
    """
    import json  # local import so the file needs no top-level change

    url = response.url
    response_body = json.loads(response.body)
    results = response_body['responseData']['results']
    if url in self.start_urls:
        metadata = results['metadata']
        total_items = metadata['totalProductCount']
        # Ceiling division: 40 products per page.
        total_pages = -(-total_items // 40)
        page_num = 1
    else:
        page_num = response.meta['page_num']
        total_pages = response.meta['total_pages']
    metadata = results['metadata']
    filter_context = metadata['filterContext']
    products = results['products']
    product_type = self.product_id_type_category_map[filter_context]['product_type']
    category = self.product_id_type_category_map[filter_context]['category']
    for product in products:
        baseItem = BaseItem()
        baseItem['product_type'] = product_type
        baseItem['category'] = category
        baseItem['show_product_id'] = product['productId']
        baseItem['list_price'] = product['price']['retail']
        baseItem['current_price'] = product['price']['high']
        baseItem['brand'] = product['brand']
        baseItem['url'] = self.base_url + product['productDetailLink']
        baseItem['cover'] = self.base_item_image_uri_suffix + product['colors'][0]['productImage']
        yield Request(baseItem['url'], callback=self.parse_item,
                      meta={'baseItem': baseItem})
    if total_pages > page_num:
        page_num = page_num + 1
        base_index = (page_num - 1) * 40
        page_url = re.sub(r'baseIndex=\d+', ('baseIndex=' + str(base_index)), url)
        yield Request(page_url, callback=self.parse,
                      meta={"page_num": page_num, 'total_pages': total_pages})
def parse_list(self, response):
    """Parse a Sierra Trading Post listing page.

    Emits one parse_item request per product and, while more pages remain
    (24 products per page), requests categoryUrl + <next page>/.

    Fixes: integer (//) division so the page math stays an int on
    Python 3; `currntePage` typo renamed.
    """
    category = response.meta['category']
    gender = response.meta['gender']
    product_type = response.meta['product_type']
    categoryUrl = response.meta['categoryUrl']
    sel = Selector(response)
    listDom = sel.xpath(".//div[@id='products']/div")
    if len(listDom.extract()) > 0:
        for dom in listDom:
            item = BaseItem()
            item['from_site'] = 'sierratradingpost'
            item['type'] = 'base'
            item['category'] = category
            item['product_type'] = product_type
            item['gender'] = gender
            item['title'] = dom.xpath("./span/@data-name").extract()[0]
            item['brand'] = dom.xpath("./span/@data-brand").extract()[0]
            item['show_product_id'] = dom.xpath("./span/@data-baseid").extract()[0]
            # Strip all whitespace, then drop the leading currency symbol.
            item['current_price'] = re.sub(
                r'\s', '',
                dom.xpath(".//span[@class='ourPrice']/text()").extract()[0])[1:]
            retailPrice = dom.xpath(".//span[@class='retailPrice']/text()").extract()[0]
            # Keep everything after the '$'.
            item['list_price'] = retailPrice[retailPrice.find('$') + 1:]
            item["url"] = url = self.base_url + dom.xpath(
                "./div[@class='productTitle']/a/@href").extract()[0]
            yield Request(url, callback=self.parse_item, meta={"item": item})
        if len(sel.xpath(".//span[@class='currentPage']/text()")) == 0:
            current_page = 1
        else:
            current_page = sel.xpath(".//span[@class='currentPage']/text()").extract()[0]
        countStr = sel.xpath(".//span[@id='numberOfItems']/text()").extract()[0]
        countTotal = int(re.sub('[(),items]', '', countStr))
        # 24 items per page; // keeps this an int under Python 3.
        lastPage = countTotal // 24 + 1 if countTotal % 24 > 0 else countTotal // 24
        if int(current_page) < int(lastPage):
            list_more_url = categoryUrl + str(int(current_page) + 1) + '/'
            yield Request(list_more_url, callback=self.parse_list,
                          meta={"categoryUrl": categoryUrl, "category": category,
                                "product_type": product_type, "gender": gender})
def parse(self, response):
    """Seed a base item for a Tiffany page and delegate to
    TiffanySpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'tiffany'
    item['url'] = response.url
    return TiffanySpider().handle_parse_item(response, item)
def parse_list(self, response):
    """Parse a Rebecca Minkoff category page.

    Follows the "next" pagination link (when present), then emits one
    parse_item request per product tile that carries a lazy-loaded image.

    Fix: removed a large dead triple-quoted block of hard-coded test
    URLs and a commented-out print; deduplicated repeated xpath calls.
    """
    sel = Selector(response)
    category = response.meta['category']
    gender = response.meta['gender']
    product_type = response.meta['product_type']
    next_links = sel.xpath(
        '//div[contains(@class, "pages")]//ol//li//a[contains(@class, "next i-next")]/@href')
    if next_links:
        yield Request(next_links.extract()[0], callback=self.parse_list,
                      meta={"category": category, 'gender': gender,
                            'product_type': product_type})
    listArr = sel.xpath(
        '//ul[contains(@class, "products-grid products-grid--max-3-col")]//li[contains(@class, "item")]')
    for lists in listArr:
        covers = lists.xpath('.//a//img/@data-src')
        if covers:
            url = lists.xpath('.//a/@href').extract()[0]
            item = BaseItem()
            item['from_site'] = 'rebeccaminkoff'
            item['url'] = url
            item['cover'] = covers.extract()[0]
            item['gender'] = gender
            item['product_type'] = product_type
            item['category'] = category
            yield Request(url, callback=self.parse_item, meta={"item": item})
def parse_list(self, response):
    """For every <li> in the product list, build a seeded BaseItem
    (cover, brand, title, list/current price) and request its detail
    page via parse_item.
    """
    prod_type = response.meta['product_type']
    cat = response.meta['category']
    gen = response.meta['gender']
    page = Selector(response)
    # "$low - $high" price-range pattern shared by both price fields.
    range_pat = r'^(\$[\d\.\,]+)\s*-\s*(\$[\d\.\,]+)'
    for entry in page.xpath('//ul[@id="totproductsList"]/li'):
        product = BaseItem()
        product['type'] = 'base'
        product['from_site'] = self.name
        product['product_type'] = prod_type
        product['category'] = cat
        product['gender'] = gen
        image = entry.xpath('./div[@class="pro_pic"]//img/@data-original').extract()[0]
        if not re.match(r'^http', image):
            image = 'http:' + image
        product['cover'] = image
        product['brand'] = entry.xpath('./a[@class="tit"]/text()').extract()[0]
        name = entry.xpath('./div[@class="info"]/a/text()').extract()[0].strip()
        product['title'] = name if name != '' else 'Product Title Temporarily Not Available'
        regular = entry.xpath('./div[@class="pro_price_black"]/text()').extract()[0].strip()
        span = re.match(range_pat, regular)
        # For a range, the high end is the list price.
        product['list_price'] = handle_price(span.group(2)) if span is not None else handle_price(regular)
        sale = entry.xpath('./div[@class="pro_price_red"]/text()').re(r'[^\$]+(.+)')
        if len(sale) > 0:
            sale_text = sale[0].strip()
            span = re.match(range_pat, sale_text)
            # For a sale range, the low end is the current price.
            product['current_price'] = handle_price(span.group(1)) if span is not None else handle_price(sale_text)
        else:
            product['current_price'] = product['list_price']
        link = entry.xpath(
            './/script[@class="catEntryDisplayUrlScript"]/text()'
        ).re_first(r'categoryDisplayJS.setCatEntryDisplayURL\("(.+)"\);')
        product['url'] = link
        yield Request(link, callback=self.parse_item, meta={'item': product})
def parse_list(self, response):
    """Parse a product-grid page: one parse_item request per product,
    then follow pageNum-based pagination (96 products per page).

    Fixes: the cleaned list/current prices were computed but never
    stored on the item (now recorded like the sibling list parsers);
    `list_pirce` typo; // division so page math stays an int on
    Python 3; ceiling division so an exact multiple of 96 no longer
    requests one page past the end.
    """
    sel = Selector(response)
    gender = response.meta['gender']
    product_type = response.meta['product_type']
    category = response.meta['category']
    for item_li in sel.xpath('//li[@class="item altview_item"]'):
        item = BaseItem()
        item['type'] = 'base'
        item['gender'] = gender
        item['product_type'] = product_type
        item['category'] = category
        item['from_site'] = self.name
        item['url'] = self.base_url + item_li.xpath('./a/@href').extract()[0]
        item['cover'] = item_li.xpath('./a/div[@class="image_wrap"]/img[1]/@src').extract()[0]
        item['brand'] = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="designer_brand"]/text()').extract()[0].strip()
        item['title'] = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="product_name"]/text()').extract()[0].strip()
        # Four markup variants: sale price box, plain price box, "eagle"
        # sale block, "eagle" plain block.
        if len(item_li.xpath('./a/div[@class="info_wrap"]/div[@class="price_box sale"]')) > 0:
            list_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="price_box sale"]/span[@class="price"]/text()').extract()[0].strip()
            current_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="price_box sale"]/span[@class="discount_price"]/text()').extract()[0].strip()
        elif len(item_li.xpath('./a/div[@class="info_wrap"]/div[@class="price_box"]/span[@class="price"]/text()')) > 0:
            list_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="price_box"]/span[@class="price"]/text()').extract()[0].strip()
            current_price = list_price
        elif len(item_li.xpath('./a/div[@class="info_wrap"]/div[@class="eagle"]/div[@class="prices sale"]')) > 0:
            list_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="eagle"]/div[@class="prices sale"]/span[@class="prices__retail-strikethrough"]/text()').extract()[0].strip()
            current_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="eagle"]/div[@class="prices sale"]/span[@class="prices__markdown"]/text()').extract()[0].strip()
        else:
            list_price = item_li.xpath('./a/div[@class="info_wrap"]/div[@class="eagle"]/div[@class="prices"]/span/text()').extract()[0].strip()
            current_price = list_price
        if 'USD' in list_price:
            list_price = list_price.replace('USD', '')
        if 'USD' in current_price:
            current_price = current_price.replace('USD', '')
        # Bug fix: previously these cleaned values were silently dropped.
        item['list_price'] = list_price
        item['current_price'] = current_price
        yield Request(item['url'], callback=self.parse_item, meta={'item': item})
    # Pagination.
    if 'pageNum' in str(response.url):
        current_page = re.search(r'pageNum=([\d]+)', str(response.url)).group(1)
    else:
        current_page = 1
    total_goods = sel.xpath('.//li[@class="spacer"]//span/text()').extract()[0]
    if 'Items' in total_goods:
        total_goods = total_goods.replace('Items', '')
    # Ceiling division: 96 items per page.
    last_page = -(-int(total_goods) // 96)
    next_page = int(current_page) + 1
    if int(current_page) < int(last_page):
        if 'pageNum' in str(response.url):
            next_url = re.sub(r'pageNum=[\d]+', 'pageNum=' + str(next_page), str(response.url))
        elif '?' not in response.url:
            next_url = str(response.url) + '?pageNum=' + str(next_page)
        else:
            next_url = str(response.url) + '&pageNum=' + str(next_page)
        yield Request(next_url, callback=self.parse_list,
                      meta={'gender': gender, 'product_type': product_type, 'category': category},
                      cookies={'ckm-ctx-sf': '/'})
def parse_pages(self, response):
    """Parse a drugstore.com category listing page.

    Gender is inferred from a fixed set of category landing URLs; one
    parse_item request is emitted per product tile that has a cover
    image, and the next-page link is followed when present.

    Fixes: guarded `.re_first(...).strip()` so a tile whose brand text
    does not match "<brand> -" no longer raises AttributeError; removed
    a dead commented-out price-extraction block; Chinese comments
    translated to English.
    """
    income_url = response.url
    gender = 'unisex'
    if income_url == "http://www.drugstore.com/personal-care/mens/qxg286961-0" or income_url == "http://www.drugstore.com/vitamins/for-men/qxg180683-0":
        gender = 'men'
    elif income_url == "http://www.drugstore.com/vitamins/for-children/qxg180682-0" or income_url == "http://www.drugstore.com/medicine-and-health/childrens-healthcare/qxg180623-0":
        gender = "kid-unisex"
    elif income_url == "http://www.drugstore.com/vitamins/for-women/qxg180684-0":
        gender = "women"
    sel = Selector(response)
    product_type = response.meta['product_type']
    category = response.meta['category']
    # Each "itemGrid" div represents one product tile.
    for item_list_link in sel.xpath('//div[contains(@class, "itemGrid")]'):
        cover_div = item_list_link.xpath('./div[@class="prodImg"]/a/img/@src').extract()
        if len(cover_div) > 0:
            baseItem = BaseItem()
            baseItem['gender'] = gender
            baseItem['product_type'] = product_type
            baseItem['category'] = category
            baseItem['cover'] = cover_div[0]
            brand = item_list_link.xpath('./div[@class="info"]//span[@class="name"]/text()')
            if len(brand) > 0:
                # re_first returns None when the pattern is absent;
                # skip the brand rather than crash the whole page.
                brand_text = brand.re_first(r'([^-]+)-\s*$')
                if brand_text is not None:
                    baseItem['brand'] = brand_text.strip()
            item_list_link_uri = item_list_link.xpath('./div[@class="prodImg"]/a/@href')
            baseItem['url'] = self.base_url + item_list_link_uri.extract()[0]
            yield Request(baseItem['url'], callback=self.parse_item,
                          meta={'baseItem': baseItem})
    # Fetch the next-page URL, if any.
    next_page_url = sel.xpath('//a[@class="nextpage"]/@href')
    if len(next_page_url) > 0:
        next_page = self.base_url + next_page_url.extract()[0]
        yield Request(next_page, callback=self.parse_pages,
                      meta={'product_type': product_type, 'category': category})
def parse(self, response):
    """Seed a base item for a Pharmacy Online page and delegate to
    PharmacyonlineSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'pharmacyonline'
    item['url'] = response.url
    return PharmacyonlineSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item and delegate to LordandtaylorSpider's item parser."""
    seed = BaseItem()
    seed['url'] = response.url
    seed['from_site'] = 'lordandtaylor'
    seed['type'] = 'base'
    return LordandtaylorSpider().handle_parse_item(response, seed)
def parse(self, response):
    """Seed a base item for a Sierra Trading Post page and delegate to
    SierratradingpostSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'sierratradingpost'
    item['url'] = response.url
    return SierratradingpostSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for an AllSole page and delegate to
    AllsoleSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'allsole'
    item['url'] = response.url
    return AllsoleSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for a Disney Store page and delegate to
    DisneystoreSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'disneystore'
    item['url'] = response.url
    return DisneystoreSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for a 7 For All Mankind page and delegate to
    AllmankindSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'sevenforallmankind'
    item['url'] = response.url
    return AllmankindSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for a Levi's page and delegate to LeviSpider's
    item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'levi'
    item['url'] = response.url
    return LeviSpider().handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for a Saks Fifth Avenue page and delegate to
    this spider's own item parser.

    Fix: removed an unused `sel = Selector(response)` local and a dead
    commented-out spider instantiation.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'saksfifthavenue'
    item['url'] = response.url
    return self.handle_parse_item(response, item)
def parse(self, response):
    """Seed a base item for a MyBag page and delegate to MybagSpider's
    item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'mybag'
    item['url'] = response.url
    return MybagSpider().handle_parse_item(response, item)
def parse_list_more(self, response):
    """Parse a Levi's "replacement products" listing fragment.

    For kids' listings the concrete gender (baby/toddler/girls/boys) is
    resolved from the leading word(s) of each product title.

    Bug fix: the original reassigned the shared `gender` variable inside
    the loop, so once one kids item resolved to e.g. "boys", every
    subsequent item skipped the `gender == 'kids'` branch and inherited
    the previous item's gender. Resolution is now per item.
    """
    category = response.meta['category']
    product_type = response.meta['product_type']
    gender = response.meta['gender']
    sel = Selector(response)
    uriDoms = sel.xpath("//*[@id='replacement-products']/li")
    if len(uriDoms.extract()) == 0:
        return
    # Maps the title's leading word(s) to a normalized kids gender.
    genderRule = {
        "infant": "baby",
        "newborn": "baby",
        "baby": "baby",
        "toddler": "toddler",
        "little girls": "girls",
        "little boys": "boys",
        "girls": "girls",
        "boys": "boys"
    }
    for dom in uriDoms:
        uri = dom.xpath('.//@data-product-url').extract()[0]
        item = BaseItem()
        item['from_site'] = 'levi'
        item['type'] = 'base'
        item['category'] = category
        item['product_type'] = product_type
        item['title'] = dom.xpath(".//p[@class='name']/text()").extract()[0]
        item_gender = gender
        if gender == 'kids':
            words = re.findall(r"\w+", item['title'])
            key = words[0].lower()
            if key in genderRule:
                item_gender = genderRule[key]
            else:
                # Try a two-word prefix (e.g. "little girls").
                key += ' ' + words[1].lower()
                if key in genderRule:
                    item_gender = genderRule[key]
        item['gender'] = item_gender
        item['colors'] = [dom.xpath(".//p[@class='finish']/text()").extract()[0]]
        item["show_product_id"] = dom.xpath('.//@data-product-id').extract()[0]
        item['cover'] = dom.xpath(".//img/@src").extract()[0]
        item["url"] = url = self.base_url + uri
        yield Request(url, callback=self.parse_item, meta={"item": item})
def parse(self, response):
    """Seed a base item (with an empty cover placeholder) for an SSENSE
    page and delegate to SsenseSpider's item parser.

    Fix: removed an unused `sel = Selector(response)` local.
    """
    item = BaseItem()
    item['type'] = 'base'
    item['from_site'] = 'ssense'
    item['url'] = response.url
    item['cover'] = ''
    return SsenseSpider().handle_parse_item(response, item)