def parse_product(self, response):
    """Parse a product page reached directly from a listing.

    Builds the metadata dict from scratch (model/region from the URL,
    categories from breadcrumbs, price/name/description/details/colors
    from the page), yields one image-fetch Request per color variant,
    and finally yields the ProductItem.
    """
    sel = Selector(response)
    metadata = {
        'brand_id': self.spider_data['brand_id'],
        'url': response.url,
        'tags_mapping': {},
        'color': [],
    }

    # Product model: the numeric id embedded in the URL path.
    model = None
    mt = re.search(r'.+/product/(\d+).*', response.url)
    if mt:
        model = mt.group(1)
        model = self.reformat(model)
    if model:
        metadata['model'] = model
    else:
        # Without a model id the product cannot be keyed — drop the page.
        return

    # Region code from the URL (site uses 'gb'; we normalize to 'uk').
    region = None
    mt = re.search('.+com/(\w+)/.+', response.url)
    if mt:
        region = mt.group(1)
        if region == 'gb':
            region = 'uk'
        region = self.reformat(region)
    if region:
        metadata['region'] = region
    else:
        return

    # Category tags from the breadcrumb trail (top-left of the page).
    type_nodes = sel.xpath('//ul[@class="breadcrumbs"]//li')
    category_index = 0
    for node in type_nodes:
        type_node = node.xpath('./a')
        if not type_node:
            continue
        type_text = type_node.xpath('./text()').extract()[0]
        type_text = self.reformat(type_text)
        type_name = type_text.lower()
        if type_text and type_name:
            category_type = str.format('category-{0}', category_index)
            metadata['tags_mapping'][category_type] = [{
                'name': type_name,
                'title': type_text
            }]
            category_index += 1
            # Breadcrumb text often encodes gender ("Men", "Women", ...).
            gender = common.guess_gender(type_name)
            if gender:
                metadata['gender'] = [gender]

    # Price.
    price_node = sel.xpath('//span[@id="text-price"]//span')
    if price_node:
        price = price_node.xpath('./text()').extract()[0]
        price = self.reformat(price)
        if price:
            metadata['price'] = price

    # Product name.
    name_node = sel.xpath('//h1')
    if name_node:
        name = name_node.xpath('./text()').extract()[0]
        name = self.reformat(name)
        if name:
            metadata['name'] = name

    # Description (first paragraph) and details (everything after the
    # second <h2> within the description container).
    description_node = sel.xpath('//div[@class="description"]')
    if description_node:
        description_text_node = description_node.xpath('.//p[1]')
        if description_text_node:
            description = description_text_node.xpath(
                './text()').extract()[0]
            description = self.reformat(description)
            if description:
                metadata['description'] = description
        detailText_node = description_node.xpath('.//*[preceding::h2[2]]')
        if detailText_node:
            detail = ''.join(detailText_node.xpath('./text()').extract())
            detail = self.reformat(detail)
            if detail:
                metadata['details'] = detail

    # Color swatches: record each color name and fetch its image page.
    color_nodes = sel.xpath('//*[@id="options-articles"]//li')
    for node in color_nodes:
        color_node = node.xpath('.//span')
        if color_node:
            tmp = color_node.xpath('./text()').extract()
            if not tmp:
                continue
            color_text = self.reformat(tmp[0])
            if color_text:
                metadata['color'] += [color_text]
        color_image_node = node.xpath('.//a')
        if color_image_node:
            color_image_href = color_image_node.xpath(
                './@href').extract()[0]
            # NOTE(review): re.sub's signature is (pattern, repl, string) —
            # this replaces the query-string portion of response.url with
            # the extracted href. Confirm the href is really meant to be
            # the replacement text; any backslashes in it would be treated
            # as escape sequences.
            color_image_href = re.sub(ur'\?.+', color_image_href,
                                      response.url)
            m = copy.deepcopy(metadata)
            yield Request(url=color_image_href,
                          callback=self.parse_images,
                          errback=self.onerr,
                          meta={'userdata': m},
                          dont_filter=True)

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; re-crawl sibling color variants, fill in
    metadata via the fetch_* helpers, and yield the ProductItem with
    2000x2000 image URLs derived from the variant thumbnails."""
    metadata = response.meta['userdata']
    sel = Selector(response)

    # Queue a request for every alternate color swatch of this product.
    other_nodes = sel.xpath(
        '//div[@class="product-detail-container"]//ul[@class="swatch-set clearfix"]/li/a[@href][@title]')
    for node in other_nodes:
        m = copy.deepcopy(metadata)
        try:
            href = node.xpath('./@href').extract()[0]
            href = self.process_href(href, response.url)
        except(TypeError, IndexError):
            continue
        yield Request(url=href,
                      callback=self.parse_product,
                      errback=self.onerr,
                      meta={'userdata': m})

    metadata['url'] = response.url

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    # Thumbnails carry a /<w>/<h>/ path segment; rewrite it to request
    # the 2000x2000 rendition of each image.
    image_urls = []
    image_nodes = sel.xpath(
        '//div[@class="product-detail-container"]//ul[@class="variant-thumbnail-set"]/li/a/img[@src]')
    for node in image_nodes:
        try:
            url = node.xpath('./@src').extract()[0]
            url = self.process_href(url, response.url)
            if url:
                image_url = re.sub(ur'/\d+/\d+/', u'/2000/2000/', url)
                if image_url:
                    image_urls += [image_url]
        except(TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; follow alternate color-palette links, fill
    metadata via the fetch_* helpers, and yield a ProductItem whose
    image URLs are built from data-src attributes + the image host."""
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url

    # Queue requests for the other color variants in the palette.
    other_nodes = sel.xpath(
        '//div[@class="attributePanel"]//div[@class="palette"]/a[@href]')
    for node in other_nodes:
        m = copy.deepcopy(metadata)
        try:
            other_href = node.xpath('./@href').extract()[0]
            # Some hrefs contain stray whitespace — strip it all out.
            other_href = re.sub(ur'\s', '', other_href)
            other_href = self.process_href(other_href, response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=other_href,
                      callback=self.parse_product,
                      errback=self.onerr,
                      meta={'userdata': m})

    # Model is mandatory — without it the product cannot be keyed.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    # Slideshow images are lazy-loaded via data-src; prefix each path
    # with the configured image host to get an absolute URL.
    image_urls = []
    image_nodes = sel.xpath(
        '//div[@id="mainPictureBlock"]//div[@id="productSheetSlideshow"]//li[not(@id)]/img[@data-src]'
    )
    for node in image_nodes:
        try:
            image_src = node.xpath('./@data-src').extract()[0]
            # (A previous approach built a _550x550 rendition URL from the
            # image name; it was replaced by the simple host+path join below.)
            image_src = str.format("{0}{1}", self.spider_data['image_host'],
                                   image_src)
            if image_src:
                image_urls += [image_src]
        except (TypeError, IndexError):
            continue

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_other_procut(self, response):
    """Parse a product page and reconstruct all color-variant image URLs.

    NOTE(review): the method name misspells "product"; it is kept as-is
    because external callers reference it by this name.

    Image suffixes like "01_a" are scraped from the raw page source,
    filtered down to the highest-numbered group, and substituted into the
    image-server URLs taken from the color slider thumbnails.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    # The page source contains more "NN_x" image suffixes than we need.
    image_fix_list = re.findall(r'"(\d{2}_[a-z])"', response.body)

    # Keep only the suffix group with the highest two-digit prefix.
    # NOTE(review): the comparison mixes full suffixes ("10_a") against the
    # two-digit prefix ("10"); string comparison makes "10_a" > "10" true,
    # so max_fix converges on the highest prefix, but the logic is fragile —
    # confirm against real page data before touching it.
    max_fix = '0'
    for fix in image_fix_list:
        if fix > max_fix:
            max_fix = fix[:2]

    def func(item):
        # True when the suffix belongs to the selected (max) prefix group.
        mt = re.search(str.format('{0}_[a-z]', max_fix), item)
        if mt:
            return True
        else:
            return False

    image_fix_list = filter(func, image_fix_list)

    # Use the slider thumbnails' src as a template for the image-server
    # URL (it already embeds the product id and color id), swapping in
    # each retained suffix.
    image_urls = None
    try:
        image_node = sel.xpath(
            '//aside[@class="itemSidebar"]//div[@class="colors"]/div[@class="colorSizeContent colorSlider"]/div[@class="colorMask"]//img[@src]')
        if image_node:
            image_urls = [
                re.sub('\d{2}_[a-z]', val, src)
                for val in image_fix_list
                for src in image_node.xpath('./@src').extract()
            ]
    except(TypeError, IndexError):
        pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a single product page.

    Fills metadata via the fetch_* helpers (model is mandatory) and
    yields a ProductItem whose image URLs come from the scroll list's
    data-hdimage attributes.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url

    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        # No model id — the product cannot be keyed; drop the page.
        return

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    # colors = self.fetch_color(response)
    # if colors:
    #     metadata['color'] = colors

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    # NOTE: a large block of commented-out legacy extraction code lived
    # here (model/price from "l-info-title", two-part description join,
    # and an h5-aware details walker). It predates the fetch_* helpers
    # above and was removed for readability; recover it from VCS history
    # if the helpers ever regress.

    # High-resolution images are carried in data-hdimage attributes.
    image_urls = []
    image_nodes = sel.xpath('//div[@id="scroll"]/ul/li[@data-hdimage]')
    for image_node in image_nodes:
        try:
            url = image_node.xpath('./@data-hdimage').extract()[0]
            url = self.reformat(url)
            if url:
                url = self.process_href(url, response.url)
                if url:
                    image_urls += [url]
        except (TypeError, IndexError):
            continue

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; follow swatch variants, fill metadata, and
    yield a ProductItem with scene7-hosted image URLs.

    Also fires a secondary request to the scene7 imageset endpoint so
    parse_image_request can discover additional renditions.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # Queue requests for the other color swatches (href lives in @name).
    other_nodes = sel.xpath(
        '//div[@id="pdpATCDivsubProductDiv"]//ul[@id="swatchesselect"]/li/a[@name]'
    )
    for node in other_nodes:
        m = copy.deepcopy(metadata)
        try:
            href = node.xpath('./@name').extract()[0]
            href = self.process_href(href, response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=href,
                      callback=self.parse_product,
                      errback=self.onerr,
                      meta={'userdata': m})

    metadata['url'] = response.url

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    # BUGFIX: the key was misspelled 'price_disount', so the discount
    # price was never copied into metadata.
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    details = self.fetch_details(response)
    if details:
        metadata['details'] = details

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    # The hidden pdpImgUrl input holds the scene7 asset id; request the
    # imageset listing for extra views and record the scaled main image.
    image_urls = []
    image_node = sel.xpath('//input[@id="pdpImgUrl"][@value]')
    if image_node:
        try:
            image_request_value = image_node.xpath('./@value').extract()[0]
            if image_request_value:
                m = copy.deepcopy(metadata)
                image_request_ref = str.format(
                    'http://s7d5.scene7.com/is/image/ToryBurchLLC/{0}_S?req=imageset',
                    image_request_value)
                yield Request(url=image_request_ref,
                              callback=self.parse_image_request,
                              errback=self.onerr,
                              meta={'userdata': m})
                image_urls += [
                    str.format(
                        'http://s7d5.scene7.com/is/image/ToryBurchLLC/{0}?scl=2',
                        image_request_value)
                ]
        except (TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_sku1(self, response):
    """Parse a Chanel SKU page of the first layout (?sku=NNN URLs).

    Derives region and model from the URL, categories/gender from the
    tracking spans, name from the productName block, description/details
    from the tab container, and yields one ProductItem.
    """
    self.log(str.format('PARSE_SKU1: {0}', response.url), level=log.DEBUG)

    # Map the URL's country segment back to a region key via base_url.
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return

    # The SKU number in the query string is the product model.
    mt = re.search(r'\?sku=(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)

    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {},
        'category': set([])
    }
    sel = Selector(response)

    # Category tags come from the analytics tracking spans; duplicates
    # (case-insensitive) are skipped.
    cat_idx = 0
    cat_list = []
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    if 'category' not in metadata:
        #        metadata['category'] = set([])
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        metadata['tags_mapping'][str.format('category-{0}', cat_idx)] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)

    # Product name: family heading + sub-name, each combining the node's
    # own text with the text of its descendants, joined with " - ".
    temp = sel.xpath('//div[@class="productName"]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        temp = product_name.xpath(
            './h1[@class="family"]/span[@class="familyText"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
        temp = product_name.xpath('./h2[@class="name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
            name = u', '.join([
                unicodify(val.text)
                for val in temp[0]._root.iterdescendants()
                if val.text and val.text.strip()
            ])
            if name:
                name_list.append(name.strip())
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None

    # Description and details: tab headers are matched against the
    # configured header strings; each header's @rel points at the div id
    # that holds the corresponding content.
    temp = sel.xpath('//div[@class="tabHolderFullWidth tabHolder"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        for node in content_node.xpath('./div[@class="tabs"]//a[@rel]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = node._root.attrib['rel']
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = node._root.attrib['rel']
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('./div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)

    # Images: normalize each src against the base URL and de-duplicate.
    # (A legacy host-prefixing approach was removed in favor of
    # cm.norm_url.)
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'],
                        self.spider_data['base_url'])
            for node in sel.xpath(
                '//div[@class="major productImg"]/img[@src]')
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))

    # Sets are not JSON-friendly — convert to lists before emitting.
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
def parse_details(self, response):
    """Parse a product detail page; metadata fields come from fetch_*
    helpers, while image URLs are reconstructed from the og:image URL
    pattern plus an "ALTERNATE" image list embedded in page JavaScript."""
    metadata = response.meta['userdata']
    metadata['url'] = response.url
    sel = Selector(response)

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    # image_urls = sel.xpath('//div[@id="itemContent"]//img/@src').extract()

    # Split the og:image URL into header/tail around its "_<n>_<letter>"
    # segment; these frame the rebuilt image URLs below.
    hdr = None
    tail = None
    img0 = sel.xpath(
        '//meta[@property="og:image" and @content]/@content').extract()
    if img0:
        img0 = img0[0]
        mt = re.search(r'(.+)_\d+_\w(\..+)$', img0)
        if mt:
            hdr = mt.group(1)
            tail = mt.group(2)

    # The alternate-view list is a JSON array inside the inline
    # "jsinit_item" script, located after the "ALTERNATE" marker.
    idx = response.body.find('jsinit_item')
    img_item = None
    if idx != -1:
        tmp = response.body[idx:]
        idx = tmp.find('ALTERNATE')
        if idx != -1:
            try:
                img_item = json.loads(
                    cm.extract_closure(tmp[idx:], r'\[', r'\]')[0])
            except ValueError:
                pass

    # For every alternate entry, generate candidate URLs by bumping the
    # numeric index from its starting value up to 14 (not all will exist;
    # the downloader tolerates misses).
    image_urls = []
    if hdr and tail and img_item:
        for item in img_item:
            mt = re.search(r'(\d+)_\w', item)
            if not mt:
                continue
            start_idx = int(mt.group(1))
            for idx in xrange(start_idx, 15):
                tmp = re.sub(r'\d+_(\w)', str.format(r'{0}_\1', idx), item)
                image_urls.append(str.format('{0}_{1}{2}', hdr, tmp, tail))

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_details_us(self, response):
    """Parse a US product detail page.

    Color variants are discovered from the inline "var productURLs" JS
    map; non-selected colors are re-queued through the region's callback
    table, the selected color is recorded on this item's metadata.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url

    # Discover the other color versions from the inline JS object.
    try:
        idx = response.body.find('var productURLs')
        data = json.loads(cm.extract_closure(response.body[idx:], '\{', '\}')[0].replace("'", '"'))
        for color_key in data:
            tmp = sel.xpath(str.format('//select/option[@value="{0}"]', color_key))
            if not tmp:
                continue
            color_node = tmp[0]
            # Is this the currently selected color? If not, crawl it.
            if not color_node.xpath('@selected'):
                m = copy.deepcopy(metadata)
                tmp = color_node.xpath('text()').extract()
                if tmp:
                    m['color'] = [self.reformat(tmp[0])]
                # Callback index 2 of the region's callback table handles
                # the detail page for that variant.
                yield Request(url=self.process_href(data[color_key], response.url),
                              callback=self.spider_data['callbacks'][metadata['region']][2],
                              errback=self.onerr,
                              meta={'userdata': m})
            else:
                tmp = color_node.xpath('text()').extract()
                if tmp:
                    metadata['color'] = [self.reformat(tmp[0])]
    except ValueError:
        pass

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    # Prefer the @data-url (full image); fall back to @src with a
    # "_zoom" suffix spliced in before the file extension.
    image_urls = []
    for img_node in sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
                              '/li[contains(@id,"productAngle")]//img[@src or @data-url]'):
        tmp = img_node.xpath('@data-url').extract()
        if tmp:
            image_urls.append(self.process_href(tmp[0], response.url))
        else:
            tmp = img_node.xpath('@src').extract()[0]
            a, b = os.path.splitext(tmp)
            image_urls.append(self.process_href(str.format('{0}_zoom{1}', a, b), response.url))
    #image_urls = [self.process_href(val, response.url) for val in
    #              sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
    #                        '/li[contains(@id,"productAngle")]/img[@src and @data-url]/@data-url').extract()]

    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; follow color-swatch variants, fill metadata
    via fetch_* helpers, and yield a ProductItem whose image URLs are
    derived from the Pinterest share image (angles _A1.._A4)."""
    metadata = response.meta['userdata']
    sel = Selector(response)

    # Queue requests for the other color swatches of this product.
    other_product_nodes = sel.xpath(
        '//div[@id="content"]//div[@id="product-content"]//div[@class="product-variations"]/ul/li/div/ul[@class="swatches Color"]/li/a[@href]'
    )
    for node in other_product_nodes:
        m = copy.deepcopy(metadata)
        try:
            href = node.xpath('./@href').extract()[0]
            href = self.process_href(href, response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=href,
                      callback=self.parse_product,
                      errback=self.onerr,
                      meta={'userdata': m})

    metadata['url'] = response.url

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    # Take the Pinterest container image, force the _A1 large-rectangle
    # rendition, then synthesize the other camera angles (_A2.._A4).
    image_urls = []
    origin_image_node = sel.xpath(
        '//div[@id="content"]//div[@id="pdp-pinterest-container"]/img[@src]'
    )
    if origin_image_node:
        try:
            origin_image_url = origin_image_node.xpath(
                './@src').extract()[0]
            origin_image_url = self.process_href(origin_image_url,
                                                 response.url)
            origin_image_url = re.sub(
                ur'\?.*$', ur'_A1?$Demandware%20Large%20Rectangle$',
                origin_image_url)
            if origin_image_url:
                image_urls += [origin_image_url]
                image_urls += [
                    re.sub(ur'_A\d\?', str.format(r'_A{0}?', val),
                           origin_image_url) for val in xrange(2, 5)
                ]
        except (TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product_us(self, response):
    """Parse a US product page.

    Re-crawls sibling color swatches, fills metadata through the
    fetch_* helpers (model is mandatory), collects thumbnail image
    links (falling back to the primary image when there are none),
    and yields the resulting ProductItem.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # Queue a request for every alternate color swatch.
    swatch_anchors = sel.xpath(
        '//div[contains(@class, "product-detail")]//ul[@class="swatches Color"]/li/a[@href]'
    )
    for anchor in swatch_anchors:
        meta_copy = copy.deepcopy(metadata)
        try:
            link = anchor.xpath('./@href').extract()[0]
            link = self.process_href(link, response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=link,
                      callback=self.parse_product_us,
                      errback=self.onerr,
                      meta={'userdata': meta_copy})

    metadata['url'] = response.url

    # A page without a model id cannot be keyed — bail out early.
    model = self.fetch_model(response)
    if not model:
        return
    metadata['model'] = model

    price_info = self.fetch_price(response)
    if 'price' in price_info:
        metadata['price'] = price_info['price']
    if 'price_discount' in price_info:
        metadata['price_discount'] = price_info['price_discount']

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    # Thumbnail gallery links, stripped of any query string.
    image_urls = []
    thumb_anchors = sel.xpath(
        '//div[@id="primary"]//div[@class="product-thumbnails"]/ul/li/a[@href]'
    )
    for anchor in thumb_anchors:
        try:
            link = anchor.xpath('./@href').extract()[0]
            link = self.process_href(link, response.url)
            link = re.sub(r'\?.*', '', link)
            if link:
                image_urls.append(link)
        except (TypeError, IndexError):
            continue

    # No thumbnails: fall back to the primary image, ignoring the
    # site's "noimage" placeholder.
    if not image_urls:
        primary_anchor = sel.xpath('//div[@id="primary-image"]/a[@href]')
        try:
            link = primary_anchor.xpath('./@href').extract()[0]
            link = self.process_href(link, response.url)
            link = re.sub(r'\?.*', '', link)
            if not re.search(r'noimage', link) and link:
                image_urls.append(link)
        except (TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product_ca(self, response):
    """Parse a Canadian product page.

    Follows the thumbnail links (which lead to the product's other
    views), fills metadata through the fetch_* helpers (model is
    mandatory), and yields a ProductItem whose image URLs come from
    the main-info gallery anchors.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # Each thumbnail anchor is re-crawled through this same callback.
    for thumb in sel.xpath('//ul[@id="product_thumbnails"]/li/a[@href]'):
        meta_copy = copy.deepcopy(metadata)
        try:
            link = thumb.xpath('./@href').extract()[0]
            link = self.process_href(link, response.url)
        except (TypeError, IndexError):
            continue
        yield Request(url=link,
                      callback=self.parse_product_ca,
                      errback=self.onerr,
                      meta={'userdata': meta_copy})

    metadata['url'] = response.url

    # A page without a model id cannot be keyed — bail out early.
    model = self.fetch_model(response)
    if not model:
        return
    metadata['model'] = model

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    price_info = self.fetch_price(response)
    if 'price' in price_info:
        metadata['price'] = price_info['price']
    if 'price_discount' in price_info:
        metadata['price_discount'] = price_info['price_discount']

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    # Gallery anchors (those wrapping an <img>) link to the full images.
    image_urls = None
    gallery = sel.xpath(
        '//div[@class="product-main-info"]//div[@class="float-left"]/div/a[child::img[@src]][@href]'
    )
    if gallery:
        try:
            image_urls = [
                self.process_href(href, response.url)
                for href in gallery.xpath('./@href').extract()
            ]
        except (TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page whose price is injected by JavaScript.

    Model/name come from the info table; prices are scraped from the
    "defaultPrice"/"defaultComparePrice" JS variables in the raw body;
    colors from swatch titles; zoom image URLs from @rel attributes.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)
    metadata['url'] = response.url

    # Model: second plain <td> of the second row of the info table.
    model = None
    model_node = sel.xpath(
        '//div[@id="info1"]/div[@class="padding15"]/table//tr[2]/td[not(child::*)][2]'
    )
    if model_node:
        try:
            model = model_node.xpath('./text()').extract()[0]
            model = self.reformat(model)
        except (TypeError, IndexError):
            pass
    if model:
        metadata['model'] = model
    else:
        # No model id — the product cannot be keyed; drop the page.
        return

    name_node = sel.xpath('//div[@id="info1"]//h1')
    if name_node:
        try:
            name = name_node.xpath('./text()').extract()[0]
            name = self.reformat(name)
            if name:
                metadata['name'] = name
        except (TypeError, IndexError):
            pass

    # Prices are loaded by JS, so scrape the JS variables directly.
    # Non-breaking spaces in the values are normalized to plain spaces.
    default_price = None
    default_price_re = re.search(r'defaultPrice: "(.*)"', response.body)
    if default_price_re:
        try:
            default_price = default_price_re.group(1)
            default_price = self.reformat(default_price)
            default_price = re.sub(ur' ', ur' ', default_price)
        except (TypeError, IndexError):
            pass

    # "defaultComparePrice" is the original (pre-discount) price; its
    # absence means the product is not on sale.
    old_price = None
    old_price_re = re.search(r'defaultComparePrice: "(.*)"', response.body)
    if old_price_re:
        try:
            old_price = old_price_re.group(1)
            old_price = self.reformat(old_price)
            old_price = re.sub(ur' ', ur' ', old_price)
        except (TypeError, IndexError):
            pass

    if old_price:
        # Discounted: original price in 'price', current in 'price_discount'.
        metadata['price'] = old_price
        if default_price:
            metadata['price_discount'] = default_price
    elif default_price:
        # Not discounted.
        metadata['price'] = default_price

    # Color swatch titles.
    colors = None
    color_nodes = sel.xpath(
        '//div[@id="tallasdiv"]/div[@class="colors_detail"]/div[@title]')
    if color_nodes:
        try:
            colors = [
                self.reformat(val)
                for val in color_nodes.xpath('./@title').extract()
            ]
        except (TypeError, IndexError):
            pass
    if colors:
        metadata['color'] = colors

    # Zoom image URLs are embedded in @rel attributes in the page source.
    image_urls = None
    image_nodes = sel.xpath('//div[contains(@id, "superzoom_")]/div[@rel]')
    if image_nodes:
        try:
            image_urls = [
                self.process_href(val, response.url)
                for val in image_nodes.xpath('./@rel').extract()
            ]
        except (TypeError, IndexError):
            pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_product(self, response):
    """Parse a product page; metadata comes from the fetch_* helpers and
    image URLs are scraped from every "xlarge:" JSON array embedded in
    the raw page body (which covers all color variants at once).

    Other color pages are deliberately NOT crawled here, because the
    body scan below already finds the images for every color.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # (Legacy commented-out crawl of the color-swatch hrefs removed; it
    # also never yielded its Requests, so it was a no-op.)

    metadata['url'] = response.url

    # Model is mandatory — skip the page if it cannot be extracted.
    model = self.fetch_model(response)
    if model:
        metadata['model'] = model
    else:
        return

    ret = self.fetch_price(response)
    if 'price' in ret:
        metadata['price'] = ret['price']
    if 'price_discount' in ret:
        metadata['price_discount'] = ret['price_discount']

    name = self.fetch_name(response)
    if name:
        metadata['name'] = name

    colors = self.fetch_color(response)
    if colors:
        metadata['color'] = colors

    description = self.fetch_description(response)
    if description:
        metadata['description'] = description

    detail = self.fetch_details(response)
    if detail:
        metadata['details'] = detail

    # Walk the raw body: for each "xlarge:" marker, extract the bracketed
    # JSON array that follows it and pull every "url" value (query string
    # stripped by the regex). extract_closure returns the matched text
    # and the offset to resume from; offset 0 means no further match.
    image_urls = []
    try:
        start = 0
        while 1:
            mt = re.search(r'xlarge:', response.body[start:])
            if mt:
                result = common.extract_closure(response.body[mt.start():],
                                                '\[', '\]')
                content = result[0]
                start = result[2]
                if 0 == start:
                    break
                url_list = re.findall('"url":.*\'(.+)\?.*\'', content)
                for url in url_list:
                    image_urls += [self.process_href(url, response.url)]
            else:
                break
    except (TypeError, IndexError):
        pass

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_sku2(self, response):
    """Parse a Chanel SKU page reached via a /sku/<digits> URL.

    Derives the region by mapping the URL's locale segment back through
    ``spider_data['base_url']``, builds category tags from the analytics
    tracking <span> elements, then scrapes name/price/description/details
    and product images, yielding one ProductItem.
    """
    self.log(str.format('PARSE_SKU2: {0}', response.url), level=log.DEBUG)
    # Map the locale segment of the URL back to a region key.
    mt = re.search(r'chanel\.com/([^/]+)/', response.url)
    region = None
    for a, b in self.spider_data['base_url'].items():
        if b == mt.group(1):
            region = a
            break
    if not region:
        return
    # The trailing SKU number doubles as the model identifier.
    mt = re.search(r'/sku/(\d+)$', response.url)
    if not mt:
        return
    model = mt.group(1)
    metadata = {
        'region': region,
        'brand_id': self.spider_data['brand_id'],
        'model': model,
        'url': response.url,
        'tags_mapping': {}
    }
    sel = Selector(response)
    # Category tags come from the tracking spans; duplicates are skipped
    # case-insensitively via cat_list.
    cat_idx = 0
    cat_list = []
    for node in sel.xpath(
            '//div[contains(@class,"trackingSettings")]/span[@class]'):
        cat = unicodify(node._root.text)
        if not cat:
            continue
        #if node._root.attrib['class'] == 'WT_cg_s':
        #    metadata['category'].add(cat.lower())
        if cat.lower() in cat_list:
            continue
        cat_idx += 1
        cat_list.append(cat.lower())
        cat_name = str.format('category-{0}', cat_idx)
        metadata['tags_mapping'][cat_name] = [{
            'name': cat.lower(),
            'title': cat
        }]
        gender = cm.guess_gender(cat)
        if gender:
            if 'gender' not in metadata:
                metadata['gender'] = set([])
            metadata['gender'].add(gender)
    # The display name is assembled from the title and subtitle headings;
    # the price heading, when present, is captured along the way.
    temp = sel.xpath('//div[contains(@class, "product_detail_container")]')
    name_list = []
    if len(temp) > 0:
        product_name = temp[0]
        temp = product_name.xpath('./h1[@class="product_name"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('./h2[@class="product_subtitle"]')
        if len(temp) > 0:
            name = unicodify(temp[0]._root.text)
            if name:
                name_list.append(name)
        temp = product_name.xpath('.//h3[@class="product_price"]')
        if len(temp) > 0:
            metadata['price'] = unicodify(temp[0]._root.text)
    name = u' - '.join(name_list)
    metadata['name'] = name if name else None
    # Description and details live behind accordion headings; the heading
    # text (matched against spider_data's known headers) tells us which
    # section each anchor's fragment id points at.
    temp = sel.xpath('//div[@class="description_container"]')
    if len(temp) > 0:
        content_node = temp[0]
        content_map = {}
        for node in content_node.xpath(
                './/div[@class="accordion-heading"]/a[@href]'):
            temp = unicodify(node._root.text)
            if temp and temp in self.spider_data['description_hdr']:
                content_map['description'] = re.sub(
                    r'^#', '', node._root.attrib['href'])
            if temp and temp in self.spider_data['details_hdr']:
                content_map['details'] = re.sub(r'^#', '',
                                                node._root.attrib['href'])
        for term in ('description', 'details'):
            if term in content_map:
                temp = content_node.xpath(
                    str.format('.//div[@id="{0}"]', content_map[term]))
                if len(temp) > 0:
                    # Collect the section's own text plus the text of every
                    # descendant element, joined into one string.
                    content_list = []
                    content = unicodify(temp[0]._root.text)
                    if content:
                        content_list.append(content)
                    content_list.extend([
                        unicodify(val.text)
                        for val in temp[0]._root.iterdescendants()
                        if val.text and val.text.strip()
                    ])
                    metadata[term] = u', '.join(content_list)
    # Images: deduplicated, normalised against the spider's base URL.
    image_urls = list(
        set(
            cm.norm_url(node._root.attrib['src'], self.spider_data['base_url'])
            for node in sel.xpath(
                '//section[@class="product_image_container"]/img[@src and @class="product_image"]'
            )
            if node._root.attrib['src'] and node._root.attrib['src'].strip()))
    # Set-valued fields become lists so the item serialises cleanly.
    if 'color' in metadata:
        metadata['color'] = list(metadata['color'])
    if 'gender' in metadata:
        metadata['gender'] = list(metadata['gender'])
    #metadata['category'] = list(metadata['category'])
    if 'model' in metadata:
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
def parse_product(self, response):
    """Parse a DKNY product page into a ProductItem.

    The model number is taken from the product <li>'s id when present,
    falling back to a path segment of the URL -- some pages carry the
    number in neither of the usual slots, hence the two passes.
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # TODO: DKNY switches country via cookies, so this URL does not pin
    # the region; try switching to the target country through the URL.
    metadata['url'] = response.url

    model = None
    product_li = sel.xpath('//li[@class="product"][@id]')
    if product_li:
        try:
            id_text = product_li.xpath('./@id').extract()[0]
            mt = re.search(r'-(\w+)$', id_text)
            if mt:
                model = mt.group(1)
        except (TypeError, IndexError):
            pass
    if not model:
        # Fallback: some pages keep the style number as a URL segment.
        try:
            mt = re.search(r'.+/(\w+)/.+$', response.url)
            if mt:
                model = mt.group(1)
                if model:
                    model = model.upper()
        except (TypeError, IndexError):
            pass
    if not model:
        return
    metadata['model'] = model

    desc_nodes = sel.xpath(
        '//div[contains(@class, "view-product_detail")]//div[@class="product-description"]'
    )
    if desc_nodes:
        try:
            description = '\r'.join(
                self.reformat(t)
                for t in desc_nodes.xpath('.//text()').extract())
            description = self.reformat(description)
            if description:
                metadata['description'] = description
        except (TypeError, IndexError):
            pass

    colors = None
    swatch_imgs = sel.xpath(
        '//div[@class="product-info-container"]//form/ul/li/ul/li/a/img[@alt]'
    )
    if swatch_imgs:
        try:
            colors = [
                self.reformat(t).lower()
                for t in swatch_imgs.xpath('./@alt').extract()
            ]
        except (TypeError, IndexError):
            pass
    if colors:
        metadata['color'] = colors

    image_urls = []
    viewer_imgs = sel.xpath(
        '//div[contains(@class, "view-product_detail")]//div[@class="partial-product_viewer"]/ul/li/a/img[@src]'
    )
    for img_node in viewer_imgs:
        try:
            src = img_node.xpath('./@src').extract()[0]
            src = self.process_href(src, response.url)
            # Replacing the /60/80/ thumbnail segment with /0/0/ yields
            # the full-size rendition.
            src = re.sub(r'/(\d+/\d+)/', '/0/0/', src)
            image_urls += [src]
        except (TypeError, IndexError):
            continue

    # TODO: fetch the other colour variants' images as well -- the site
    # exposes a detailpartial endpoint keyed by model and variant id
    # (see VCS history for the sketched request flow).

    item = ProductItem()
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    if image_urls:
        item['image_urls'] = image_urls
    item['metadata'] = metadata
    yield item
def parse_details(self, response):
    """Parse a single product page under a "series".

    Recursively queues the other colour variants, then yields a
    ProductItem for the variant shown on the current page.

    @param response:
    """
    metadata = response.meta['userdata']
    sel = Selector(response)

    # The reference number is mandatory; give up quietly without one.
    try:
        model = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="reference-number"]/'
            'text()').extract()[0]
    except IndexError:
        return
    if not model:
        return
    metadata['model'] = model

    metadata['url'] = unicodify(response.url)

    if 'name' not in metadata or not metadata['name']:
        fmt = sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//*[@class="format"]'
            '/text()').extract()
        if fmt:
            metadata['name'] = self.reformat(unicodify(fmt[0]))

    # Queue every other colour variant (skip the page we are already on).
    for href in sel.xpath(
            '//div[@id="product-detail"]/div[@class="inner-detail"]//ul[@class="color-list"]'
            '/li/a[@href]/@href').extract():
        if href in response.url:
            continue
        yield Request(url=self.process_href(href, response.url),
                      callback=self.parse_details,
                      errback=self.onerr,
                      meta={'userdata': copy.deepcopy(metadata)})

    try:
        metadata['description'] = self.reformat(
            unicodify(
                sel.xpath('//div[@id="tabs-product-detail-overview"]'
                          '/div[@class="product-detail-tab-content"]'
                          '/p[@class="slide-paragraph"]/text()').extract()[0]))
    except IndexError:
        pass

    # Specification rows: join each row's label/value texts with ": ",
    # one row per line.
    spec_rows = sel.xpath(
        '//div[@id="tabs-product-detail-specification"]/'
        'div[@class="product-detail-tab-content"]//li/span[@class="tooltip" or '
        '@class="title"]/..')
    details = self.reformat(
        unicodify('\r'.join(': '.join(row.xpath('*/text()').extract())
                            for row in spec_rows)))
    if details:
        metadata['details'] = details

    # Gallery images: positioned-product shots plus the plain ones.
    gallery = [
        self.process_href(v, response.url) for v in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/div[contains(@class,"positioned-product")]/img[@src]/@src').
        extract()
    ]
    gallery.extend([
        self.process_href(v, response.url) for v in sel.xpath(
            '//div[@id="product-gallery"]/div[@class="product-gallery-part"]'
            '/img[@src]/@src').extract()
    ])

    item = ProductItem()
    item['image_urls'] = gallery
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    yield item
def parse_details(self, response):
    """Parse a Coach product page.

    The page embeds its product data in a ``var productJSONObject``
    javascript literal; model, name, colours, prices and image names are
    all pulled from that JSON blob. Category tags are recovered from the
    referring category page (re-fetched through parse_cat when it has not
    been seen yet).

    Returns a ProductItem, a Request to resolve the referer's categories
    first, or None when the page cannot be parsed.
    """
    # Determine the region from the domain embedded in the URL.
    region = None
    for tmp in self.spider_data['domains']:
        if self.spider_data['domains'][tmp] in response.url:
            region = tmp
            break
    if not region:
        return

    metadata = {'region': region,
                'brand_id': self.spider_data['brand_id'],
                'tags_mapping': {},
                'url': response.url}

    # Category information comes from the referer; if that category page
    # has not been parsed yet, fetch it first and come back here.
    referer = response.request.headers['Referer']
    if referer not in self.url_cat_dict:
        return Request(url=referer,
                       callback=self.parse_cat,
                       meta={'stash': response,
                             'coach-referer': referer,
                             'callback': self.parse_details},
                       errback=self.onerr,
                       dont_filter=True)
    for tag in self.url_cat_dict[referer]:
        metadata['tags_mapping'][tag['type']] = [{'name': tag['name'],
                                                  'title': tag['title']}]

    # Product data lives in "var productJSONObject = {...}".
    mt = re.search(r'var\s+productJSONObject\s*=', response.body)
    if not mt:
        return
    try:
        data = json.loads(
            cm.extract_closure(response.body[mt.end():], "{", "}")[0]
            .replace(r'\"', '"').replace(r"\'", "'"))
    except (TypeError, IndexError, ValueError):
        return

    if 'style' not in data:
        return
    metadata['model'] = data['style']
    if 'productName' in data:
        metadata['name'] = self.reformat(data['productName'])

    try:
        metadata['color'] = [self.reformat(swatch['color']).lower()
                             for swatch in data['swatchGroup']['swatches']
                             if 'color' in swatch]
    except KeyError:
        pass

    # Prices: taken from the first swatch (listPrice = regular price,
    # unitPrice = discounted price).
    try:
        for swatch in data['swatchGroup']['swatches']:
            if 'listPrice' in swatch:
                metadata['price'] = self.reformat(swatch['listPrice'])
            if 'unitPrice' in swatch:
                metadata['price_discount'] = self.reformat(swatch['unitPrice'])
            break
    except KeyError:
        pass

    # Image links: collect unique image names across all swatches, then
    # expand them into scene7 URLs. FIX: the previous version indexed
    # swatch[subimg] directly, so a single swatch missing an image list
    # raised KeyError and aborted mid-way, shipping bare image NAMES (not
    # URLs) in image_urls. Using .get() keeps the collection going, and
    # names are only expanded to URLs after gathering completes.
    image_urls = []
    image_host = 'http://s7d2.scene7.com/is/image/Coach/{0}{1}'
    try:
        style_for_images = data['styleForImages']
        names = []
        for swatch in data['swatchGroup']['swatches']:
            for subimg in ('aImages', 'nImages', 'mImages'):
                for entry in swatch.get(subimg, []):
                    name = entry.get('imageName')
                    if name and name not in names:
                        names.append(name)
        image_urls = [str.format(image_host, style_for_images, val)
                      for val in names]
    except KeyError:
        pass

    item = ProductItem()
    item['image_urls'] = image_urls
    item['url'] = metadata['url']
    item['model'] = metadata['model']
    item['metadata'] = metadata
    return item
def parse_products(self, response): metadata = response.meta['userdata'] # self.log(unicode.format(u'PROCESSING {0} -> {1} -> {2}: {3}', metadata['extra']['category-0'][0], # metadata['extra']['category-1'][0], metadata['name'], response.url).encode('utf-8'), # log.DEBUG) for k in ('post_token', 'page_id'): if k in metadata: metadata.pop(k) sel = Selector(response) temp = sel.xpath( '//div[@class="product-header"]//span[@class="page-product-title"]' ) if temp: collection = unicodify(temp[0]._root.text) if collection: metadata['tags_mapping']['collection'] = [{ 'name': collection.lower(), 'title': collection }] model = self.fetch_model(response) if model: metadata['model'] = model else: return if 'name' not in metadata or not metadata['name']: name = self.fetch_name(response) if name: metadata['name'] = name description = self.fetch_description(response) if description: metadata['description'] = description detail = self.fetch_details(response) if detail: metadata['details'] = detail ret = self.fetch_price(response) if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] temp = sel.xpath( '//div[@class="column-images"]//a[@href and contains(@class,"zoom-trigger-link")]' ) image_urls = [ self.process_href(val._root.attrib['href'], response.url) for val in temp ] metadata['url'] = response.url item = ProductItem() item['image_urls'] = image_urls item['url'] = metadata['url'] item['model'] = metadata['model'] item['metadata'] = metadata return item
def parse_product(self, response): metadata = response.meta['userdata'] sel = Selector(response) other_nodes = sel.xpath( '//div[@class="product-container"]//div[@class="prod-options"]/div[@class="colors"]/ul/li/a[@href]') for node in other_nodes: m = copy.deepcopy(metadata) try: href = node.xpath('./@href').extract()[0] href = self.process_href(href, response.url) except(TypeError, IndexError): continue yield Request(url=href, callback=self.parse_product, errback=self.onerr, meta={'userdata': m}) metadata['url'] = response.url model = self.fetch_model(response) if model: metadata['model'] = model else: return name = self.fetch_name(response) if name: metadata['name'] = name ret = self.fetch_price(response) if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] colors = self.fetch_color(response) if colors: metadata['color'] = colors description = self.fetch_description(response) if description: metadata['description'] = description detail = self.fetch_details(response) if detail: metadata['details'] = detail image_urls = None image_node = sel.xpath('//link[@rel="image_src"][@href]') if image_node: try: image_href = image_node.xpath('./@href').extract()[0] image_href = re.sub(r'_\d+x\d+\$', '', image_href) image_urls = [image_href] except(TypeError, IndexError): pass item = ProductItem() item['url'] = metadata['url'] item['model'] = metadata['model'] if image_urls: item['image_urls'] = image_urls item['metadata'] = metadata yield item
def parse_product(self, response): metadata = response.meta['userdata'] sel = Selector(response) color_nodes = sel.xpath('//div[@id="product-content-detail"]//div[@class="swatch-Slider"]/ul/li/a[@href]') for node in color_nodes: m = copy.deepcopy(metadata) try: href = node.xpath('./@href').extract()[0] href = self.process_href(href, response.url) except(TypeError, IndexError): continue yield Request(url=href, callback=self.parse_product, errback=self.onerr, meta={'userdata': m}) metadata['url'] = response.url model = self.fetch_model(response) if model: metadata['model'] = model else: return ret = self.fetch_price(response) if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] name = self.fetch_name(response) if name: metadata['name'] = name description = self.fetch_description(response) if description: metadata['description'] = description detail = self.fetch_details(response) if detail: metadata['details'] = detail colors = self.fetch_color(response) if colors: metadata['color'] = colors image_urls = [] try: image_nodes = sel.xpath('//div[@id="pdpMain"]//ul[@class="product-slides-list"]/li/a/img[@src]') for image_node in image_nodes: image_url = image_node.xpath('./@src').extract()[0] image_url = re.sub(r'\?.*', '', image_url) if image_url: image_urls += [image_url] except(TypeError, IndexError): pass item = ProductItem() item['url'] = metadata['url'] item['model'] = metadata['model'] if image_urls: item['image_urls'] = image_urls item['metadata'] = metadata yield item
def parse_product(self, response): metadata = response.meta['userdata'] sel = Selector(response) # 进入不同颜色的单品页,它给了不同的单品号 other_nodes = sel.xpath('//dl[@id="media-tabs"]/dd[2]//a') for other_node in other_nodes: m = copy.deepcopy(metadata) href = other_node.xpath('./@href').extract()[0] href = self.process_href(href, response.url) yield Request(url=href, callback=self.parse_product, errback=self.onerr, meta={'userdata': m}) metadata['url'] = response.url model = self.fetch_model(response) if model: metadata['model'] = model else: return name = self.fetch_name(response) if name: metadata['name'] = name gender = common.guess_gender(name, extra={ 'male': [], 'female': ['lady'] }) if gender: metadata['gender'] = [gender] colors = self.fetch_color(response) if colors: metadata['color'] = colors description = self.fetch_description(response) if description: metadata['description'] = description detail = self.fetch_details(response) if detail: metadata['details'] = detail image_urls = None image_nodes = sel.xpath( '//dl[@id="media-tabs"]/dd/div[@class="more-views"]/ul/li/a[@href]' ) if image_nodes: image_urls = [ self.process_href(val, response.url) for val in image_nodes.xpath('./@href').extract() ] ret = self.fetch_price(response) if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] item = ProductItem() item['url'] = metadata['url'] item['model'] = metadata['model'] if image_urls: item['image_urls'] = image_urls item['metadata'] = metadata yield item
def parse_details(self, response): self.log(unicode.format(u'PARSE_DETAILS: URL={0}', response.url).encode('utf-8'), level=log.DEBUG) metadata = response.meta['userdata'] hxs = Selector(response) # 访问商品的其它颜色版本 ret = hxs.xpath( "//div[contains(@class,'colors')]/ul[contains(@class,'color-set')]" "/li[contains(@class,'color') and not(contains(@class,'color-selected'))]" "/a[@title and @data-color-link]") for node in ret: m = copy.deepcopy(metadata) m['color'] = [ self.reformat(unicodify( node.xpath('@title').extract()[0])).lower() ] url = self.process_href( node.xpath('@data-color-link').extract()[0], response.url) m['url'] = url yield Request(url=url, callback=self.parse_details, errback=self.onerr, meta={'userdata': m}) colors = self.fetch_color(response) if colors: metadata['color'] = colors model = self.fetch_model(response) if model: metadata['model'] = model else: return ret = self.fetch_price(response) if 'price' in ret: metadata['price'] = ret['price'] if 'price_discount' in ret: metadata['price_discount'] = ret['price_discount'] description = self.fetch_description(response) if description: metadata['description'] = description detail = self.fetch_details(response) if detail: metadata['details'] = detail name = self.fetch_name(response) if name: metadata['name'] = name if 'name' in metadata and 'details' in metadata and 'description' in metadata: ret = hxs.xpath( "//div[@class='product_detail_container']/div[@class='product_viewer']" "//ul[@class='product-media-set']/li[@class='product-image']/img[@src]/@src" ).extract() image_urls = [self.process_href(val, response.url) for val in ret] item = ProductItem() item['image_urls'] = image_urls item['url'] = metadata['url'] item['model'] = metadata['model'] item['metadata'] = metadata yield item else: self.log( unicode.format(u'INVALID ITEM: {0}', metadata['url']).encode('utf-8'), log.ERROR)