def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return serie_title = '包袋' p = re.compile(r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>', flags=re.S) for item in p.finditer(page): tab_name, i_img, i_url, i_name, s_unit, s_price = item.group(1).strip(),item.group(2),item.group(3),item.group(4).strip(),item.group(5),item.group(6) i_unit = "" if s_unit.find("¥") != -1: i_unit = "CNY" i_price = re.sub(r'<.+?>','',s_price).strip() print tab_name, i_img, self.home_url+i_url, i_name, i_unit, i_price if i_url and i_url != '': self.link_list.append((serie_title,tab_name,url,i_name,self.home_url+i_url,i_img,i_price,i_unit)) else: i = BagItem(self.brand_type) i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem()) page_num = 2 ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=¯o=1147µ=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=¯oMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp" a_url = re.sub('page=\d+&', 'page=%d&'%page_num, ajax_url) a_page = self.crawler.getData(a_url, url) result = self.ajax_item(a_page, url) while result: page_num += 1 a_url = re.sub('page=\d+&', 'page=%d&'%page_num, ajax_url) a_page = self.crawler.getData(a_url, url) 
result = self.ajax_item(a_page, url)
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<ul class="tabsList collections">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile(r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append((tab.group(3).strip(),self.home_url+tab.group(2),url+tab.group(1))) for tab in tab_list: tab_name,tab_data_url,tab_url = tab print '# tab:',tab_name,tab_data_url,tab_url tab_page = self.crawler.getData(tab_data_url, url) p = re.compile(r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>', flags=re.S) for item in p.finditer(tab_page): i_url, i_img, i_name = self.home_url+item.group(1), self.home_url+item.group(2), item.group(3) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append((tab_name,tab_url,i_name,i_url,i_img)) else: i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, '', '', '', i_url, self.home_url+i_img) self.items.append(i.outItem())
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == "": return page = self.crawler.getData(i_url, refers) if not page or page == "": return m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S) if m: i_name = " ".join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace("¥", "").replace("¥", "").strip() m = re.search( r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S ) if m: i_img = m.group(1) i_size = "" i_number = "" m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: i_size, i_number = m.group(1).strip(), m.group(2).strip() i = BagItem(self.brand_type) i.initItem(serie_title, "", i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print "# itemPage:", i.outItem()
def itemPage(self, val): serie_title, refers, i_name, i_url = val page = self.crawler.getData(i_url, refers) if not page or page == '': return i_title, i_img, i_size, i_price, i_unit = '', '', '', '', '' m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S) if m: i_title = m.group(1).strip() m = re.search(r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) m = re.search(r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S) if m: i_desc = m.group(1) m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S) if m: i_size = m.group(1).strip() else: m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S) if m: i_size = m.group(1).strip() i_number = '' m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) i_number = s_number.split('-')[1].strip() i = BagItem() i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img
def itemPage(self, val):
    """Scrape one product page and look up its price via the pricing URL.

    ``val`` is the 5-tuple ``(serie_title, i_title, refers, i_name, i_url)``.
    Name, image and sizes come from the product page. For every reference
    found, ``self.price_url`` is formatted with the reference (minus its last
    space-separated token) and the JSONP answer is parsed for amount and
    currency symbol.

    NOTE(review): the item is built but neither stored nor printed, and
    ``i_number`` is never filled (references accumulate on
    ``self.item_number``). This looks like an unfinished refactoring --
    confirm the intent before relying on this method.
    """
    serie_title, i_title, refers, i_name, i_url = val
    html = self.crawler.getData(i_url, refers)
    if not html:
        return

    # The name received in ``val`` is discarded here, as in the original.
    i_name = i_img = ref_price = i_size = i_price = i_unit = i_number = ''

    described = re.search(r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', html, flags=re.S)
    if described:
        i_name = re.sub(r'<.+?>', '', described.group(1)).strip()
    else:
        # Fall back to the part of the <title> before the first dash.
        titled = re.search(r'<title>(.+?)</title>', html, flags=re.S)
        if titled:
            i_name = titled.group(1).split('-')[0].strip()

    picture = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>', html, flags=re.S)
    if picture:
        i_img = self.home_url + picture.group(1)

    for size in re.finditer(r'<p class="size info">(.+?)</p>', html, flags=re.S):
        # NOTE(review): the test reads self.item_size, not the local i_size.
        if self.item_size != '':
            i_size += '-' + size.group(1)
        else:
            i_size = size.group(1)

    for number in re.finditer(r'<div class="ref info">\s*<p>(.+?)</p>', html, flags=re.S):
        item_number = number.group(1)
        if self.item_number != '':
            self.item_number += '-' + item_number
        else:
            self.item_number = item_number

        # Drop the last space-separated token and glue the rest together.
        ref_price = ''.join(item_number.split(' ')[:-1])
        p_url = self.price_url % ref_price
        data = self.crawler.getData(p_url, i_url)
        if not data:
            return

        # Extract the JSON payload from the JSONP callback.
        payload = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S)
        if not payload:
            continue

        price, unit = '', ''
        try:
            js_data = json.loads(payload.group(1))
            price, unit = js_data["price"]["amount"], js_data["price"]["currency-symbol"]
        except Exception:
            # The payload was not usable as JSON; pull the fields with regexes.
            amount = re.search(r'"amount":"(.+?)"', data, flags=re.S)
            if amount:
                price = amount.group(1)
            symbol = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S)
            if symbol:
                unit = symbol.group(1)

        if price:
            # NOTE(review): the test reads self.item_price, not the local i_price.
            if self.item_price != '':
                i_price += '-' + price
            else:
                i_price = price
        if unit:
            i_unit = unit

    i = BagItem(self.brand_type)
    i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search( r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) else: m = re.search( r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) m = re.search( r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S) if m: currency, i_price = m.group(1), re.sub(r'<.*>', '', m.group(2)) if currency.find("¥") != -1: i_unit = "CNY" else: i_unit = currency m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S) if m: size_str = re.sub(r'<.*?>', '', m.group(1)) #i_size = "".join(size_str.split()) i_size = re.sub(r'\s*', '', size_str) print "".join(i_size.split()) i_number = '' m = re.search( r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem() i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem)
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S) if m: i_name = ' '.join(m.group(1).strip().split()) m = re.search( r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥', '').replace('¥', '').strip() m = re.search( r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search( r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S) if m: s_size = m.group(1) i_size = s_size.split(':')[1] if i_size == '': m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S) if m: i_size = re.sub(r'<.+?>', '', m.group(1)) i_number = '' m = re.search( r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem()
def itemPage(self, val): item_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search( r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1: i_unit = "CNY" i_price = re.sub(r'<.+?>', '', s_price).replace('¥', '').strip() m = re.search( r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S) if m: i_size = m.group(1) else: m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S) if m: i_size = m.group(1) i_number = '' m = re.search( r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: i_number = m.group(1).split(':')[1].strip() i = BagItem(self.brand_type) i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem()
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return p = re.compile(r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>', flags=re.S) for item in p.finditer(page): i_url, i_img, i_name = item.group(1),item.group(2),item.group(3) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('',url,i_name,i_url,i_img)) else: i = BagItem(self.brand_type) i.initItem('',url,i_name,i_url,i_img) self.items.append(i.outItem()) p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>',flags=re.S) for item in p.finditer(page): data_info = item.group(1) data_info_str = data_info.replace('"','"') i_url, i_img, i_name = '', '', '' m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S) if m: i_url = m.group(1) m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S) if m: i_name = m.group(1) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('',url,i_name,i_url,i_img)) else: i = BagItem(self.brand_type) i.initItem('',url,i_name,i_url,i_img) self.items.append(i.outItem())
def ajax_item(self, page, refers): if not page or page == '': return False try: result = json.loads(page) if result.has_key("ApiResult"): r_ApiResult = result["ApiResult"] if r_ApiResult.has_key("Items"): for item in r_ApiResult["Items"]: tab_name, i_img, i_url, i_name, i_price = "", "", "", "", "" if item.has_key("MicroCategory"): tab_name = item["MicroCategory"].strip() if item.has_key("DefaultCode10"): item_code10 = item["DefaultCode10"] if item.has_key("ImageTypes"): if "12_f" in item["ImageTypes"]: i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % ( item_code10, "12_f") else: i_img = "http://cdn.yoox.biz/55/%s_%s.jpg" % ( item_code10, max(item["ImageTypes"])) if item.has_key("SingleSelectLink"): i_url = self.home_url + item[ "SingleSelectLink"].strip() if item.has_key("TitleAttribute"): i_name = item["TitleAttribute"].strip() if item.has_key("FullPrice"): i_price = '{0:,}'.format(int(item["FullPrice"])) i_unit = "CNY" print tab_name, i_name, i_url, i_img, i_price, i_unit if i_url and i_url != '': self.link_list.append( (tab_name, refers, i_name, i_url, i_img, i_price, i_unit)) else: i = BagItem(self.brand_type) i.initItem('', tab_name, i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem()) if result.has_key("Page"): r_Page = result["Page"] if r_Page.has_key("CurrentSearchPage") and r_Page.has_key( "TotalPages"): if int(r_Page["CurrentSearchPage"]) < int( r_Page["TotalPages"]): return True return False except Exception as e: print e return False
def itemPage(self, val): serie_title, refers, i_name, i_url = val page = self.crawler.getData(i_url, refers) if not page or page == '': return i_title, i_img, i_size, i_price, i_unit = '', '', '', '', '' m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S) if m: i_title = m.group(1).strip() m = re.search( r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)" alt="" />\s+</a>\s+</li>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) m = re.search( r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S) if m: i_desc = m.group(1) m = re.search(r'尺寸:(.+?)<br />', i_desc, flags=re.S) if m: i_size = m.group(1).strip() else: m = re.search(r'尺寸:(.+?)$', i_desc, flags=re.S) if m: i_size = m.group(1).strip() i_number = '' m = re.search( r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) i_number = s_number.split('-')[1].strip() i = BagItem() i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img
def run_items(self, items_info, tab_name, tab_url): p = re.compile(r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>', flags=re.S) for item in p.finditer(items_info): i_img, i_url, i_name, s_price = item.group(1),item.group(2),item.group(3),item.group(4) i_price, i_unit = '', '' if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').replace('¥','').strip() if i_url and i_url != '': print self.home_url+i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit)) else: print self.home_url+i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list_info = '' m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S) if m: tab_list_info = m.group(1).strip() tab_list = [] p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S) for tab_info in p.finditer(tab_list_info): tab_list.append((tab_info.group(2).strip(),self.home_url+tab_info.group(1))) print tab_info.group(2).strip(),self.home_url+tab_info.group(1) i = 0 for tab in tab_list: refers = url tab_name, tab_url = tab print '# tab:',tab_name, tab_url tab_page = self.crawler.getData(tab_url, refers) m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S) while m: refers = tab_url tab_url = re.sub(r'/to-\d+', '', tab_url) + "/to-%s"%m.group(1) tab_page = self.crawler.getData(tab_url, refers) m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S) p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S) for item in p.finditer(tab_page): i_url, i_name, s_price = item.group(1),item.group(2),item.group(3) print self.home_url+i_url, i_name, s_price i_unit = "" if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': if Common.isBag(i_name): self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_price,i_unit)) else: if Common.isBag(i_name): i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '') self.items.append(i.outItem())
def itemPage(self, val): serie_title, refers, i_name, i_url, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div class="productName title" id="productName">\s*<h1 itemprop="name">(.+?)</h1>\s*</div>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<table class="priceButton">\s*<tr>\s*<td class="priceValue price-sheet">(.+?)</td>', page, flags=re.S) if m: i_price = m.group(1).strip() if i_price.find("¥") != -1: i_unit = "CNY" m = re.search(r'<noscript>\s*<img src="(.+?)".+?itemprop="image".*?/>\s*</noscript', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="textClientInfo exp_content".*?>\s*<div class="innerContent functional-text">(.+?)</div>', page, flags=re.S) if m: s_content = m.group(1).replace(' ','').strip() if s_content.find('宽)') != -1: s_size = s_content.split('宽)')[0] self.item_size = re.sub('<.+?>','',s_size) + "宽)" elif s_content.find('高)') != -1: s_size = s_content.split('高)')[0] self.item_size = re.sub('<.+?>','',s_size) + "高)" else: s_size = ''.join(s_content.split()) i_number m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S) if m: i_number = m.group(1).strip() else: m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S) if m: i_number = m.group(1).strip() i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem()
def run_items(self, items_info, tab_name, tab_url): p = re.compile( r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>', flags=re.S, ) for item in p.finditer(items_info): i_url, i_img, i_name, s_price = item.group(1), item.group(2), item.group(3), item.group(4) i_price, i_unit = "", "" if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace("¥", "").replace("¥", "").strip() if i_url and i_url != "": print self.home_url + i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit)) else: print self.home_url + i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, "", i_name, i_price, i_unit, "", i_url, i_img) self.items.append(i.outItem())
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) else: m = re.search(r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) m = re.search(r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S) if m: currency, i_price = m.group(1), re.sub(r'<.*>','',m.group(2)) if currency.find("¥") != -1: i_unit = "CNY" else: i_unit = currency m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S) if m: size_str = re.sub(r'<.*?>','',m.group(1)) #i_size = "".join(size_str.split()) i_size = re.sub(r'\s*','',size_str) print "".join(i_size.split()) i_number = '' m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem() i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem)
def ajax_item(self, page, refers): if not page or page == '': return False try: result = json.loads(page) if result.has_key("ApiResult"): r_ApiResult = result["ApiResult"] if r_ApiResult.has_key("Items"): for item in r_ApiResult["Items"]: tab_name, i_img, i_url, i_name, i_price = "", "", "", "", "" if item.has_key("MicroCategory"): tab_name = item["MicroCategory"].strip() if item.has_key("DefaultCode10"): item_code10 = item["DefaultCode10"] if item.has_key("ImageTypes"): if "12_f" in item["ImageTypes"]: i_img = "http://cdn.yoox.biz/55/%s_%s.jpg"%(item_code10,"12_f") else: i_img = "http://cdn.yoox.biz/55/%s_%s.jpg"%(item_code10,max(item["ImageTypes"])) if item.has_key("SingleSelectLink"): i_url = self.home_url + item["SingleSelectLink"].strip() if item.has_key("TitleAttribute"): i_name = item["TitleAttribute"].strip() if item.has_key("FullPrice"): i_price = '{0:,}'.format(int(item["FullPrice"])) i_unit = "CNY" print tab_name,i_name,i_url,i_img,i_price,i_unit if i_url and i_url != '': self.link_list.append((tab_name,refers,i_name,i_url,i_img,i_price,i_unit)) else: i = BagItem(self.brand_type) i.initItem('', tab_name, i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem()) if result.has_key("Page"): r_Page = result["Page"] if r_Page.has_key("CurrentSearchPage") and r_Page.has_key("TotalPages"): if int(r_Page["CurrentSearchPage"]) < int(r_Page["TotalPages"]): return True return False except Exception as e: print e return False
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) else: m = re.search(r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" i_number m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem()
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S) if m: i_name = ' '.join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥', '').replace('¥', '').strip() m = re.search( r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' i_number = '' m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: i_size, i_number = m.group(1).strip(), m.group(2).strip() i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem()
def crawl(self): while True: _data = None try: try: # 取队列消息 _data = self.get_q() except Empty as e: # 队列为空,退出 #print '# queue is empty', e break _val = _data[1] item = BagItem(self.home_url, self.brand_type) item.antPage(_val) self.push_back(self.items, item.outItem()) sql = item.outTuple() self.mysqlAccess.insert_item(sql) # 延时 time.sleep(0.1) # 通知queue, task结束 self.queue.task_done() except Exception as e: print 'Unknown exception crawl item :', e Common.traceback_log() self.crawlRetry(_data) # 通知queue, task结束 self.queue.task_done() time.sleep(5)
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<ul class="tabsList collections">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile( r'<li class=".+?">\s+<a href="(.+?)" data-magento_call_page="(.+?)".+?>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append( (tab.group(3).strip(), self.home_url + tab.group(2), url + tab.group(1))) for tab in tab_list: tab_name, tab_data_url, tab_url = tab print '# tab:', tab_name, tab_data_url, tab_url tab_page = self.crawler.getData(tab_data_url, url) p = re.compile( r'<li class="li-product.+?>\s+<a href="(.*?)" class="linkProduct">.+?<img src="(.+?)".+?/>.+?<span class="description".+?>.+?<span class="title">(.+?)</span>.+?</span>\s+</a>\s+</li>', flags=re.S) for item in p.finditer(tab_page): i_url, i_img, i_name = self.home_url + item.group( 1), self.home_url + item.group(2), item.group(3) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append( (tab_name, tab_url, i_name, i_url, i_img)) else: i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, '', '', '', i_url, self.home_url + i_img) self.items.append(i.outItem())
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S) if m: i_name = ' '.join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').replace('¥','').strip() m = re.search(r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S) if m: s_size = m.group(1) i_size = s_size.split(':')[1] if i_size == '': m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S) if m: i_size = re.sub(r'<.+?>','',m.group(1)) i_number = '' m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem()
def itemPage(self, val): item_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1: i_unit = "CNY" i_price = re.sub(r'<.+?>','',s_price).replace('¥','').strip() m = re.search(r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S) if m: i_size = m.group(1) else: m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S) if m: i_size = m.group(1) i_number = '' m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: i_number = m.group(1).split(':')[1].strip() i = BagItem(self.brand_type) i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem()
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return p = re.compile( r'<div id="slot_\d+".+?<a.+?href="(.+?)".+?>\s*<img.+?src="(.+?)".+?/>\s*<div class="iteminfo">\s*<div class="headgroup">\s*<div class="extra">\s*<h1 class="modelname">(.+?)</h1>', flags=re.S) for item in p.finditer(page): i_url, i_img, i_name = item.group(1), item.group(2), item.group(3) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('', url, i_name, i_url, i_img)) else: i = BagItem(self.brand_type) i.initItem('', url, i_name, i_url, i_img) self.items.append(i.outItem()) p = re.compile(r'<div class="slot lazySlot".+?data-slot="(.+?)".+?>', flags=re.S) for item in p.finditer(page): data_info = item.group(1) data_info_str = data_info.replace('"', '"') i_url, i_img, i_name = '', '', '' m = re.search(r'"Link":"(.+?)"', data_info_str, flags=re.S) if m: i_url = m.group(1) m = re.search(r'"ModelName":"(.+?)",', data_info_str, flags=re.S) if m: i_name = m.group(1) print i_url, i_img, i_name if i_url and i_url != '': self.link_list.append(('', url, i_name, i_url, i_img)) else: i = BagItem(self.brand_type) i.initItem('', url, i_name, i_url, i_img) self.items.append(i.outItem())
def run_items(self, items_info, tab_name, tab_url): p = re.compile( r'<div class="large.+?columns.+?">\s*<a href="(.+?)">\s*<img.*?src="(.+?)".*?/><span class="prodcaption">(.+?)</br>(.+?)</span>\s*</a>', flags=re.S) for item in p.finditer(items_info): i_url, i_img, i_name, s_price = item.group(1), item.group( 2), item.group(3), item.group(4) i_price, i_unit = '', '' if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥', '').replace('¥', '').strip() if i_url and i_url != '': print self.home_url + i_url, i_img, i_name, i_price, i_unit self.link_list.append( (tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit)) else: print self.home_url + i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def run_items(self, items_info, tab_name, tab_url): p = re.compile( r'<li class="productlist-item ">\s*<div class="product-image".+?>\s*<a.+?><img src="(.+?)".+?/>\s*</a>\s*</div>.+?<div class="product-title">\s*<a href="(.+?)".+?>(.+?)</a>\s*</div>.+?<p>\s*<span class="product-price">(.+?)</span>\s*</p>\s*</li>', flags=re.S) for item in p.finditer(items_info): i_img, i_url, i_name, s_price = item.group(1), item.group( 2), item.group(3), item.group(4) i_price, i_unit = '', '' if s_price.find("¥") != -1 or s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥', '').replace('¥', '').strip() if i_url and i_url != '': print self.home_url + i_url, i_img, i_name, i_price, i_unit self.link_list.append( (tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit)) else: print self.home_url + i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def run_items(self, items_info, tab_name, tab_url): p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S) for item in p.finditer(items_info): i_img, i_url, s_name, price_info = item.group(1),item.group(2),item.group(3),item.group(4) i_name = re.sub(r'<.+?>','',s_name) i_price, i_unit = '', '' m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S) if m: s_price = re.sub(r'<.+?>','',m.group(1)) if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit)) else: if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return serie_title = '包袋' p = re.compile( r'<li data-position="\d+\s*" class="product isAvailable".+?data-category="(.+?)".+?>\s*<div class="prodContent"><div class="imagesContainer".+?>.+?<img.+?data-original="(.+?)".+?>.+?</div>\s*<div class="\s*productDescription\s*">\s*<a href="(.+?)".+?><h2.+?>(.+?)</h2>\s*</a>\s*<div class="price">.+?<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>.+?</li>', flags=re.S) for item in p.finditer(page): tab_name, i_img, i_url, i_name, s_unit, s_price = item.group( 1).strip(), item.group(2), item.group(3), item.group( 4).strip(), item.group(5), item.group(6) i_unit = "" if s_unit.find("¥") != -1: i_unit = "CNY" i_price = re.sub(r'<.+?>', '', s_price).strip() print tab_name, i_img, self.home_url + i_url, i_name, i_unit, i_price if i_url and i_url != '': self.link_list.append( (serie_title, tab_name, url, i_name, self.home_url + i_url, i_img, i_price, i_unit)) else: i = BagItem(self.brand_type) i.initItem(serie_title, tab_name, i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem()) page_num = 2 ajax_url = "http://www.dolcegabbana.com.cn/yeti/api/DOLCEEGABBANA_CN/searchIndented.json?page=2&sortRule=PriorityDescending&format=full&authorlocalized=¯o=1147µ=&color=&look=&size=&gender=D&season=P%2CE&department=&brand=&heel=&heeltype=&wedge=&washtype=&washcode=&colortype=&fabric=&waist=&family=&structure=&environment=&author=&textSearch=&minPrice=&maxPrice=&occasion=&salesline=&prints=&stone=&material=&agerange=&productsPerPage=20&gallery=¯oMarchio=&modelnames=&GroupBy=&style=&site=DOLCEEGABBANA&baseurl=http://www.dolcegabbana.com.cn/searchresult.asp" a_url = re.sub('page=\d+&', 'page=%d&' % page_num, ajax_url) a_page = self.crawler.getData(a_url, url) result = self.ajax_item(a_page, url) while result: page_num += 1 a_url = re.sub('page=\d+&', 'page=%d&' % page_num, ajax_url) a_page = 
self.crawler.getData(a_url, url) result = self.ajax_item(a_page, url)
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img, i_price, i_unit = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search(r'<div id="itemInfo">.+?<h1><span class="customItemDescription" itemprop="name">(.+?)</span></h1>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search(r'<div id="itemPrice".+?><div.*?class="newprice">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() s_price = re.sub(r'<.+?>','',s_price) if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() else: i_price = s_price m = re.search(r'<div id="mainImageContainer"><img.+?src="(.+?)".*?/></div>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S) if m: i_size = m.group(1) i_number m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem()
def bagPage(self): tab_list = [ ("giorgio armani","http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"), ("emporio armani","http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B"), ("armani jeans","http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B")] for tab in tab_list: tab_name,tab_data_url = tab print '# tab:',tab_name,tab_data_url tab_page = self.crawler.getData(tab_data_url, self.home_url) p = re.compile(r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>', flags=re.S) for item in p.finditer(tab_page): i_url, i_img, i_name, s_price = self.home_url+item.group(1), item.group(2), item.group(4).strip(), item.group(5) print i_url, i_img, i_name, s_price i_unit = "" if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': self.link_list.append((tab_name,tab_data_url,i_name,i_url,i_img,i_price,i_unit)) else: i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def bagPage(self): tab_list = [ ("giorgio armani", "http://www.armani.cn/cn/giorgioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B" ), ("emporio armani", "http://www.armani.cn/cn/emporioarmani/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B" ), ("armani jeans", "http://www.armani.cn/cn/armanijeans/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B" ) ] for tab in tab_list: tab_name, tab_data_url = tab print '# tab:', tab_name, tab_data_url tab_page = self.crawler.getData(tab_data_url, self.home_url) p = re.compile( r'<div class="item hproduct".+?>.+?<a href="(.+?)".+?class="url">\s*<div class="hproductPhotoCont">\s*<img.+?(src|data-original)="(.+?)".*?/>\s*</div>\s*</a>\s*<div class="itemDesc">\s*<a.+?>\s*<h3.+?>(.+?)</h3>\s*</a>.+?<div class="itemPrice">.+?<span class="prezzoProdottoSaldo".*?>(.+?)</span>\s*</div>.+?</div>', flags=re.S) for item in p.finditer(tab_page): i_url, i_img, i_name, s_price = self.home_url + item.group( 1), item.group(2), item.group(4).strip(), item.group(5) print i_url, i_img, i_name, s_price i_unit = "" if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥', '').strip() if i_url and i_url != '': self.link_list.append((tab_name, tab_data_url, i_name, i_url, i_img, i_price, i_unit)) else: i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def itemPage(self, val): serie_title, refers, i_name, i_url, i_img = val if i_url == '': return page = self.crawler.getData(i_url, refers) if not page or page == '': return m = re.search( r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S) if m: i_name = m.group(1).strip() m = re.search( r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: i_img = m.group(1) else: m = re.search( r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S) if m: i_img = m.group(1) i_size = '' m = re.search( r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>', page, flags=re.S) if m: i_size += m.group(1) + ":" + m.group(2) + ";" i_number m = re.search( r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem()
def itemPage(self, val): serie_title, i_title, refers, i_name, i_url = val page = self.crawler.getData(i_url, refers) if not page or page == '': return i_name, i_img, ref_price, i_size, i_price, i_unit, i_number = '', '', '', '', '', '', '' m = re.search( r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S) if m: i_name = re.sub(r'<.+?>', '', m.group(1)).strip() else: m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: i_name = m.group(1).split('-')[0].strip() m = re.search(r'<div class="productimage.*?"><img src="(.+?)".*?/>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S) for size in p.finditer(page): if self.item_size != '': i_size += '-' + size.group(1) else: i_size = size.group(1) #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) #if m: p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S) for number in p.finditer(page): item_number = number.group(1) if self.item_number != '': self.item_number += '-' + item_number else: self.item_number = item_number refs = item_number.split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url % ref_price data = self.crawler.getData(p_url, i_url) if not data or data == '': return # 抽取json报文 r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S) if r: price, unit = '', '' try: js_data = json.loads(r.group(1)) price, unit = js_data["price"]["amount"], js_data["price"][ "currency-symbol"] except Exception as e: m = re.search(r'"amount":"(.+?)"', data, flags=re.S) if m: price = m.group(1) m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: i_price += '-' + price else: if price: i_price = price if unit: i_unit = unit i = BagItem(self.brand_type) i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number)