def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list_info = '' m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S) if m: tab_list_info = m.group(1).strip() tab_list = [] p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S) for tab_info in p.finditer(tab_list_info): tab_list.append((tab_info.group(2).strip(),self.home_url+tab_info.group(1))) print tab_info.group(2).strip(),self.home_url+tab_info.group(1) i = 0 for tab in tab_list: refers = url tab_name, tab_url = tab print '# tab:',tab_name, tab_url tab_page = self.crawler.getData(tab_url, refers) m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S) while m: refers = tab_url tab_url = re.sub(r'/to-\d+', '', tab_url) + "/to-%s"%m.group(1) tab_page = self.crawler.getData(tab_url, refers) m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">',tab_page,flags=re.S) p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S) for item in p.finditer(tab_page): i_url, i_name, s_price = item.group(1),item.group(2),item.group(3) print self.home_url+i_url, i_name, s_price i_unit = "" if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': if Common.isBag(i_name): self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_price,i_unit)) else: if Common.isBag(i_name): i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '') self.items.append(i.outItem())
def run_items(self, items_info, tab_name, tab_url): p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S) for item in p.finditer(items_info): i_img, i_url, s_name, price_info = item.group(1),item.group(2),item.group(3),item.group(4) i_name = re.sub(r'<.+?>','',s_name) i_price, i_unit = '', '' m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S) if m: s_price = re.sub(r'<.+?>','',m.group(1)) if s_price.find("¥") != -1: i_unit = "CNY" i_price = s_price.replace('¥','').strip() if i_url and i_url != '': if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit self.link_list.append((tab_name,tab_url,i_name,self.home_url+i_url,i_img,i_price,i_unit)) else: if Common.isBag(i_name) or Common.isBag(unquote(i_url)): print self.home_url+i_url, i_img, i_name, i_price, i_unit i = BagItem(self.brand_type) i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img) self.items.append(i.outItem())
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile(r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append((tab.group(2)+tab.group(3).strip(),tab.group(1))) for tab in tab_list: tab_name,tab_url = tab print '# tab:',tab_name,tab_url tab_page = self.crawler.getData(tab_url, url) m = re.search(r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S) if m: ajax_url = self.home_url + m.group(1) + "?ajax=true&fragment=true" ajax_data = self.crawler.getData(ajax_url, tab_url) if ajax_data: #data = json.loads(ajax_data) #if data and data.has_key("html"): # print data["html"].decode("unicode-escape") r_data = ajax_data.decode("unicode-escape") if r_data: m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S) if m: data_html = m.group(1).replace("\/","/") #print data_html #break p = re.compile(r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S) for item in p.finditer(data_html): i_url, i_img, s_number, i_name = self.home_url+item.group(3), item.group(4), item.group(2), re.sub(r'<.+?>','',item.group(1)).strip() i_number = '' m = re.search(r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S) if m: i_number = m.group(1) print i_url, i_img, i_name, i_number if Common.isBag(i_name): self.link_list.append((tab_name, tab_url, i_name, i_url, i_img, i_number))
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile( r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append( (tab.group(2) + tab.group(3).strip(), tab.group(1))) for tab in tab_list: tab_name, tab_url = tab print '# tab:', tab_name, tab_url tab_page = self.crawler.getData(tab_url, url) m = re.search( r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S) if m: ajax_url = self.home_url + m.group( 1) + "?ajax=true&fragment=true" ajax_data = self.crawler.getData(ajax_url, tab_url) if ajax_data: #data = json.loads(ajax_data) #if data and data.has_key("html"): # print data["html"].decode("unicode-escape") r_data = ajax_data.decode("unicode-escape") if r_data: m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S) if m: data_html = m.group(1).replace("\/", "/") #print data_html #break p = re.compile( r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S) for item in p.finditer(data_html): i_url, i_img, s_number, i_name = self.home_url + item.group( 3), item.group(4), item.group(2), re.sub( r'<.+?>', '', item.group(1)).strip() i_number = '' m = re.search( r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S) if m: i_number = m.group(1) print i_url, i_img, i_name, i_number if Common.isBag(i_name): self.link_list.append( (tab_name, tab_url, i_name, i_url, i_img, i_number))