class RetryCrawler(): '''A class of retry crawl data''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # wait time self.w_time = 1 def getData(self, url, refers='', max_retry=20): page = '' retry = 1 while True: try: page = self.crawler.getData(url, refers) break except Common.InvalidPageException as e: if retry >= max_retry: break retry += 1 print '# Invalid page exception:', e time.sleep(random.uniform(10, 30)) except Common.DenypageException as e: if retry >= max_retry: break retry += 1 print '# Deny page exception:', e time.sleep(random.uniform(10, 30)) except Common.SystemBusyException as e: if retry >= max_retry: break retry += 1 print '# System busy exception:', e time.sleep(random.uniform(10, 30)) except Exception as e: print '# exception err in retry crawler:', e if str(e).find('Read timed out') != -1: if retry >= max_retry: break retry += 1 elif str(e).find('Name or service not known') != -1 or str( e).find('Temporary failure in name resolution'): if retry >= max_retry: break retry += 1 time.sleep(random.uniform(10, 30)) else: break return page
class RetryCrawler(): '''A class of retry crawl data''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # wait time self.w_time = 1 def getData(self, url, refers='', max_retry=20): page = '' retry = 1 while True: try: page = self.crawler.getData(url, refers) break except Common.InvalidPageException as e: if retry >= max_retry: break retry += 1 print '# Invalid page exception:',e time.sleep(random.uniform(10,30)) except Common.DenypageException as e: if retry >= max_retry: break retry += 1 print '# Deny page exception:',e time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: if retry >= max_retry: break retry += 1 print '# System busy exception:',e time.sleep(random.uniform(10,30)) except Exception as e: print '# exception err in retry crawler:',e if str(e).find('Read timed out') != -1: if retry >= max_retry: break retry += 1 elif str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution'): if retry >= max_retry: break retry += 1 time.sleep(random.uniform(10,30)) else: break return page
class McmBag(): '''A class of mcm bag''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # 品牌官网链接 self.home_url = 'http://www.mcmworldwide.com' self.women_url = self.home_url + '/en/women' self.bag_url = self.women_url + '/bags' self.backpack_url = self.women_url + '/backpacks' self.leather_url = self.women_url + '/small-leather-goods' self.refers = None # 抓取商品列表 self.links = [] self.items = [] def bagPage(self): url = self.bug_url + '#start=0&sz=32&srule=New' page = self.crawler.getData(self.bag_url, self.women_url) if not page or page == '': return
class McmBag(): '''A class of mcm bag''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # 品牌官网链接 self.home_url = 'http://www.mcmworldwide.com' self.women_url = self.home_url + '/en/women' self.bag_url = self.women_url + '/bags' self.backpack_url= self.women_url + '/backpacks' self.leather_url = self.women_url + '/small-leather-goods' self.refers = None # 抓取商品列表 self.links = [] self.items = [] def bagPage(self): url = self.bug_url + '#start=0&sz=32&srule=New' page = self.crawler.getData(self.bag_url, self.women_url) if not page or page == '': return
class BagItem(): '''A class of bag''' def __init__(self, home_url, brand_type): # 抓取设置 self.crawler = MyCrawler() self.crawling_time = Common.now() # 当前爬取时间 self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_time)) # 本次爬取日期 self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_time)) # 本次爬取小时 # 品牌官网链接 self.home_url = home_url # 品牌type self.brand_type = brand_type self.serie_title = '' self.item_title = '' self.item_name = '' self.item_price = '' self.item_unit = '' self.item_size = '' self.item_url = '' self.item_img = '' self.item_number = '' def initItem(self, serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number): self.serie_title = serie_title self.item_title = i_title self.item_name = i_name self.item_price = i_price self.item_unit = i_unit self.item_size = i_size self.item_url = i_url self.item_img = i_img self.item_number = i_number def chanelItemPage(self, val): #self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback%s' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_division=FSH&i_project=fsh_v3&i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback&callback=localJsonpPricingCallback%s' self.serie_title, self.item_title, refers, self.item_name, self.item_url = val page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S) if m: self.item_name = re.sub(r'<.+?>', '', m.group(1)).strip() else: m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: self.item_name = m.group(1).split('-')[0].strip() m = re.search(r'<div class="productimage.*?"><img src="(.+?)" alt=".+?"/>', page, flags=re.S) if m: self.item_img = self.home_url + m.group(1) #m = re.search(r'<p class="size info">(.+?)</p>', page, flags=re.S) #if m: self.item_size = m.group(1) p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S) for size in p.finditer(page): if self.item_size != '': self.item_size += '-' + size.group(1) else: self.item_size = size.group(1) #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) #if m: # self.item_number = m.group(1) p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S) for number in p.finditer(page): item_number = number.group(1) if self.item_number != '': self.item_number += '-' + item_number else: self.item_number = item_number refs = item_number.split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url % (ref_price, ref_price) data = self.crawler.getData(p_url, self.item_url) if not data or data == '': return # 抽取json报文 r = re.search(r'\(\[(.+?)\]\)', data, flags=re.S) if r: price, unit = '', '' try: js_data = json.loads(r.group(1)) price, unit = js_data["price"]["amount"], js_data["price"]["currency-symbol"] except Exception as e: m = re.search(r'"amount":"(.+?)"', data, flags=re.S) if m: price = m.group(1) m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: self.item_price += '-' + price else: if price: self.item_price = price if unit: self.item_unit = unit def diorItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S) if m: self.item_title = m.group(1).strip() m = re.search(r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)".*?/>', page, flags=re.S) if m: self.item_img = self.home_url + m.group(1) m = re.search(r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S) if m: s_desc = m.group(1) m = re.search(r'尺寸:(.+?)<br />', s_desc, flags=re.S) if m: self.item_size = m.group(1).strip() else: m = re.search(r'尺寸:(.+?)$', s_desc, flags=re.S) if m: self.item_size = m.group(1).strip() m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.+?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) self.item_number = s_number.split('-')[1].strip() def givenchyItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_number = val def armaniItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search(r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) else: m = re.search(r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S) if m: currency, self.item_price = m.group(1), re.sub(r'<.*>','',m.group(2)) if currency.find("¥") != -1: self.item_unit = "CNY" else: self.item_unit = currency m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S) if m: size_str = re.sub(r'<.*?>','',m.group(1)) self.item_size = re.sub(r'\s+','',size_str) m = re.search(r'<h3 class="articleName"><span>.+?</span><span class="MFC">(.+?)</span></h3>', page, flags=re.S) if m: self.item_number = m.group(1) def bottegavenetaItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img = val # 先选国家 refers_page = self.crawler.getData(refers,'') if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search(r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) else: m = re.search(r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S) if m: self.item_img = m.group(1) self.item_size = '' m = re.search(r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: self.item_number = m.group(1) def louisvuittonItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div class="productName title" id="productName">\s*<h1 itemprop="name">(.+?)</h1>\s*</div>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search(r'<table class="priceButton">\s*<tr>\s*<td class="priceValue price-sheet">(.+?)</td>', page, flags=re.S) if m: self.item_price = m.group(1).replace('¥','').strip() if self.item_price.find("¥") != -1: self.item_unit = "CNY" m = re.search(r'<noscript>\s*<img src="(.+?)".+?itemprop="image".*?/>\s*</noscript', page, flags=re.S) if m: s_img = re.sub(r'\s+','',m.group(1)) self.item_img = s_img.replace('Frontview','Front%20view') m = re.search(r'<div class="textClientInfo exp_content".*?>\s*<div class="innerContent functional-text">(.+?)</div>', page, flags=re.S) if m: s_content = m.group(1).replace(' ','').strip() if s_content.find('宽)') != -1: s_size = s_content.split('宽)')[0] self.item_size = re.sub('<.+?>','',s_size) + "宽)" elif s_content.find('高)') != -1: s_size = s_content.split('高)')[0] self.item_size = re.sub('<.+?>','',s_size) + "高)" else: s_size = ''.join(s_content.split()) m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S) if m: self.item_number = m.group(1).strip() else: m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S) if m: self.item_number = m.group(1).strip() def dolcegabbanaItemPage(self, val): self.serie_title, self.item_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search(r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = re.sub(r'<.+?>','',s_price).replace('¥','').strip() m = re.search(r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S) if m: self.item_size = m.group(1) else: m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S) if m: self.item_size = m.group(1) m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: self.item_number = m.group(1).split(':')[1].strip() def yslItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div id="itemInfo">.+?<h1><span class="customItemDescription" itemprop="name">(.+?)</span></h1>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search(r'<div id="itemPrice".+?><div.*?class="newprice">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() s_price = re.sub(r'<.+?>','',s_price) if s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥','').strip() else: self.item_price = s_price m = re.search(r'<div id="mainImageContainer"><img.+?src="(.+?)".*?/></div>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S) if m: self.item_size = m.group(1) m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S) if m: self.item_number = m.group(1) def bossItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S) if m: self.item_name = ' '.join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥','').replace('¥','').strip() m = re.search(r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S) if m: s_size = m.group(1) self.item_size = s_size.split(':')[1] if self.item_size == '': m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S) if m: self.item_size = re.sub(r'<.+?>','',m.group(1)) m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: self.item_number = m.group(1) def ferragamoItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S) if m: self.item_name = ' '.join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥','').replace('¥','').strip() m = re.search(r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: self.item_size, self.item_number = m.group(1).strip(), m.group(2).strip() def antPage(self, val): if self.brand_type == 'chanel': self.chanelItemPage(val) elif self.brand_type == 'dior': self.diorItemPage(val) elif self.brand_type == 'givenchy': self.givenchyItemPage(val) elif self.brand_type == 'armani': self.armaniItemPage(val) elif self.brand_type == 'bottegaveneta': self.bottegavenetaItemPage(val) elif self.brand_type == 'louisvuitton': self.louisvuittonItemPage(val) elif self.brand_type == 'dolcegabbana': self.dolcegabbanaItemPage(val) elif self.brand_type == 'ysl': self.yslItemPage(val) elif self.brand_type == 'boss': self.bossItemPage(val) elif self.brand_type == 'ferragamo': self.ferragamoItemPage(val) def outItem(self): s = '%s|%s|%s|%s|%s|%s|%s|%s|%s' %(self.serie_title, self.item_title, self.item_name, self.item_price, self.item_unit, self.item_size, self.item_url, self.item_img, self.item_number) return s def outTuple(self): return (Common.time_s(self.crawling_time), self.brand_type, self.serie_title, self.item_title, self.item_name, self.item_price, self.item_unit, self.item_size, self.item_url, self.item_img, self.item_number, self.crawling_beginDate, self.crawling_beginHour)
class BagItem(): '''A class of bag''' def __init__(self, home_url, brand_type): # 抓取设置 self.crawler = MyCrawler() self.crawling_time = Common.now() # 当前爬取时间 self.crawling_beginDate = time.strftime( "%Y-%m-%d", time.localtime(self.crawling_time)) # 本次爬取日期 self.crawling_beginHour = time.strftime( "%H", time.localtime(self.crawling_time)) # 本次爬取小时 # 品牌官网链接 self.home_url = home_url # 品牌type self.brand_type = brand_type self.serie_title = '' self.item_title = '' self.item_name = '' self.item_price = '' self.item_unit = '' self.item_size = '' self.item_url = '' self.item_img = '' self.item_number = '' def initItem(self, serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number): self.serie_title = serie_title self.item_title = i_title self.item_name = i_name self.item_price = i_price self.item_unit = i_unit self.item_size = i_size self.item_url = i_url self.item_img = i_img self.item_number = i_number def chanelItemPage(self, val): #self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback%s' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_division=FSH&i_project=fsh_v3&i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback&callback=localJsonpPricingCallback%s' self.serie_title, self.item_title, refers, self.item_name, self.item_url = val page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search( r'<div class="product-details details".*?>.+?<p class="description info.*?">(.+?)</p>', page, flags=re.S) if m: self.item_name = re.sub(r'<.+?>', '', m.group(1)).strip() else: m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: self.item_name = m.group(1).split('-')[0].strip() m = re.search( r'<div class="productimage.*?"><img src="(.+?)" alt=".+?"/>', page, flags=re.S) if m: self.item_img = self.home_url + m.group(1) #m = re.search(r'<p class="size info">(.+?)</p>', page, flags=re.S) #if m: self.item_size = m.group(1) p = re.compile(r'<p class="size info">(.+?)</p>', flags=re.S) for size in p.finditer(page): if self.item_size != '': self.item_size += '-' + size.group(1) else: self.item_size = size.group(1) #m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) #if m: # self.item_number = m.group(1) p = re.compile(r'<div class="ref info">\s*<p>(.+?)</p>', flags=re.S) for number in p.finditer(page): item_number = number.group(1) if self.item_number != '': self.item_number += '-' + item_number else: self.item_number = item_number refs = item_number.split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url % (ref_price, ref_price) data = self.crawler.getData(p_url, self.item_url) if not data or data == '': return # 抽取json报文 r = re.search(r'\(\[(.+?)\]\)', data, flags=re.S) if r: price, unit = '', '' try: js_data = json.loads(r.group(1)) price, unit = js_data["price"]["amount"], js_data["price"][ "currency-symbol"] except Exception as e: m = re.search(r'"amount":"(.+?)"', data, flags=re.S) if m: price = m.group(1) m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: self.item_price += '-' + price else: if price: self.item_price = price if unit: self.item_unit = unit def diorItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h2 class="" itemprop="name">(.+?)<br />(.+?)</h2>', page, flags=re.S) if m: self.item_title = m.group(1).strip() m = re.search( r'<li class="firstThumbnails">\s+<a href="#" class="active".+?>\s+<img src="(.+?)".*?/>', page, flags=re.S) if m: self.item_img = self.home_url + m.group(1) m = re.search( r'<div class="modText">\s+<h4.+?>说明</h4>\s+<p>(.+?)</p>\s+</div>', page, flags=re.S) if m: s_desc = m.group(1) m = re.search(r'尺寸:(.+?)<br />', s_desc, flags=re.S) if m: self.item_size = m.group(1).strip() else: m = re.search(r'尺寸:(.+?)$', s_desc, flags=re.S) if m: self.item_size = m.group(1).strip() m = re.search( r'<div class="columns-wrapper">.+?<div class="column">.+?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) self.item_number = s_number.split('-')[1].strip() def givenchyItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_number = val def armaniItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h2 class="productName">(.+?)</h2>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search( r'<div id="zoomImageWrapper">\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) else: m = re.search( r'<div id="thumbsWrapper">.+?<div class="thumbElement".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search( r'<span class="currency">(.+?)</span>.*?<span class="priceValue">(.+?)</span>', page, flags=re.S) if m: currency, self.item_price = m.group(1), re.sub( r'<.*>', '', m.group(2)) if currency.find("¥") != -1: self.item_unit = "CNY" else: self.item_unit = currency m = re.search(r'<div class="attributes">(.+?)</div>', page, flags=re.S) if m: size_str = re.sub(r'<.*?>', '', m.group(1)) self.item_size = re.sub(r'\s+', '', size_str) m = re.search( r'<h3 class="articleName"><span>.+?</span><span class="MFC">(.+?)</span></h3>', page, flags=re.S) if m: self.item_number = m.group(1) def bottegavenetaItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img = val # 先选国家 refers_page = self.crawler.getData(refers, '') if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search( r'<h1 class="producTitle".+?>\s*<div class="modelName".+?>\s*<span class="modelName">(.+?)</span>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search( r'<div class="mainImage".+?>\s*<img.+?src="(.+?)".*?/>\s*</div>', page, flags=re.S) if m: self.item_img = m.group(1) else: m = re.search( r'<section id="bgItem">\s*<img.+?src="(.+?)".*?/>\s*</section>', page, flags=re.S) if m: self.item_img = m.group(1) self.item_size = '' m = re.search( r'<div class="localizedAttributes">.*?<div class="height">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="depth">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="length_of_strap">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r'<div class="localizedAttributes">.*?<div class="width">.+?<span class="text">(.+?)</span>\s*<span class="value">(.+?)</span>\s*</div>\s*</div>', page, flags=re.S) if m: self.item_size += m.group(1) + ":" + m.group(2) + ";" m = re.search( r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: self.item_number = m.group(1) def louisvuittonItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search( r'<div class="productName title" id="productName">\s*<h1 itemprop="name">(.+?)</h1>\s*</div>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search( r'<table class="priceButton">\s*<tr>\s*<td class="priceValue price-sheet">(.+?)</td>', page, flags=re.S) if m: self.item_price = m.group(1).replace('¥', '').strip() if self.item_price.find("¥") != -1: self.item_unit = "CNY" m = re.search( r'<noscript>\s*<img src="(.+?)".+?itemprop="image".*?/>\s*</noscript', page, flags=re.S) if m: s_img = re.sub(r'\s+', '', m.group(1)) self.item_img = s_img.replace('Frontview', 'Front%20view') m = re.search( r'<div class="textClientInfo exp_content".*?>\s*<div class="innerContent functional-text">(.+?)</div>', page, flags=re.S) if m: s_content = m.group(1).replace(' ', '').strip() if s_content.find('宽)') != -1: s_size = s_content.split('宽)')[0] self.item_size = re.sub('<.+?>', '', s_size) + "宽)" elif s_content.find('高)') != -1: s_size = s_content.split('高)')[0] self.item_size = re.sub('<.+?>', '', s_size) + "高)" else: s_size = ''.join(s_content.split()) m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S) if m: self.item_number = m.group(1).strip() else: m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S) if m: self.item_number = m.group(1).strip() def dolcegabbanaItemPage(self, val): self.serie_title, self.item_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div id="itemTechSheet">\s*<h1>(.+?)</h1>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search( r'<div id="itemTechSheet">.+?<div class="price">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = re.sub(r'<.+?>', '', s_price).replace('¥', '').strip() m = re.search( r'<div id="itemImagesBox".*?>\s*<img.+?class="mainImage" src="(.+?)">', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺寸.+?)</li>', page, flags=re.S) if m: self.item_size = m.group(1) else: m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S) if m: self.item_size = m.group(1) m = re.search( r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: self.item_number = m.group(1).split(':')[1].strip() def yslItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search( r'<div id="itemInfo">.+?<h1><span class="customItemDescription" itemprop="name">(.+?)</span></h1>', page, flags=re.S) if m: self.item_name = m.group(1).strip() m = re.search( r'<div id="itemPrice".+?><div.*?class="newprice">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() s_price = re.sub(r'<.+?>', '', s_price) if s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥', '').strip() else: self.item_price = s_price m = re.search( r'<div id="mainImageContainer"><img.+?src="(.+?)".*?/></div>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search( r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S) if m: self.item_size = m.group(1) m = re.search( r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S) if m: self.item_number = m.group(1) def bossItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<h1 class="product-name">(.+?)</h1>', page, flags=re.S) if m: self.item_name = ' '.join(m.group(1).strip().split()) m = re.search( r'<div class="product-prices">.+?<dd class="saleprice">(.+?)</dd>.+?</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥', '').replace('¥', '').strip() m = re.search( r'<div class="image">.+?<div class="container".*?>\s*<table.+?>\s*<tr>\s*<td>\s*<a.+?>\s*<img.+?class="thumb".+?big="(.+?)".*?/>\s*</a>\s*</td>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search( r'<div class="tabpage inc".+?>.+?<span.*?>(尺寸大小.+?)</span>', page, flags=re.S) if m: s_size = m.group(1) self.item_size = s_size.split(':')[1] if self.item_size == '': m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S) if m: self.item_size = re.sub(r'<.+?>', '', m.group(1)) m = re.search( r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: self.item_number = m.group(1) def ferragamoItemPage(self, val): self.serie_title, refers, self.item_name, self.item_url, self.item_img, self.item_price, self.item_unit = val if self.item_url == '': return page = self.crawler.getData(self.item_url, refers) if not page or page == '': return m = re.search(r'<div class="product-title">(.+?)</div>', page, flags=re.S) if m: self.item_name = ' '.join(m.group(1).strip().split()) m = re.search(r'<div class="product-prices">(.+?)</div>', page, flags=re.S) if m: s_price = m.group(1).strip() if s_price.find("¥") != -1 or s_price.find("¥") != -1: self.item_unit = "CNY" self.item_price = s_price.replace('¥', '').replace('¥', '').strip() m = re.search( r'<div class="item-list">\s*<ul.+?>\s*<li class="first">.+?<img.+?src="(.+?)".*?/>', page, flags=re.S) if m: self.item_img = m.group(1) m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: self.item_size, self.item_number = m.group(1).strip(), m.group( 2).strip() def antPage(self, val): if self.brand_type == 'chanel': self.chanelItemPage(val) elif self.brand_type == 'dior': self.diorItemPage(val) elif self.brand_type == 'givenchy': self.givenchyItemPage(val) elif self.brand_type == 'armani': self.armaniItemPage(val) elif self.brand_type == 'bottegaveneta': self.bottegavenetaItemPage(val) elif self.brand_type == 'louisvuitton': self.louisvuittonItemPage(val) elif self.brand_type == 'dolcegabbana': self.dolcegabbanaItemPage(val) elif self.brand_type == 'ysl': self.yslItemPage(val) elif self.brand_type == 'boss': self.bossItemPage(val) elif self.brand_type == 'ferragamo': self.ferragamoItemPage(val) def outItem(self): s = '%s|%s|%s|%s|%s|%s|%s|%s|%s' % ( self.serie_title, self.item_title, self.item_name, self.item_price, self.item_unit, self.item_size, self.item_url, self.item_img, self.item_number) return s def outTuple(self): return (Common.time_s(self.crawling_time), self.brand_type, self.serie_title, self.item_title, self.item_name, self.item_price, self.item_unit, self.item_size, self.item_url, self.item_img, self.item_number, self.crawling_beginDate, self.crawling_beginHour)
class ChanelBag(): '''A class of chanel bag''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # 品牌官网链接 self.home_url = 'http://www.chanel.com' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback' self.refers = None # 抓取商品列表 self.links = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return m = re.search(r'"products":\s*(\[.+?\])\s*},\s*"deeplink"', page, flags=re.S) if m: prods = m.group(1) js_items = json.loads(prods) for js_item in js_items: serie_title = js_item["title"] serie_items = js_item["items"] for serie_item in serie_items: if not serie_item.has_key("title"): continue i_title = serie_item["title"] i_url = self.home_url + serie_item["href"] self.links.append((serie_title, i_title, i_url)) print '# bagPage :', serie_title, i_title, i_url def bagItems(self): for link in self.links: self.itemPage(link) def itemPage(self, val): serie_title, i_title, i_url = val page = self.crawler.getData(i_url, self.refers) if not page or page == '': return i_name, i_img, ref_price, i_size, i_price, i_unit = '', '', '', '', '', '' m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: i_name = m.group(1) m = re.search( r'<div class="productimage.*?"><img src="(.+?)" alt=".+?"/>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) m = re.search(r'<p class="size info">(.+?)</p>', page, flags=re.S) if m: i_size = m.group(1) m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) if m: refs = m.group(1).split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url % ref_price data = self.crawler.getData(p_url, i_url) if not data or data == '': return # 抽取json报文 r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S) if r: js_data = json.loads(r.group(1)) i_price = js_data["price"]["amount"] i_unit = js_data["price"]["currency-symbol"] s = '%s|%s|%s|%s|%s|%s|%s|%s' % (serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img) self.items.append(s) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))
class ChanelBag(): '''A class of chanel bag''' def __init__(self): # 抓取设置 self.crawler = MyCrawler() # 品牌官网链接 self.home_url = 'http://www.chanel.com' self.price_url = 'http://ws.chanel.com/pricing/zh_CN/fashion/%s/?i_client_interface=fsh_v3_misc&i_locale=zh_CN&format=json&callback=localJsonpPricingCallback' self.refers = None # 抓取商品列表 self.links = [] self.items = [] def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return m = re.search(r'"products":\s*(\[.+?\])\s*},\s*"deeplink"', page, flags=re.S) if m: prods = m.group(1) js_items= json.loads(prods) for js_item in js_items: serie_title = js_item["title"] serie_items = js_item["items"] for serie_item in serie_items: if not serie_item.has_key("title"): continue i_title = serie_item["title"] i_url = self.home_url + serie_item["href"] self.links.append((serie_title, i_title, i_url)) print '# bagPage :', serie_title, i_title, i_url def bagItems(self): for link in self.links: self.itemPage(link) def itemPage(self, val): serie_title, i_title, i_url = val page = self.crawler.getData(i_url, self.refers) if not page or page == '': return i_name, i_img, ref_price, i_size, i_price, i_unit = '', '', '', '', '', '' m = re.search(r'<title>(.+?)</title>', page, flags=re.S) if m: i_name = m.group(1) m = re.search(r'<div class="productimage.*?"><img src="(.+?)" alt=".+?"/>', page, flags=re.S) if m: i_img = self.home_url + m.group(1) m = re.search(r'<p class="size info">(.+?)</p>', page, flags=re.S) if m: i_size = m.group(1) m = re.search(r'<div class="ref info">\s*<p>(.+?)</p>', page, flags=re.S) if m: refs = m.group(1).split(' ')[:-1] ref_price = ''.join(refs) p_url = self.price_url %ref_price data = self.crawler.getData(p_url, i_url) if not data or data == '': return # 抽取json报文 r = re.search(r'localJsonpPricingCallback\(\[(.+?)\]\)', data, flags=re.S) if r: js_data = json.loads(r.group(1)) i_price = js_data["price"]["amount"] i_unit = js_data["price"]["currency-symbol"] s = '%s|%s|%s|%s|%s|%s|%s|%s' %(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img) self.items.append(s) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items))