def crawl(self):
    """Worker loop: pop queued link tasks and crawl one bag item per task.

    Runs until the queue is empty; on an unexpected error the task is
    re-queued via crawlRetry and the worker backs off for 5 seconds.
    """
    while True:
        _data = None
        try:
            try:
                # Pop the next task message from the queue.
                _data = self.get_q()
            except Empty as e:
                # Queue drained -> this worker exits.
                #print '# queue is empty', e
                break
            # _data appears to be a (priority, payload) pair — TODO confirm
            _val = _data[1]
            item = BagItem(self.home_url, self.brand_type)
            item.antPage(_val)
            self.push_back(self.items, item.outItem())
            sql = item.outTuple()
            self.mysqlAccess.insert_item(sql)
            # Throttle between items.
            time.sleep(0.1)
            # Tell the queue this task is finished.
            self.queue.task_done()
        except Exception as e:
            print 'Unknown exception crawl item :', e
            Common.traceback_log()
            # Re-queue the failed task for another attempt.
            self.crawlRetry(_data)
            # Mark the task done so queue.join() accounting stays balanced.
            self.queue.task_done()
            time.sleep(5)
def checkPage(self, url, data):
    """Raise a typed exception when *data* is a known site error page."""
    # Case 1: the site served its 403 deny page.
    if re.search(r'<TITLE>403拒绝访问</TITLE>', data):
        raise Common.DenypageException("# Deny page: 403拒绝访问错误, url=%s" % url)
    # Case 2: the requested page no longer exists.
    if re.search(r'<div class=".+?">很抱歉,您查看的页面找不到了!</div>', data):
        raise Common.NoPageException("# No page: 很抱歉,您查看的页面找不到了!, url=%s" % url)
def initItem(self):
    """Reset all per-item crawl state before parsing a new item."""
    # Crawl bookkeeping
    self.crawling_time = Common.now()
    self.crawling_begintime = ''  # begin time of this crawl run
    self.crawling_beginDate = ''  # begin date of this crawl run
    self.crawling_beginHour = ''  # begin hour of this crawl run
    # Item attributes
    self.item_id = ''          # item ID
    self.item_name = ''        # item name
    self.item_price = ''       # item price
    self.item_url = ''         # item URL
    self.item_spuId = ''       # SPU ID
    self.item_sellCount = 0    # monthly sales count
    self.brand_name = ''
    self.brand_id = ''
    self.category_id = ''
    # Item pages
    self.item_page = None      # item front page
    # item html urls
    self.item_urls = []        # list of item page URLs
    # item html pages
    #self.item_pages = [] # previous list form, now a dict
    self.item_pages = {}       # fetched item pages
    # Deal (transaction) records
    self.deal_url = ''
    self.deal_stopCrawl = False
    self.deal_deadLine = 0.0   # latest deal time seen in the previous crawl
    self.deal_deadLine2 = 0.0  # earliest deal time seen in this crawl
def outItemSql(self):
    # Row tuple for the item-info SQL insert; the 16-value order matches
    # insert_parser_item_info's column list (crawl_time ... c_beginhour).
    return (Common.time_s(self.crawling_time), self.item_id, self.item_name,
            self.item_price, self.item_sellCount, self.item_url,
            self.seller_id, self.seller_name, self.shop_id, self.shop_name,
            self.shop_url, self.brand_id, self.brand_name, self.category_id,
            self.crawling_beginDate, self.crawling_beginHour)
def __init__(self, home_url, brand_type):
    """Initialize per-item crawl state for one brand-site product."""
    # Crawl settings
    self.crawler = MyCrawler()
    self.crawling_time = Common.now()  # current crawl timestamp
    self.crawling_beginDate = time.strftime(
        "%Y-%m-%d", time.localtime(self.crawling_time))  # crawl date
    self.crawling_beginHour = time.strftime(
        "%H", time.localtime(self.crawling_time))  # crawl hour
    # Brand official-site home URL
    self.home_url = home_url
    # Brand type tag
    self.brand_type = brand_type
    # Parsed item fields, filled in later
    self.serie_title = ''
    self.item_title = ''
    self.item_name = ''
    self.item_price = ''
    self.item_unit = ''
    self.item_size = ''
    self.item_url = ''
    self.item_img = ''
    self.item_number = ''
def bagPage(self, url):
    """Walk the women's handbag mega-menu tabs and collect bag items/links."""
    page = self.crawler.getData(url, self.home_url)
    if not page or page == '':
        return
    tab_list_info = ''
    # Level-3 tab list under the women's "手袋" (handbags) mega-menu.
    m = re.search(r'<li class="mega-menu-item">\s*<div.+?data-megaMenu="women".*?>\s*<span class="onlyML">手袋</span>.+?</div>\s*<div class="mega-menu-content">.+?<ul class="level3">(.+?)</ul>', page, flags=re.S)
    if m:
        tab_list_info = m.group(1).strip()
    tab_list = []
    p = re.compile(r'<li class="mm-block">.+?<a.+?href="(.+?)">\s*<p class="mm-push-p">(.+?)</p>\s*</a>.+?</li>', flags=re.S)
    for tab_info in p.finditer(tab_list_info):
        tab_list.append((tab_info.group(2).strip(), self.home_url + tab_info.group(1)))
        print tab_info.group(2).strip(), self.home_url + tab_info.group(1)
    # NOTE(review): i is never incremented as a counter and is rebound to a
    # BagItem further down — looks like dead initialization.
    i = 0
    for tab in tab_list:
        refers = url
        tab_name, tab_url = tab
        print '# tab:', tab_name, tab_url
        tab_page = self.crawler.getData(tab_url, refers)
        m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', tab_page, flags=re.S)
        while m:
            # Follow the "next page" button until it disappears; the page
            # index is carried in a trailing /to-<n> path segment.
            refers = tab_url
            tab_url = re.sub(r'/to-\d+', '', tab_url) + "/to-%s" % m.group(1)
            tab_page = self.crawler.getData(tab_url, refers)
            m = re.search(r'<button.+?class="pl-next".+?data-next="(.+?)">', tab_page, flags=re.S)
        # NOTE(review): only the final tab_page after pagination is parsed —
        # confirm the last page aggregates all products of the tab.
        p = re.compile(r'<a id="sku_.*?" href="(.+?)".+?>.+?<div class="description">\s*<div class="productName toMinimize">(.+?)</div>\s*<div class="productPrice.+?data-htmlContent="(.+?)">\s*</div>\s*</div>', flags=re.S)
        for item in p.finditer(tab_page):
            i_url, i_name, s_price = item.group(1), item.group(2), item.group(3)
            print self.home_url + i_url, i_name, s_price
            i_unit = ""
            if s_price.find("¥") != -1:
                i_unit = "CNY"
                # NOTE(review): i_price is only bound when the price contains
                # ¥ — a non-CNY price would raise NameError below.
                i_price = s_price.replace('¥', '').strip()
            if i_url and i_url != '':
                # Item has its own URL: queue it for detail crawling.
                if Common.isBag(i_name):
                    self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_price, i_unit))
            else:
                # No detail URL: record the listing data directly.
                if Common.isBag(i_name):
                    i = BagItem(self.brand_type)
                    i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, '')
                    self.items.append(i.outItem())
def TMItem(self): if self.item_url != '': page = self.crawler.getData(self.item_url, self.refers) if not page or page == '': raise Common.InvalidPageException("# TMItem: not find item page,itemid:%s,item_url:%s"%(str(self.item_id), self.item_url)) m = re.search(r'sellerId:"(\d+)",', page, flags=re.S) if m: self.seller_id = m.group(1) m = re.search(r'shopId:"(\d+)",', page, flags=re.S) if m: self.shop_id = m.group(1) m = re.search(r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S) if m: self.shop_url, self.shop_name = Common.fix_url(m.group(1)), m.group(2).strip() m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S) if m: TShop_s = m.group(1).strip() m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S) if m: self.brand_name = Common.htmlDecode(m.group(1).strip()) m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S) if m: self.brand_id = m.group(1) m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S) if m: self.category_id = m.group(1) m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S) if m: self.seller_name = Common.urlDecode(m.group(1).strip()) m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S) if m: ts = "&callback=setMdskip×tamp=%s" % str(int(time.time()*1000)) initapi_url = Common.fix_url(m.group(1).strip()) + ts + "&ref=%s" % Common.urlCode(self.refers) init_page = self.crawler.getData(initapi_url, self.item_url) if not init_page and init_page == '': print '# init page is null..' else: m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S) if m: self.item_sellCount = m.group(1)
def getPage(self, url, shop_home_url):
    """Page through a Tmall shop's async search results and store each item."""
    position = 1
    i = 1
    max_page = 0
    asyn_url = ''
    i_url = url
    refers = shop_home_url
    result_s = self.get_asyn_data(i_url, refers, shop_home_url)
    # Total page count, e.g. "1/12" in the pager widget.
    m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S)
    if m:
        max_page = int(m.group(1))
    print '# page num:', max_page
    while i <= max_page:
        # Item list sits between the J_TItems container and the pagination bar.
        m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S)
        if m:
            items_s = m.group(1)
            p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>')
            # NOTE(review): j is initialized but never used.
            j = 1
            for item in p.finditer(items_s):
                item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip()
                # Protocol-relative URLs come back as //detail...; prefix them.
                if url_s.find('http') == -1:
                    item_url = 'http:' + url_s
                else:
                    item_url = url_s
                print '### item ###'
                print '# item val:', item_id, item_name, price, item_url
                # NOTE(review): "item" is rebound from the regex match to an
                # Item instance here — all match-group reads happen above, so
                # it works, but the shadowing is fragile.
                item = Item()
                item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time))
                print '# item info:', item.outItemSql()
                self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                time.sleep(2)
        # Build the next page URL by injecting/bumping pageNo.
        refers = i_url
        if i_url.find('pageNo=') == -1:
            i_url = re.sub(r'&tsearch=y', '&pageNo=%d&tsearch=y#anchor' % i, refers)
        else:
            i_url = re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % i, refers)
        i += 1
        time.sleep(2)
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
def getPage(self, url):
    """Walk up to 10 Taobao search result pages and store each auction row."""
    position = 1
    i = 1
    i_url = url
    refers = self.home_url
    max_page = 10   # hard page cap
    size_page = 48  # items per page, used for the s= offset parameter
    while i <= max_page:
        page = self.crawler.getData(i_url, refers)
        # Advance pagination state up front so a failed fetch still moves on.
        refers = i_url
        i_url = url + '&bcoffset=1&s=%s' % str(i*size_page)
        i += 1
        if not page or page == '':
            print 'not find data url:', i_url
            time.sleep(4)
            continue
        # Result data is embedded as the inline g_page_config JSON blob.
        m = re.search(r'<script>\s+g_page_config = ({.+?});.+?</script>', page, flags=re.S)
        if m:
            page_config = m.group(1)
            page_config_s = re.sub(r'\n+', '', page_config)
            data = json.loads(page_config_s)
            # Drill down to mods.itemlist.data.auctions.
            if data.has_key("mods"):
                if data["mods"].has_key("itemlist"):
                    itemlist = data["mods"]["itemlist"]
                    if itemlist.has_key("data"):
                        itemlist_data = itemlist["data"]
                        if itemlist_data.has_key("auctions"):
                            for item in itemlist_data["auctions"]:
                                # Prefer the id= query param; fall back to the
                                # list position as a stand-in id.
                                item_id = position
                                m = re.search(r'id=(\d+)', item["detail_url"], flags=re.S)
                                if m:
                                    item_id = m.group(1)
                                # Sales text like "123人付款" -> leading digits.
                                item_sales = item["view_sales"]
                                m = re.search(r'(\d+)', item["view_sales"], flags=re.S)
                                if m:
                                    item_sales = m.group(1)
                                print Common.time_s(Common.now()), position, item_id, item["raw_title"], item["view_price"], item_sales, item["user_id"], item["nick"], "http:" + item["detail_url"], "http:" + item["shopLink"]
                                self.mysqlAccess.insert_item((Common.time_s(Common.now()), str(item_id), str(position), str(item["raw_title"]), str(item["view_price"]), str(item_sales), "http:" + item["detail_url"], item["user_id"], str(item["nick"]), "http:" + item["shopLink"]))
                                position += 1
        time.sleep(4)
def run_items(self, items_info, tab_name, tab_url):
    """Extract bag items from a tab's listing HTML fragment.

    Items with a detail URL are queued on self.link_list; the rest are
    recorded directly on self.items.
    """
    p = re.compile(r'<.+?>\s*<a.+?>\s*<img.+?src="(.+?)".*?>\s*</a><a href="(.+?)">.+?<div class="infoDescription">(.+?)</div>\s*(<div class="infoPrice">.+?</div>).+?</a>\s*<.+?>', flags=re.S)
    for item in p.finditer(items_info):
        i_img, i_url, s_name, price_info = item.group(1), item.group(2), item.group(3), item.group(4)
        # Strip markup from the name text.
        i_name = re.sub(r'<.+?>', '', s_name)
        i_price, i_unit = '', ''
        m = re.search(r'<div.+?class="newprice">(.+?)</div>', price_info, flags=re.S)
        if m:
            s_price = re.sub(r'<.+?>', '', m.group(1))
            # ¥-prefixed price -> CNY amount.
            if s_price.find("¥") != -1:
                i_unit = "CNY"
                i_price = s_price.replace('¥', '').strip()
        if i_url and i_url != '':
            # Bag by name or by URL slug: queue for detail crawling.
            if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                self.link_list.append((tab_name, tab_url, i_name, self.home_url + i_url, i_img, i_price, i_unit))
        else:
            # No detail URL: record the listing data directly.
            if Common.isBag(i_name) or Common.isBag(unquote(i_url)):
                print self.home_url + i_url, i_img, i_name, i_price, i_unit
                i = BagItem(self.brand_type)
                i.initItem(tab_name, '', i_name, i_price, i_unit, '', i_url, i_img)
                self.items.append(i.outItem())
def get_asyn_data(self, i_url, refers, shop_home_url):
    """Fetch a shop page's async-search endpoint and return the unescaped payload."""
    result = ''
    result_s = ''
    page = self.crawler.getData(i_url, refers)
    # The async search URL is stashed in a hidden input on the shop page.
    m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>', page, flags=re.S)
    if m:
        # jsonp query prefix with a timestamp_random _ksTS token.
        ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(time.time()*1000)) + '_' + str(random.randint(100,999)))
        a_url = shop_home_url + Common.htmlDecode(m.group(1))
        # NOTE(review): re.sub replaces EVERY '?' in a_url, not just the query
        # separator — confirm async URLs never contain a second '?'.
        asyn_url = re.sub('\?', ts, a_url)
        result = self.crawler.getData(asyn_url, i_url)
        m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
        if m:
            # Unescape the \" sequences inside the jsonp string literal.
            result_s = re.sub(r'\\"', '"', m.group(1))
    return result_s
def TMItem(self): if self.item_url != '': page = self.crawler.getData(self.item_url, self.refers) if not page or page == '': raise Common.InvalidPageException( "# TMItem: not find item page,itemid:%s,item_url:%s" % (str(self.item_id), self.item_url)) m = re.search(r'sellerId:"(\d+)",', page, flags=re.S) if m: self.seller_id = m.group(1) m = re.search(r'shopId:"(\d+)",', page, flags=re.S) if m: self.shop_id = m.group(1) m = re.search( r'<div class="slogo">\s*<a class="slogo-shopname" href="(.+?)".+?><strong>(.+?)</strong></a>', page, flags=re.S) if m: self.shop_url, self.shop_name = Common.fix_url( m.group(1)), m.group(2).strip() m = re.search(r'TShop\.Setup\((.+?)\);', page, flags=re.S) if m: TShop_s = m.group(1).strip() m = re.search(r'"brand":"(.+?)",', TShop_s, flags=re.S) if m: self.brand_name = Common.htmlDecode(m.group(1).strip()) m = re.search(r'"brandId":"(\d+)",', TShop_s, flags=re.S) if m: self.brand_id = m.group(1) m = re.search(r'"categoryId":"(\d+)",', TShop_s, flags=re.S) if m: self.category_id = m.group(1) m = re.search(r'"sellerNickName":"(.+?)",', TShop_s, flags=re.S) if m: self.seller_name = Common.urlDecode(m.group(1).strip()) m = re.search(r'"initApi":"(.+?)",', TShop_s, flags=re.S) if m: ts = "&callback=setMdskip×tamp=%s" % str( int(time.time() * 1000)) initapi_url = Common.fix_url(m.group(1).strip( )) + ts + "&ref=%s" % Common.urlCode(self.refers) init_page = self.crawler.getData(initapi_url, self.item_url) if not init_page and init_page == '': print '# init page is null..' else: m = re.search(r'"sellCountDO":{"sellCount":(\d+),', init_page, flags=re.S) if m: self.item_sellCount = m.group(1)
def get_asyn_data(self, i_url, refers, shop_home_url):
    """Fetch a shop page's async-search endpoint and return the unescaped payload."""
    result = ''
    result_s = ''
    page = self.crawler.getData(i_url, refers)
    # The async search URL is stashed in a hidden input on the shop page.
    m = re.search(r'<input id="J_ShopAsynSearchURL".+?value="(.+?)"\s*/>',
                  page,
                  flags=re.S)
    if m:
        # jsonp query prefix with a timestamp_random _ksTS token.
        ts = '?_ksTS=%s&callback=jsonp135&' % (str(int(
            time.time() * 1000)) + '_' + str(random.randint(100, 999)))
        a_url = shop_home_url + Common.htmlDecode(m.group(1))
        # NOTE(review): re.sub replaces EVERY '?' in a_url, not just the query
        # separator — confirm async URLs never contain a second '?'.
        asyn_url = re.sub('\?', ts, a_url)
        result = self.crawler.getData(asyn_url, i_url)
        m = re.search(r'jsonp135\("(.+?)"\)', result, flags=re.S)
        if m:
            # Unescape the \" sequences inside the jsonp string literal.
            result_s = re.sub(r'\\"', '"', m.group(1))
    return result_s
def __init__(self):
    """Initialize the Taobao list crawler: retrying HTTP client, DB, state."""
    # Crawl settings
    #self.crawler = MyCrawler()
    self.crawler = RetryCrawler()
    # db
    self.mysqlAccess = MysqlAccess()  # mysql access
    # Site home URL, used as the initial referer
    self.home_url = 'http://www.taobao.com'
    self.refers = None
    # Collected results
    self.link_list = []
    self.items = []
    self.begin_time = Common.now()
def bagPage(self, url):
    """Walk the women's accessories menu tabs and collect lookbook bag links."""
    page = self.crawler.getData(url, self.home_url)
    if not page or page == '':
        return
    tab_list = []
    # Tabs under the "女装配饰" (women's accessories) level-4 menu.
    m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S)
    if m:
        tabs_list_info = m.group(1)
        p = re.compile(r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S)
        for tab in p.finditer(tabs_list_info):
            # Tab name is the French label plus its trailing text.
            tab_list.append((tab.group(2) + tab.group(3).strip(), tab.group(1)))
    for tab in tab_list:
        tab_name, tab_url = tab
        print '# tab:', tab_name, tab_url
        tab_page = self.crawler.getData(tab_url, url)
        m = re.search(r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S)
        if m:
            # Lookbook content is served via an ajax fragment endpoint.
            ajax_url = self.home_url + m.group(1) + "?ajax=true&fragment=true"
            ajax_data = self.crawler.getData(ajax_url, tab_url)
            if ajax_data:
                #data = json.loads(ajax_data)
                #if data and data.has_key("html"):
                #    print data["html"].decode("unicode-escape")
                # The payload is unicode-escaped JSON; unescape, then pull
                # the "html" field out with a regex instead of json.loads.
                r_data = ajax_data.decode("unicode-escape")
                if r_data:
                    m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S)
                    if m:
                        data_html = m.group(1).replace("\/", "/")
                        #print data_html
                        #break
                        p = re.compile(r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S)
                        for item in p.finditer(data_html):
                            i_url, i_img, s_number, i_name = self.home_url + item.group(3), item.group(4), item.group(2), re.sub(r'<.+?>', '', item.group(1)).strip()
                            i_number = ''
                            m = re.search(r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S)
                            if m:
                                i_number = m.group(1)
                            print i_url, i_img, i_name, i_number
                            # Keep handbag items only.
                            if Common.isBag(i_name):
                                self.link_list.append((tab_name, tab_url, i_name, i_url, i_img, i_number))
def __init__(self, home_url, brand_type):
    """Initialize per-item crawl state for one brand-site product."""
    # Crawl settings
    self.crawler = MyCrawler()
    self.crawling_time = Common.now()  # current crawl timestamp
    self.crawling_beginDate = time.strftime("%Y-%m-%d", time.localtime(self.crawling_time))  # crawl date
    self.crawling_beginHour = time.strftime("%H", time.localtime(self.crawling_time))  # crawl hour
    # Brand official-site home URL
    self.home_url = home_url
    # Brand type tag
    self.brand_type = brand_type
    # Parsed item fields, filled in later
    self.serie_title = ''
    self.item_title = ''
    self.item_name = ''
    self.item_price = ''
    self.item_unit = ''
    self.item_size = ''
    self.item_url = ''
    self.item_img = ''
    self.item_number = ''
def getData(self, url, refers='', decode=True, terminal='1'):
    """Fetch *url* and return the (optionally re-encoded) page body.

    refers: referer URL placed in the request header.
    decode: when True, transcode the body from the detected source
            charset to the target charset.
    terminal: pc/wap page types ('1'/'2') additionally run checkPage.
    Returns None for an empty or non-http URL.
    """
    # when null url, exit function
    # NOTE(review): requires a literal "http://" substring, so https-only
    # URLs are rejected here — confirm this is intended.
    if not url or not re.search(r'http://', url):
        return None
    # Build request headers for the given terminal type.
    _header = self.buildHeader(refers, terminal)
    # Pick session cookie vs. accumulated crawl cookie.
    _cookie = self.session_cookie if self.use_cookie else self.crawl_cookie
    # Perform the GET and read the raw body.
    r = self.crawler.session.get(url, headers=_header, cookies=_cookie, timeout=self.timeout)
    data = r.content
    # Whether the request was redirected.
    self.forward = (len(r.history) > 0)
    # Track cookies the server handed back (only in non-session mode).
    if not self.use_cookie and len(r.cookies) > 0:
        self.crawl_cookie = Common.cookieJar2Dict(r.cookies)
    # Source charset from the response headers.
    self.f_coder = self.charset(r.headers.get('content-type'))
    # Release the response.
    r.close()
    # Normalize encoding to the target charset, dropping undecodable bytes.
    if decode and self.f_coder != self.t_coder:
        data = data.decode(self.f_coder, 'ignore').encode(self.t_coder, 'ignore')
    # pc/wap page sanity checks (deny page, missing page).
    if terminal in ['1', '2']:
        self.checkPage(url, data)
    return data
def insert_item(self, args): try: sql = 'replace into nd_tb_parser_item(crawl_time,item_id,position,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_url) values(%s)' % Common.aggregate(10) self.db.execute(sql, args) except Exception, e: print '# insert tb item exception:', e
def insert_parser_item_info(self, args): try: sql = 'replace into nd_tb_parser_item_info(crawl_time,item_id,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_id,shop_name,shop_url,brand_id,brand_name,category_id,c_begindate,c_beginhour) values(%s)' % Common.aggregate(16) self.db.execute(sql, args) except Exception, e: print '# insert tb shop item exception:', e
def insert_item(self, args): try: sql = 'replace into nd_brand_parser_item(crawl_time,brand_name,serie_title,item_type,item_name,item_price,item_unit,item_size,item_url,item_img_url,item_number,c_begindate,c_beginhour) values(%s)' % Common.aggregate(13) self.brand_db.execute(sql, args) except Exception, e: print '# insert brand item exception:', e
def getPage(self, url, shop_home_url):
    """Page through a Tmall shop's async search results and store each item."""
    position = 1
    i = 1
    max_page = 0
    asyn_url = ''
    i_url = url
    refers = shop_home_url
    result_s = self.get_asyn_data(i_url, refers, shop_home_url)
    # Total page count, e.g. "1/12" in the pager widget.
    m = re.search(r'<b class="ui-page-s-len">\d+/(\d+)</b>', result_s, flags=re.S)
    if m:
        max_page = int(m.group(1))
    print '# page num:', max_page
    while i <= max_page:
        # Item list sits between the J_TItems container and the pagination bar.
        m = re.search(r'<div class="J_TItems">(.+?)<div class="pagination">', result_s, flags=re.S)
        if m:
            items_s = m.group(1)
            p = re.compile(r'<dl class=".+?".+?data-id="(.+?)">.+?<dd class="detail">\s*<a class="item-name".+?href="(.+?)".+?>(.+?)</a>\s*<div class="attribute">\s*<div class="cprice-area">\s*<span class="symbol">(.+?)</span>\s*<span\s*class="c-price">(.+?)</span>\s*</div>.+?</dl>')
            # NOTE(review): j is initialized but never used.
            j = 1
            for item in p.finditer(items_s):
                item_id, url_s, item_name, price_symbol, price = item.group(1), item.group(2), Common.htmlDecode(item.group(3).strip()), item.group(4).strip(), item.group(5).strip()
                # Protocol-relative URLs come back as //detail...; prefix them.
                if url_s.find('http') == -1:
                    item_url = 'http:' + url_s
                else:
                    item_url = url_s
                print '### item ###'
                print '# item val:', item_id, item_name, price, item_url
                # NOTE(review): "item" is rebound from the regex match to an
                # Item instance here — all match-group reads happen above, so
                # it works, but the shadowing is fragile.
                item = Item()
                item.parserTM((item_id, item_name, price, item_url, i_url, self.begin_time))
                print '# item info:', item.outItemSql()
                self.mysqlAccess.insert_parser_item_info(item.outItemSql())
                time.sleep(2)
        # Build the next page URL by injecting/bumping pageNo.
        refers = i_url
        if i_url.find('pageNo=') == -1:
            i_url = re.sub(r'&tsearch=y', '&pageNo=%d&tsearch=y#anchor' % i, refers)
        else:
            i_url = re.sub(r'&pageNo=\d+&', '&pageNo=%d&' % i, refers)
        i += 1
        time.sleep(2)
        result_s = self.get_asyn_data(i_url, refers, shop_home_url)
r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem() #self.items.append(i.outItem()) #print '# itemPage :', serie_title, i_name, i_url, i_img, i_size def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = BottegavenetaBag() b_url = "http://www.bottegaveneta.com/wy/%E5%A5%B3%E5%A3%AB/onlineboutique/%E6%89%8B%E8%A2%8B" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'bottegaveneta_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
m = re.search( r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = BossBag() b_url = "http://store.hugoboss.cn/category.php?id=3835&form_nav" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'boss_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def bagItems(self):
    """Crawl detail pages for all collected links with a worker-thread pool."""
    #for link in self.link_list: self.itemPage(link)
    # Cap worker threads at max_th; otherwise one thread per link.
    max_th = 10
    if len(self.link_list) > max_th:
        m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th)
    else:
        m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list))
    m_itemsObj.createthread()
    m_itemsObj.putItems(self.link_list)
    m_itemsObj.run()
    self.items.extend(m_itemsObj.items)

def outItems(self, f):
    """Write collected items to file *f*, one per line, with a header row."""
    s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
    with open(f, 'w') as f_item:
        self.items.insert(0, s)
        f_item.write('\n'.join(self.items))

if __name__ == '__main__':
    # Crawl the Givenchy site and dump items to a dated text file.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = GivenchyBag()
    b_url = 'http://www.givenchy.com/cn/'
    b.bagPage(b_url)
    b.bagItems()
    f = Config.dataPath + 'givenchy_%s.txt' %Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
i_number = '' m = re.search( r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem() i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) #print '# itemPage :', serie_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = ArmaniBag() b.bagPage() b.bagItems() f = Config.dataPath + 'armani_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
#i_size = "".join(size_str.split()) i_size = re.sub(r'\s*','',size_str) print "".join(i_size.split()) i_number = '' m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem() i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) #print '# itemPage :', serie_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = ArmaniBag() b.bagPage() b.bagItems() f = Config.dataPath + 'armani_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
self.crawler = MyCrawler() # 品牌官网链接 self.home_url = 'http://www.mcmworldwide.com' self.women_url = self.home_url + '/en/women' self.bag_url = self.women_url + '/bags' self.backpack_url = self.women_url + '/backpacks' self.leather_url = self.women_url + '/small-leather-goods' self.refers = None # 抓取商品列表 self.links = [] self.items = [] def bagPage(self): url = self.bug_url + '#start=0&sz=32&srule=New' page = self.crawler.getData(self.bag_url, self.women_url) if not page or page == '': return if __name__ == '__main__': b = ChanelBag() b_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags/g.spring-summer-2015.c.15S.html' b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'chanel_%s.txt' % Common.today_ss() print f b.outItems(f)
m = re.search(r'<h2 class="sku reading-and-link-text">(.+?)</h2>', page, flags=re.S) if m: i_number = m.group(1).strip() else: m = re.search(r'<meta itemprop="identifier" content="sku:(.+?)"/>', page, flags=re.S) if m: i_number = m.group(1).strip() i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = LouisvuittonBag() b_url = "http://www.louisvuitton.cn/zhs-cn/homepage" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'louisvuitton_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
m = re.search(r'<div class="itemDimensions">.+?<span class="dimensions">(.+?)</span></div>', page, flags=re.S) if m: i_size = m.group(1) i_number m = re.search(r'<div class="styleIdDescription">货号.+?<span.*?>(.+?)</span></div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = YslBag() b_url = "http://www.ysl.com/wy/shop-product/%E5%A5%B3%E5%A3%AB" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'ysl_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def insert_parser_item_info(self, args): try: sql = 'replace into nd_tb_parser_item_info(crawl_time,item_id,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_id,shop_name,shop_url,brand_id,brand_name,category_id,c_begindate,c_beginhour) values(%s)' % Common.aggregate( 16) self.db.execute(sql, args) except Exception, e: print '# insert tb shop item exception:', e
m = re.search(r'"currency-symbol":"(.+?)"', data, flags=re.S) if m: unit = m.group(1) if self.item_price != '': if price: i_price += '-' + price else: if price: i_price = price if unit: i_unit = unit i = BagItem(self.brand_type) i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) #print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = ChanelBag() b_url = 'http://www.chanel.com/zh_CN/fashion/products/handbags.html' b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'chanel_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def insert_item(self, args): try: sql = 'replace into nd_tb_parser_item(crawl_time,item_id,position,item_name,item_price,item_sale,item_url,seller_id,seller_name,shop_url) values(%s)' % Common.aggregate( 10) self.db.execute(sql, args) except Exception, e: print '# insert tb item exception:', e
def bagItems(self):
    """Crawl detail pages for all collected links with a worker-thread pool."""
    #for link in self.link_list: self.itemPage(link)
    # Cap worker threads at max_th; otherwise one thread per link.
    max_th = 10
    if len(self.link_list) > max_th:
        m_itemsObj = BagItemM(self.home_url, self.brand_type, max_th)
    else:
        m_itemsObj = BagItemM(self.home_url, self.brand_type, len(self.link_list))
    m_itemsObj.createthread()
    m_itemsObj.putItems(self.link_list)
    m_itemsObj.run()
    self.items.extend(m_itemsObj.items)

def outItems(self, f):
    """Write collected items to file *f*, one per line, with a header row."""
    s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号'
    with open(f, 'w') as f_item:
        self.items.insert(0, s)
        f_item.write('\n'.join(self.items))

if __name__ == '__main__':
    # Crawl the Givenchy site and dump items to a dated text file.
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    b = GivenchyBag()
    b_url = 'http://www.givenchy.com/cn/'
    b.bagPage(b_url)
    b.bagItems()
    f = Config.dataPath + 'givenchy_%s.txt' % Common.today_ss()
    b.outItems(f)
    print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def outTuple(self):
    # Row tuple for the brand-item SQL insert; the 13-value order matches
    # insert_item's nd_brand_parser_item column list (crawl_time ... c_beginhour).
    return (Common.time_s(self.crawling_time), self.brand_type, self.serie_title,
            self.item_title, self.item_name, self.item_price, self.item_unit,
            self.item_size, self.item_url, self.item_img, self.item_number,
            self.crawling_beginDate, self.crawling_beginHour)
m = re.search(r'<span.+?>尺寸大小:</span>(.+?)</span>', page, flags=re.S) if m: i_size = re.sub(r'<.+?>','',m.group(1)) i_number = '' m = re.search(r'<div class="base">\s*<div class="sku-brand">.+?<dl class="hidden"><dt>商品货号: </dt><dd>(.+?)</dd></dl>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = BossBag() b_url = "http://store.hugoboss.cn/category.php?id=3835&form_nav" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'boss_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
m = re.search( r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: i_number = m.group(1).split(':')[1].strip() i = BagItem(self.brand_type) i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = DolcegabbanaBag() b_url = "http://www.dolcegabbana.com.cn/cn/dolce-gabbana/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'dolcegabbana_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
i_size += m.group(1) + ":" + m.group(2) + ";" i_number m = re.search(r' <div class="modelFabricColorWrapper">\s*<div class="inner".*?>\s*<span class="modelTitle">.+?</span>.+?<span.*?class="value">(.+?)</span>\s*</div>\s*</div>\s*</div>', page, flags=re.S) if m: i_number = m.group(1) i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, '', '', i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) #print '# itemPage :', serie_title, i_name, i_url, i_img, i_size def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = BottegavenetaBag() b_url = "http://www.bottegaveneta.com/wy/%E5%A5%B3%E5%A3%AB/onlineboutique/%E6%89%8B%E8%A2%8B" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'bottegaveneta_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) i_number = s_number.split('-')[1].strip() i = BagItem() i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = DiorBag() b_url = 'http://www.dior.cn/couture/zh_cn/%E5%A5%B3%E5%A3%AB%E6%97%B6%E8%A3%85/%E7%9A%AE%E5%85%B7' b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'dior_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
def bagPage(self, url): page = self.crawler.getData(url, self.home_url) if not page or page == '': return tab_list = [] m = re.search(r'<p>女装配饰</p>\s+<ul class="menu-level-4">(.+?)</ul>', page, flags=re.S) if m: tabs_list_info = m.group(1) p = re.compile( r'<li class="submenu-item">\s+<a.+?href="(.+?)">\s+<span lang="fr" class="lang-fr">(.+?)</span>(.+?)</a>\s+</li>', flags=re.S) for tab in p.finditer(tabs_list_info): tab_list.append( (tab.group(2) + tab.group(3).strip(), tab.group(1))) for tab in tab_list: tab_name, tab_url = tab print '# tab:', tab_name, tab_url tab_page = self.crawler.getData(tab_url, url) m = re.search( r'<div id="layer-1".+?>\s+<a href="(.+?)" class="layer-links">.+?</a>', tab_page, flags=re.S) if m: ajax_url = self.home_url + m.group( 1) + "?ajax=true&fragment=true" ajax_data = self.crawler.getData(ajax_url, tab_url) if ajax_data: #data = json.loads(ajax_data) #if data and data.has_key("html"): # print data["html"].decode("unicode-escape") r_data = ajax_data.decode("unicode-escape") if r_data: m = re.search(r'"html":"(.+?)"}', r_data, flags=re.S) if m: data_html = m.group(1).replace("\/", "/") #print data_html #break p = re.compile( r'<li class="lookbook-item line" data-idlook="\d+">\s+<div class="disp-n">.+?<div class="look-info article">\s+<p>(.+?)</p>.+?<p class="look-ref">(.+?)</p>.+?</div>.+?</div>\s+<a href="(.+?)".+?>.+?<img .+?data-src="(.+?)".*?/>\s+</li>', flags=re.S) for item in p.finditer(data_html): i_url, i_img, s_number, i_name = self.home_url + item.group( 3), item.group(4), item.group(2), re.sub( r'<.+?>', '', item.group(1)).strip() i_number = '' m = re.search( r'<span class="look-ref-sku">\s*<span.+?>(.+?)</span>\s*</span>', s_number, flags=re.S) if m: i_number = m.group(1) print i_url, i_img, i_name, i_number if Common.isBag(i_name): self.link_list.append( (tab_name, tab_url, i_name, i_url, i_img, i_number))
def outItemSql(self): return (Common.time_s(self.crawling_time),self.item_id,self.item_name,self.item_price,self.item_sellCount,self.item_url,self.seller_id,self.seller_name,self.shop_id,self.shop_name,self.shop_url,self.brand_id,self.brand_name,self.category_id,self.crawling_beginDate,self.crawling_beginHour)
i_number = '' m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: i_size, i_number = m.group(1).strip(), m.group(2).strip() i = BagItem(self.brand_type) i.initItem(serie_title, '', i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:', i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = FerragamoBag() b_url = "http://www.ferragamo.cn/woman/handbags/" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'ferragamo_%s.txt' % Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
i_img = m.group(1) i_size = "" i_number = "" m = re.search(r'<div class="product-code">(.+?)型号代码(.+?)</div>', page, flags=re.S) if m: i_size, i_number = m.group(1).strip(), m.group(2).strip() i = BagItem(self.brand_type) i.initItem(serie_title, "", i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print "# itemPage:", i.outItem() # self.items.append(i.outItem()) def outItems(self, f): s = "#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号" with open(f, "w") as f_item: self.items.insert(0, s) f_item.write("\n".join(self.items)) if __name__ == "__main__": print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) b = FerragamoBag() b_url = "http://www.ferragamo.cn/woman/handbags/" b.bagPage(b_url) b.bagItems() f = Config.dataPath + "ferragamo_%s.txt" % Common.today_ss() b.outItems(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
m = re.search(r'<div class="scrollCnt">\s*<ul>.+?<li>(尺码.+?)</li>', page, flags=re.S) if m: i_size = m.group(1) i_number = '' m = re.search(r'<div id="itemTechSheet">.+?<p class="prodCode">(.+?)</p>', page, flags=re.S) if m: i_number = m.group(1).split(':')[1].strip() i = BagItem(self.brand_type) i.initItem('', item_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) print '# itemPage:',i.outItem() #self.items.append(i.outItem()) def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = DolcegabbanaBag() b_url = "http://www.dolcegabbana.com.cn/cn/dolce-gabbana/%E5%A5%B3%E5%A3%AB/onlinestore/%E5%8C%85%E8%A2%8B" b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'dolcegabbana_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
if m: i_size = m.group(1).strip() i_number = '' m = re.search(r'<div class="columns-wrapper">.+?<div class="column">.*?<div class="reference">\s*<p>(.+?)</p>\s*</div>', page, flags=re.S) if m: s_number = m.group(1) i_number = s_number.split('-')[1].strip() i = BagItem() i.initItem(serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img, i_number) self.items.append(i.outItem) print '# itemPage :', serie_title, i_title, i_name, i_price, i_unit, i_size, i_url, i_img def outItems(self, f): s = '#系列名称|商品标签|商品名称|商品价格|金额单位|商品尺寸|商品链接|商品图片|商品编号' with open(f, 'w') as f_item: self.items.insert(0, s) f_item.write('\n'.join(self.items)) if __name__ == '__main__': print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) b = DiorBag() b_url = 'http://www.dior.cn/couture/zh_cn/%E5%A5%B3%E5%A3%AB%E6%97%B6%E8%A3%85/%E7%9A%AE%E5%85%B7' b.bagPage(b_url) b.bagItems() f = Config.dataPath + 'dior_%s.txt' %Common.today_ss() b.outItems(f) print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))