# -*- coding: utf-8 -*-
import re
import time
import random

# NOTE: the project-local classes used below (Config, Common, Message, Jsonpage,
# TBCrawler, RetryCrawler, DialClient, MysqlAccess, RedisQueue, RedisAccess,
# MongofsAccess, JHSQ, JHSWorker, JHSGroupItemParserM, JHSGroupItemQM) are
# assumed to be imported at module scope; their import paths are repo-specific.

class JHSGroupItemWorker():
    '''A class of JHS group item channel worker'''

    def __init__(self):
        # JHS group item type
        self.worker_type = Config.JHS_GroupItem
        self.jhs_type = Config.JHS_TYPE  # queue type

        # message
        self.message = Message()
        # JSON page fetcher
        self.jsonpage = Jsonpage()
        # crawler settings
        self.crawler = TBCrawler()

        # crawl-time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess = MysqlAccess()
        # redis queue
        self.redisQueue = RedisQueue()
        # redis access
        self.redisAccess = RedisAccess()
        # mongodb fs access
        self.mongofsAccess = MongofsAccess()

    def init_crawl(self, _obj, _crawl_type):
        self._obj = _obj
        self._crawl_type = _crawl_type
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._router_tag = 'ikuai'
        #self._router_tag = 'tpent'
        # items
        self.items = []
        # given-up items
        self.giveup_items = []
        # given-up msg val
        self.giveup_val = None

    # To dial the router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    def push_back_list(self, L, v):
        L.extend(v)

    def push_back_val(self, L, v):
        L.append(v)

    # To retry a crawl message, re-queueing it until its retry budget is spent
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retried too many times, giving up:", msg

    def crawlPage(self, _key, msg, _val):
        try:
            if self._obj == 'groupitemcat':
                self.run_category(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % self._obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:', e
            self.crawlRetry(_key, msg)
        except Common.DenypageException as e:
            print '# Deny page exception:', e
            self.crawlRetry(_key, msg)
            # re-dial the router
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DialClient Exception err:', e
                time.sleep(random.uniform(10, 30))
            time.sleep(random.uniform(10, 30))
        except Common.SystemBusyException as e:
            print '# System busy exception:', e
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(10, 30))
        except Common.RetryException as e:
            print '# Retry exception:', e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(20, 30))
        except Exception as e:
            print '# exception err:', e
            self.crawlRetry(_key, msg)
            # re-dial the router, unless it was a plain read timeout
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DialClient Exception err:', e
                time.sleep(random.uniform(10, 30))
            Common.traceback_log()

    def run_category(self, msg, _val):
        category_val = msg["val"]
        refers = _val
        c_url, c_name, c_id = category_val
        print c_url, c_name, c_id
        page = self.crawler.getData(c_url, refers)
        page_val = (page, c_name, c_id)
        ajax_url_list = self.getAjaxurlList(page_val, c_url)
        if len(ajax_url_list) > 0:
            self.get_jsonitems(ajax_url_list)

    # get the JSON ajax urls embedded in a category page
    def getAjaxurlList(self, page_val, refers=''):
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>', flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = c_name
            # un-escape '&amp;' in the url
            a_url = a_info.group(1).replace('amp;', '')
            info = a_info.group(2)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            a_val = (c_id, c_name, refers, c_subNav)
            url_list.append((a_url, refers, a_val))
            i += 1
        return url_list

    # get the item json lists of a category page from its ajax urls
    def get_jsonitems(self, ajax_url_list):
        # today's "all items" val
        todayall_item_val = []
        # items of the other sub navs
        item_list = []
        # process the ajax url list
        item_json_index = 0
        # mongo json pages
        cat_pages = {}
        for a_url in ajax_url_list:
            # get json from the ajax url
            Result_list = self.jsonpage.get_json([a_url])
            # mongo page json
            _url, _refers, _val = a_url
            _c_id = _val[0]
            time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
            # timeStr_jhstype_webtype_itemgroupcat_catid
            key = '%s_%s_%s_%s_%s' % (time_s, Config.JHS_TYPE, '1', 'itemgroupcat', str(_c_id))
            cat_pages[key] = '<!-- url=%s --> %s' % (_url, str(Result_list))
            if Result_list and len(Result_list) > 0:
                item_result_valList = self.jsonpage.parser_itemjson(Result_list)
                if item_result_valList and len(item_result_valList) > 0:
                    item_json_index += 1
                    # the first item list holds all online items
                    if item_json_index == 1:
                        if len(item_result_valList) > 0:
                            print '# all online items.....'
                            todayall_item_val = item_result_valList
                    else:
                        self.push_back_list(item_list, item_result_valList)
                else:
                    print '# no itemjson parse val list...'
        if len(item_list) > 0:
            self.parseItems(item_list)
        # persist the category page json to mongo
        for key in cat_pages.keys():
            _pages = (key, cat_pages[key])
            self.mongofsAccess.insertJHSPages(_pages)

    # parse the item data fetched from the ajax interface
    def parseItems(self, item_list):
        print '# parse Group Items start:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # extra info
        a_val = (self.begin_time,)
        # multi-threading: cap the number of concurrent threads
        max_th = Config.item_max_th
        if len(item_list) > max_th:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_list)
        m_itemsObj.run()
        _items = m_itemsObj.items
        self.push_back_list(self.items, _items)
        print '# queue item num:', len(self.items)
        print '# parse item num:', len(_items)
        print '# parse Group Items end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _obj == 'groupitem':
            self.processMulti(_val)
        else:
            self.processOne(_val)

    def processOne(self, _val=None):
        i, M = 0, 10
        n = 0
        while True:
            try:
                if self._crawl_type and self._crawl_type != '':
                    _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
                else:
                    _key = '%s_%s' % (self.jhs_type, self._obj)
                _msg = self.redisQueue.get_q(_key)
                # the queue is empty
                if not _msg:
                    i += 1
                    if i > M:
                        print '# all get catQ item num:', n
                        print '# not get catQ of key:', _key
                        break
                    time.sleep(10)
                    continue
                n += 1
                self.crawlPage(_key, _msg, _val)
            except Exception as e:
                print '# exception err in process of JHSGroupItemWorker:', e, _key, _msg

    def processMulti(self, _val=None):
        if self._crawl_type and self._crawl_type != '':
            _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
        else:
            _key = '%s_%s' % (self.jhs_type, self._obj)
        try:
            self.crawlPageMulti(_key, _val)
        except Exception as e:
            print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e, _key)

    # To crawl pages with multiple threads
    def crawlPageMulti(self, _key, _val):
        if self._obj == 'groupitem':
            self.run_groupitem(_key, _val)
        else:
            print '# crawlPageMulti unknown obj = %s' % self._obj

    def run_groupitem(self, _key, _val):
        # 20 concurrent queue worker threads
        m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val)
        m_itemQ.createthread()
        m_itemQ.run()
        item_list = m_itemQ.items
        print '# crawl Items num: %d' % len(item_list)

    # delete expired items from redis
    def delItem(self, _items):
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.begin_time)
                # delete the item once it has expired
                if now_time > end_time:
                    self.redisAccess.delete_jhsitem(keys)

    # store item info in redis
    def putItemDB(self, _items):
        for _item in _items:
            # skip item IDs that already exist
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys): continue
            # write the item's base data into redis
            item_val = self.message.itemInfo(_item["r_val"])
            val = self.message.itemMsg(item_val)
            self.redisAccess.write_jhsitem(keys, val)

    # update item info
    def updateItem(self, _item):
        keys = [self.worker_type, _item["item_juId"]]
        item = self.redisAccess.read_jhsitem(keys)
        if item:
            item_val = self.message.itemParseInfo(_item["r_val"])
            c = False
            if item["start_time"] != item_val["start_time"]:
                item["start_time"] = item_val["start_time"]
                c = True
            if item["end_time"] != item_val["end_time"]:
                item["end_time"] = item_val["end_time"]
                c = True
            if c:
                self.redisAccess.write_jhsitem(keys, item)

    # find new items
    def selectNewItems(self, _items):
        new_items = []
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys):
                self.updateItem(_item)
                continue
            new_items.append(_item["val"])
        return new_items

    def scanEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        _items = self.mysqlAccess.selectJhsGroupItemEnd(val)
        end_items = []
        # walk the items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId": str(item_juid)})
        print '# del item nums:', len(end_items)
        # delete items that have already ended
        self.delItem(end_items)

    def scanEndItemsLasthour(self):
        val = (Common.add_hours(self.crawling_time, -2), Common.add_hours(self.crawling_time, -2), Common.add_hours(self.crawling_time, -1))
        _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val)
        end_items = []
        # walk the items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId": str(item_juid)})
        print '# del item nums for last hour end:', len(end_items)
        # delete items that have already ended
        self.delItem(end_items)

    def scanAliveItems(self):
        # up to one hour past the end time
        val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1))
        # find items whose group has started but not yet ended
        _items = self.mysqlAccess.selectJhsGroupItemAlive(val)
        print "# hour all item nums:", len(_items)
        return _items

    def scanNotEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        # find items that have not ended
        _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val)
        i = 1
        for _item in _items:
            print i
            item_juid = str(_item[1])
            keys = [self.worker_type, item_juid]
            item = self.redisAccess.read_jhsitem(keys)
            print item
            #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]}
            #self.redisAccess.write_jhsitem(keys, _new_item)
            i += 1

    def scanCategories(self):
        category_list = self.mysqlAccess.selectJhsGroupItemCategory()
        return category_list
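# Usage sketch (not part of the original source; the call sequence is assumed
# from process()/processOne()/processMulti() above, and the _val arguments are
# illustrative). A crawl host would drain the 'groupitemcat' redis queue first,
# then fan the 'groupitem' queue out across JHSGroupItemQM's 20 worker threads:
#
#   worker = JHSGroupItemWorker()
#   worker.process('groupitemcat', 'main', Config.ju_home_today)  # _val serves as the referer
#   worker.process('groupitem', 'main', (Common.now(),))          # _val is forwarded to JHSGroupItemQM
#   worker.scanEndItemsLasthour()   # evict items that ended in the last hour from redis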
class JHSBrand():
    '''A class of JHS category channel'''

    def __init__(self, m_type):
        # crawler settings
        self.crawler = RetryCrawler()
        # DB
        self.mysqlAccess = MysqlAccess()  # mysql access
        # cat queue
        self.cat_queue = JHSQ('cat', 'main')
        # act queue
        self.act_queue = JHSQ('act', 'main')
        self.work = JHSWorker()
        # default categories
        #self.category_list = [("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000")]
        self.category_list = [
            ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav", "女装", "1000"),
            ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav", "男装", "7000"),
            ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav", "鞋包", "3000"),
            ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav", "内衣", "4000"),
            ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav", "饰品", "42000"),
            ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav", "运动", "38000"),
            ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav", "美妆", "2000"),
            ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav", "童装", "23000"),
            ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav", "零食", "5000"),
            ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav", "母婴", "6000"),
            ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav", "百货", "37000"),
            ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav", "汽车", "36000"),
            ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav", "家电", "34000"),
            ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav", "数码", "43000"),
            ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav", "家装", "225000"),
            ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav", "家纺", "35000")
        ]
        # page
        self.site_page = None
        # crawl start time
        self.begin_time = Common.now()
        # distributed-master flag
        self.m_type = m_type

    def antPage(self):
        try:
            # the master machine sets up the redis queues
            if self.m_type == 'm':
                category_list = self.mysqlAccess.selectJhsGroupItemCategory()
                if not category_list or len(category_list) == 0:
                    category_list = self.category_list
                if category_list and len(category_list) > 0:
                    cate_val_list = []
                    for cate in category_list:
                        cate_val_list.append((cate[0], cate[2], cate[1], Config.ju_home_today, Config.JHS_GroupItem))
                    # clear the category redis queue
                    self.cat_queue.clearQ()
                    # fill the category redis queue
                    self.cat_queue.putlistQ(cate_val_list)
                    # clear the act redis queue
                    self.act_queue.clearQ()
                    print '# category queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                else:
                    print '# no category found...'

            # activity json of the categories
            obj = 'cat'
            crawl_type = 'main'
            # ids of activities that have not started yet
            val = (Common.time_s(Common.now()),)
            acts = self.mysqlAccess.selectJhsActNotStart(val)
            brandact_id_list = []
            if acts:
                for act in acts:
                    brandact_id_list.append(str(act[1]))
            _val = (self.begin_time, brandact_id_list)
            self.work.process(obj, crawl_type, _val)
            # activity data
            act_val_list = self.work.items
            print '# act nums:', len(act_val_list)
            # push to the redis queue
            self.act_queue.putlistQ(act_val_list)
            print '# act queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

            if self.m_type == 'm':
                val = (Common.add_hours(self.begin_time, -2), Common.add_hours(self.begin_time, -2), Common.add_hours(self.begin_time, -1))
                # remove activities that ended in the last hour from redis
                _acts = self.mysqlAccess.selectJhsActEndLastOneHour(val)
                print '# end acts num:', len(_acts)
                self.work.delAct(_acts)
                # remove items that ended in the last hour from redis
                _items = self.mysqlAccess.selectJhsItemEndLastOneHour(val)
                print '# end items num:', len(_items)
                self.work.delItem(_items)
        except Exception as e:
            print '# antpage error :', e
            Common.traceback_log()

    # group-item channel category list
    def categoryListTEMP(self):
        page = self.crawler.getData(Config.ju_home_today, Config.ju_home)
        if not page or page == '':
            print '# not get today page'
        category_list = []
        m = re.search(r'<div class="J_CatLeft layout-left">.+?<table>(.+?)</table>.+?</div>', page, flags=re.S)
        if m:
            category_list = self.categoryListType1(m.group(1))
        else:
            m = re.search(r'<div class="catbg">\s+<div class="ju-wrapper">\s+<div class="cat-menu-h".+?>.+?<ul class="clearfix">(.+?)</ul>', page, flags=re.S)
            if m:
                category_list = self.categoryListType2(m.group(1))
        return category_list

    def categoryListType1(self, page):
        category_list = []
        m = re.search(r'<tr class="h2">.+?</tr>(.+?)<tr class="h2">', page, flags=re.S)
        if m:
            cate_list = m.group(1)
            p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
            for cate in p.finditer(cate_list):
                category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list

    def categoryListType2(self, page):
        category_list = []
        p = re.compile(r'<a.+?href="(.+?)".+?>(.+?)</a>', flags=re.S)
        for cate in p.finditer(page):
            category_list.append((cate.group(1), cate.group(2).strip()))
        return category_list
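# Usage sketch (assumed entry point): 'm' marks the distributed master, which
# (re)builds the cat/act redis queues before crawling; any other flag makes
# this host consume the shared queues only:
#
#   ant = JHSBrand('m')
#   ant.antPage()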
class JHSBrandPosition():
    '''A class of JHS brand act position'''

    def __init__(self, m_type):
        # crawler settings
        self.crawler = RetryCrawler()
        # DB
        self.mysqlAccess = MysqlAccess()  # mysql access
        # cat homeposition queue
        self.home_queue = JHSQ('cat', 'homeposition')
        # cat position queue
        self.cat_queue = JHSQ('cat', 'position')
        # act queue
        self.act_queue = JHSQ('act', 'position')
        self.work = JHSWorker()
        # default categories
        #self.category_list = [("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav","女装","1000")]
        self.category_list = [
            ("http://ju.taobao.com/jusp/nvzhuangpindao/tp.htm#J_FixedNav", "女装", "1000"),
            ("http://ju.taobao.com/jusp/nanzhuangpindao/tp.htm#J_FixedNav", "男装", "7000"),
            ("http://ju.taobao.com/jusp/xiebaopindao/tp.htm#J_FixedNav", "鞋包", "3000"),
            ("http://ju.taobao.com/jusp/neiyipindao/tp.htm#J_FixedNav", "内衣", "4000"),
            ("http://ju.taobao.com/jusp/zhubaoshipin/tp.htm#J_FixedNav", "饰品", "42000"),
            ("http://ju.taobao.com/jusp/yundongpindao/tp.htm#J_FixedNav", "运动", "38000"),
            ("http://ju.taobao.com/jusp/meizhuangpindao/tp.htm#J_FixedNav", "美妆", "2000"),
            ("http://ju.taobao.com/jusp/tongzhuangpindao/tp.htm#J_FixedNav", "童装", "23000"),
            ("http://ju.taobao.com/jusp/shipinpindao/tp.htm#J_FixedNav", "零食", "5000"),
            ("http://ju.taobao.com/jusp/muyingpindao/tp.htm#J_FixedNav", "母婴", "6000"),
            ("http://ju.taobao.com/jusp/baihuopindao/tp.htm#J_FixedNav", "百货", "37000"),
            ("http://ju.taobao.com/jusp/chepinpindao/tp.htm#J_FixedNav", "汽车", "36000"),
            ("http://ju.taobao.com/jusp/jiadianpindao/tp.htm#J_FixedNav", "家电", "34000"),
            ("http://ju.taobao.com/jusp/shumapindao/tp.htm#J_FixedNav", "数码", "43000"),
            ("http://ju.taobao.com/jusp/jiajunewpindao/tp.htm#J_FixedNav", "家装", "225000"),
            ("http://ju.taobao.com/jusp/jiajupindao/tp.htm#J_FixedNav", "家纺", "35000")
        ]
        # page
        self.site_page = None
        # crawl start time
        self.begin_time = Common.now()
        # distributed-master flag
        self.m_type = m_type

    def antPage(self):
        try:
            # the master machine sets up the redis queues
            if self.m_type == 'm':
                # clear the category home-page url redis queue
                self.home_queue.clearQ()
                # seed the redis queue
                self.home_queue.putlistQ([(Config.ju_brand_home, Config.ju_home)])
                print '# cat homeposition queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # group-item category pages
                category_list = self.mysqlAccess.selectJhsGroupItemCategory()
                if not category_list or len(category_list) == 0:
                    category_list = self.category_list
                if category_list and len(category_list) > 0:
                    cate_val_list = []
                    for cate in category_list:
                        cate_val_list.append((cate[0], cate[2], cate[1], Config.ju_home_today, Config.JHS_GroupItem))
                    # clear the category redis queue
                    self.cat_queue.clearQ()
                    # fill the category redis queue
                    self.cat_queue.putlistQ(cate_val_list)
                    # clear the act redis queue
                    self.act_queue.clearQ()
                    print '# category position queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                else:
                    print '# no category found...'

            # category json url list
            obj = 'cat'
            crawl_type = 'homeposition'
            self.work.process(obj, crawl_type)
            # json url list
            json_val_list = self.work.items
            if json_val_list and len(json_val_list) > 0:
                # push to the redis queue
                self.cat_queue.putlistQ(json_val_list)
                print '# cat position queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

            # activity json of the categories
            obj = 'cat'
            crawl_type = 'position'
            # extra info: crawl begin time
            a_val = (self.begin_time,)
            self.work.process(obj, crawl_type, a_val)
            # activity data
            act_val_list = self.work.items
            print '# act nums:', len(act_val_list)
            # push to the redis queue
            self.act_queue.putlistQ(act_val_list)
            print '# act queue end:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        except Exception as e:
            print '# antpage error :', e
            Common.traceback_log()
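# Usage sketch (assumed, mirroring JHSBrand): the master seeds the
# homeposition/position queues, then the position crawl runs on every host:
#
#   pos = JHSBrandPosition('m')
#   pos.antPage()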