class XCItemRedisM(MyThread):
    '''MyThread subclass that consumes an xc item redis queue.

    Messages are popped from the redis queue named by ``key``.  For the
    'spot' queue type each message is parsed with ``Item`` and the resulting
    rows are written to MySQL in batches; messages of any other queue type
    are consumed and dropped.
    '''

    def __init__(self, key, q_type, thread_num=10, a_val=None):
        '''Set up storage handles, the dial client and the result lists.

        key        -- name of the redis queue to consume
        q_type     -- xc queue type; only 'spot' is handled by crawl()
        thread_num -- passed through to MyThread.__init__
        a_val      -- optional value appended to every message "val"
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock guarding the shared result lists
        self.mutex = threading.Lock()

        self.xc_type = Config.XC_TYPE  # xc type
        #self.item_type = q_type # item queue type

        # db
        self.mysqlAccess = MysqlAccess()      # mysql access
        self.redisQueue = RedisQueue()        # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # xc queue type
        self.xc_queue_type = q_type
        self._key = key  # redis queue key

        # appendix val
        self.a_val = a_val

        # return items
        self.items = []

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'

        # give up item, retry too many times
        self.giveup_items = []

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a "<_type>_<_obj>" request to the dial client.

        Best effort: any exception is logged and swallowed.
        '''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        '''Append v to the list L while holding the instance mutex.'''
        # "with" releases the lock even if append raises, which the manual
        # acquire()/release() pair did not guarantee.
        with self.mutex:
            L.append(v)

    def crawlRetry(self, _key, msg):
        '''Increment msg['retry'] and re-queue msg unless the limit is reached.

        The limit is Config.item_crawl_retry for msg["obj"] == 'item' and
        Config.crawl_retry otherwise.  A falsy msg is ignored.
        '''
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    # insert item
    def insertIteminfo(self, iteminfosql_list, f=False):
        '''Write the batch to MySQL when it is full or when f forces a flush.

        Returns True when the batch was due for flushing (the caller should
        then start a new list), False when the batch is still filling up.
        '''
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertXCItem(iteminfosql_list)
            return True
        return False

    # item sql list
    def crawl(self):
        '''Consume the redis queue until it has been found empty 3 times.

        Every empty poll flushes the pending MySQL batch and is followed by
        a 10 second sleep; the counter of empty polls is never reset.
        '''
        _iteminfosql_list = []
        i, M = 0, 2
        n = 0
        while True:
            # Reset per iteration so the except handlers below never see an
            # unbound name or re-queue the message of a previous iteration
            # when get_q() itself fails.
            _data = None
            try:
                _data = self.redisQueue.get_q(self._key)

                # queue is empty
                if not _data:
                    # flush what has been collected so far
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                obj = 'item'
                if self.xc_queue_type == 'spot':
                    # item instance
                    item = Item()
                    #_val = _data[1]
                    _val = _data["val"]
                    if self.a_val:
                        _val = _val + self.a_val

                    item.antPage(_val)
                    # collect the result
                    self.push_back(self.items, item.outSql())

                    # store the tickets
                    tickets = item.item_tickets
                    # was "len() > 0", which raised TypeError whenever an
                    # item actually had tickets
                    if tickets:
                        self.mysqlAccess.insertXCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    continue

                # store the web pages (disabled)
                #if item and obj != '':
                #    _pages = item.outItemPage(obj, self.xc_queue_type)
                #    self.mongofsAccess.insertXCPages(_pages)

                # delay between items
                time.sleep(1)

            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)

            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)

            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)

            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()

                self.crawlRetry(self._key, _data)
                # anything but a read timeout triggers a redial
                if 'Read timed out' not in str(e):
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as dial_err:
                        Common.log('# DailClient Exception err: %s' % dial_err)
                        time.sleep(10)
                    time.sleep(random.uniform(10, 30))
class JHSWorker(): '''A class of jhs worker''' def __init__(self): # jhs brand type self.worker_type = Config.JHS_Brand # DB self.jhs_type = Config.JHS_TYPE # queue type self.mysqlAccess = MysqlAccess() # mysql access self.redisQueue = RedisQueue() # redis queue self.redisAccess = RedisAccess() # redis db self.mongofsAccess = MongofsAccess() # mongodb fs access # 获取Json数据 self.jsonpage = Jsonpage() # 抓取设置 self.crawler = TBCrawler() # 页面模板解析 self.brand_temp = JHSBrandTEMP() # message self.message = Message() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = 'ikuai' #self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: print '# To dial router exception :', e # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'cat': max_time = Config.json_crawl_retry elif _obj == 'act': max_time = Config.act_crawl_retry elif _obj == 'item': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg # To crawl page def crawlPage(self, _obj, _crawl_type, _key, msg, _val): try: if _obj == 'cat': if _crawl_type == 'home' or _crawl_type == 'homeposition': self.run_cat_home(msg, _val) else: self.run_cat(msg, _val) elif _obj == 'act': self.run_act(msg) elif _obj == 'item': self.run_item(msg, _val) else: 
print '# crawlPage unknown obj = %s' % _obj except Common.InvalidPageException as e: print '# Invalid page exception:',e self.crawlRetry(_key,msg) except Common.DenypageException as e: print '# Deny page exception:',e self.crawlRetry(_key,msg) # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: print '# System busy exception:',e self.crawlRetry(_key,msg) time.sleep(random.uniform(10,30)) except Common.RetryException as e: print '# Retry exception:',e if self.giveup_val: msg['val'] = self.giveup_val self.crawlRetry(_key,msg) time.sleep(random.uniform(20,30)) except Exception as e: print '# exception err:',e self.crawlRetry(_key,msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) Common.traceback_log() def run_cat_home(self, msg, _val): msg_val = msg["val"] _url, refers = msg_val print '# brand home:',_url page = self.crawler.getData(_url, refers) # save to mongo # timeStr_jhstype_webtype_obj_crawltype time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time)) key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type) p_content = '<!-- url=%s --> %s' % (_url,page) self.mongofsAccess.insertJHSPages((key,p_content)) c_url_val_list = self.brand_temp.temp(page) for c_url_val in c_url_val_list: c_url, c_name, c_id = c_url_val self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand)) if self._crawl_type == 'homeposition': top_acts = self.brand_temp.activityTopbrandTemp(page) print top_acts self.save_top_acts(top_acts) def save_top_acts(self, top_acts): if top_acts: for key in top_acts.keys(): act = top_acts[key] c_time, act_id, act_name, act_position, act_url, f_id, f_name, sub_nav = Common.now(), -1, '', -1, '', 0, '', '' 
c_date, c_hour = time.strftime("%Y-%m-%d", time.localtime(c_time)), time.strftime("%H", time.localtime(c_time)) if act.has_key('act_id'): act_id = act["act_id"] if act.has_key('position'): act_position = act["position"] if act.has_key('url'): act_url = act["url"] if act.has_key('datatype'): f_name = act["datatype"] val = (Common.time_s(c_time),act_id,act_name,act_url,Config.JHS_Brand,sub_nav,act_position,f_id,f_name,'',c_date,c_hour) self.mysqlAccess.insertJhsActPosition_hour(val) def run_cat(self, msg, _val): msg_val = msg["val"] c_url, c_id, c_name, refers, pagetype = msg_val print '# category',c_name,c_id if pagetype == Config.JHS_Brand: a_val = (c_id, c_name) self.get_actjson(c_url, refers, a_val, _val, pagetype) elif pagetype == Config.JHS_GroupItem: self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype) else: print '# not get category pagetype...' def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype): a_val = (c_id, c_name) page = self.crawler.getData(c_url, refers) page_val = (page,c_id,c_name) ajax_url_list = self.getAjaxurlList(page_val) if len(ajax_url_list) > 0: # process ajax url list for url_val in ajax_url_list: c_url,c_subNav = url_val self.get_actjson(c_url, refers, a_val, _val, pagetype, c_subNav) def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''): if self._crawl_type == 'position': _val = (pagetype,c_subNav) + _val Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val) if Result_list and len(Result_list) > 0: # parser act result act_valList = self.jsonpage.parser_brandjson(Result_list,_val) if act_valList and len(act_valList) > 0: print '# get brand act num:',len(act_valList) self.items.extend(act_valList) else: print '# not get brandjson parse val list...' 
# get json ajax url def getAjaxurlList(self, page_val): url_list = [] page, c_id, c_name = page_val p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S) i = 0 for a_info in p.finditer(page): c_subNav = '' f_id = a_info.group(1) a_url = a_info.group(2).replace('amp;','') info = a_info.group(3) m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S) if m: c_subNav = m.group(1).strip() if c_subNav == '': m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % f_id, page, flags=re.S) if m: c_subNav = re.sub(r'<.+?>','',m.group(1)) #url_list.append((a_url,refers,a_val)) url_list.append((a_url,c_subNav)) i += 1 return url_list # ACT queue def run_act(self, msg): # 默认数据 msg_val = msg["val"] print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) act_obj = None if self._crawl_type == 'main': act_obj = JHSAct() act_obj.antPageMain(msg_val) elif self._crawl_type == 'check': act_obj = JHSAct() act_obj.antPageCheck(msg_val) elif self._crawl_type == 'position': act_obj = JHSAct() act_obj.antPageParser(msg_val) print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if self._crawl_type == 'position': brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition() if int(brandact_sign) != 3: if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time): print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) elif brandact_status != '' and brandact_status != 'blank': print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) else: act_keys = [self.worker_type, str(act_obj.brandact_id)] prev_act = self.redisAccess.read_jhsact(act_keys) # 是否需要抓取商品 if act_obj and 
act_obj.crawling_confirm != 2: # 保存的活动信息 self.putActDB(act_obj, prev_act) # 活动中的商品 items_list = [] # 只取非俪人购商品 if int(act_obj.brandact_sign) != 3: if act_obj.crawling_confirm == 0: #更新马上开团活动中商品位置 self.update_actItems_position(act_obj) # 多线程抓商品 items_list = self.run_actItems(act_obj, prev_act) else: print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 处理活动信息 #self.procAct(act_obj, prev_act, items_list) # 处理活动redis信息 self.procActRedis(act_obj, prev_act, items_list) #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) else: self.update_startact(act_obj, prev_act) print '# Already start activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) # 更新开团后活动 def update_startact(self, act, prev_act): if act.brandact_endtime and act.brandact_endtime != 0.0: end_time_s = Common.time_s(float(act.brandact_endtime)/1000) if prev_act and end_time_s != prev_act['end_time']: prev_act['end_time'] = end_time_s # redis keys = [self.worker_type, str(act.brandact_id)] self.redisAccess.write_jhsact(keys, prev_act) self.mysqlAccess.updateJhsActEndtime((end_time_s,str(act.brandact_id))) #更新马上开团活动中商品位置 def update_actItems_position(self, act): update_val_list = [] act_id = act.brandact_id for item in act.brandact_itemVal_list: if str(item[7]) != '': update_val_list.append((str(item[7]),str(act_id),item[4])) self.mysqlAccess.updateJhsItemPosition(update_val_list) # 并行获取品牌团商品 def run_actItems(self, act, prev_act): print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 需要抓取的item item_val_list = [] # 过滤已经抓取过的商品ID列表 item_ids = act.brandact_itemids if prev_act: prev_item_ids = prev_act["item_ids"] item_ids = Common.diffSet(item_ids, prev_item_ids) # 如果已经抓取过的活动没有新上线商品,则退出 if len(item_ids) == 0: print '# Activity no new Items' print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', 
time.localtime(time.time())), act.brandact_id, act.brandact_name return None for item in act.brandact_itemVal_list: if str(item[6]) in item_ids or str(item[7]) in item_ids: item_val_list.append(item) else: item_val_list = act.brandact_itemVal_list # 如果活动没有商品, 则退出 if len(item_ids) == 0: print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name) return None print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name # 多线程 控制并发的线程数 if len(item_val_list) > Config.item_max_th: m_itemsObj = JHSItemM('main', Config.item_max_th) else: m_itemsObj = JHSItemM('main', len(item_val_list)) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity find new Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name return item_list # To merge activity def mergeAct(self, act, prev_act): if prev_act: # 合并本次和上次抓取的商品ID列表 prev_item_ids = prev_act["item_ids"] act.brandact_itemids = Common.unionSet(act.brandact_itemids, prev_item_ids) # 取第一次的活动抓取时间 act.crawling_time = Common.str2timestamp(prev_act["crawl_time"]) if not act.brandact_name or act.brandact_name == '': act.brandact_name = prev_act["act_name"] if not act.brandact_url or act.brandact_url == '': act.brandact_url = prev_act["act_url"] if not act.brandact_position or str(act.brandact_position) == '0': act.brandact_position = prev_act["act_position"] if not act.brandact_enterpic_url or act.brandact_enterpic_url == '': 
act.brandact_enterpic_url = prev_act["act_enterpic_url"] if not act.brandact_remindNum or str(act.brandact_remindNum) == '0': act.brandact_remindNum = prev_act["act_remindnum"] if not act.brandact_coupons or act.brandact_coupons == []: act.brandact_coupon = prev_act["act_coupon"] act.brandact_coupons = prev_act["act_coupons"].split(Config.sep) if not act.brandact_starttime or act.brandact_starttime == 0.0: act.brandact_starttime = Common.str2timestamp(prev_act["start_time"]) if not act.brandact_endtime or act.brandact_endtime == 0.0: act.brandact_endtime = Common.str2timestamp(prev_act["end_time"]) if not act.brandact_other_ids or act.brandact_other_ids == '': act.brandact_other_ids = prev_act["_act_ids"] # To put act db def putActDB(self, act, prev_act): # 预热信息 if self._crawl_type == 'main': self.mysqlAccess.insertJhsActComing(act.outSql()) # redis self.mergeAct(act, prev_act) if self._crawl_type == 'main': # mysql if prev_act: print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.updateJhsAct(act.outSqlForUpdate()) else: print '# insert activity, id:%s name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.insertJhsAct(act.outSql()) # mongo # 存网页 _pages = act.outItemPage(self._crawl_type) self.mongofsAccess.insertJHSPages(_pages) # To process activity in redis def procActRedis(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) # item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # redis self.mergeAct(act, prev_act) keys = [self.worker_type, str(act.brandact_id)] val = act.outTupleForRedis() self.redisAccess.write_jhsact(keys, val) # To process activity def procAct(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) 
# item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # 将抓取的活动信息存入redis self.putActDB(act, prev_act) # ITEM queue def run_item(self, msg, _val): # 默认数据 msg_val = msg["val"] brandact_id, brandact_name, item_val_list = msg_val print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name # 多线程 控制并发的线程数 max_th = Config.item_max_th if len(item_val_list) > max_th: m_itemsObj = JHSItemM(self._crawl_type, max_th, _val) else: m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) self.giveup_val = (brandact_id, brandact_name, giveup_items) raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name def process(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) i, M = 0, 20 if _obj == 'cat': M = 10 n = 0 while True: if _crawl_type and _crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type) else: _key = '%s_%s' % (self.jhs_type,_obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) print '# all get num of item in queue:',n break time.sleep(10) continue n += 1 try: self.crawlPage(_obj, _crawl_type, _key, _msg, _val) except Exception as e: print '# exception err in process of JHSWorker:',e,_key,_msg # 删除redis数据库过期活动 def delAct(self, _acts): i = 0 for _act in _acts: keys = [self.worker_type, str(_act[0])] 
item = self.redisAccess.read_jhsact(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的活动 if now_time > end_time: i += 1 self.redisAccess.delete_jhsact(keys) print '# delete acts num:',i def delItem(self, _items): i = 0 for _item in _items: keys = [self.worker_type, str(_item[0])] item = self.redisAccess.read_jhsitem(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的商品 if now_time > end_time: i += 1 self.redisAccess.delete_jhsitem(keys) print '# delete items num:',i # 查找结束的活动 def scanEndActs(self, val): _acts = self.mysqlAccess.selectJhsActEnd(val) print '# end acts num:',len(_acts) # 删除已经结束的活动 self.delAct(_acts) # 查找结束的商品 def scanEndItems(self, val): _items = self.mysqlAccess.selectJhsItemEnd(val) print '# end items num:',len(_items) # 删除已经结束的商品 self.delItem(_items) # acts redis def actsRedis(self): _acts = self.mysqlAccess.selectActsRedisdata() print '# acts num:',len(_acts) i = 0 for _act in _acts: act_id = _act[2] #_itemids = self.mysqlAccess.selectItemsids(str(act_id)) #item_ids = [] #for _itemid in _itemids: # item_ids.append(str(_itemid[0])) # item_ids.append(str(_itemid[1])) #act_val = _act + (item_ids,) #print act_val #keys = [self.worker_type, str(act_id)] #print keys #if self.redisAccess.exist_jhsact(keys): #act_redis = self.redisAccess.read_jhsact(keys) #if len(act_redis) != 15: # print act_redis # i += 1 #print self.redisAccess.read_jhsact(keys) #self.redisAccess.delete_jhsact(keys) #self.redisAccess.write_jhsact(keys, act_val) #i += 1 #break print '# redis acts num:',i # items redis def itemsRedis(self): _items = self.mysqlAccess.selectItemRedisdata() print '# items num:', len(_items) i = 0 #for _item in _items: #msg = self.message.jhsitemMsg(_item) #print msg #keys = [self.worker_type, str(_item[0])] #print keys #if self.redisAccess.exist_jhsitem(keys): #print self.redisAccess.read_jhsitem(keys) #self.redisAccess.delete_jhsitem(keys) 
#self.redisAccess.write_jhsitem(keys, msg) #i += 1 #break print '# redis items num:',i
class JHSGroupItemWorker(): '''A class of JHS group item channel worker''' def __init__(self): # jhs group item type self.worker_type = Config.JHS_GroupItem self.jhs_type = Config.JHS_TYPE # queue type # message self.message = Message() # 获取Json数据 self.jsonpage = Jsonpage() # 抓取设置 self.crawler = TBCrawler() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() # DB # mysql access self.mysqlAccess = MysqlAccess() # redis queue self.redisQueue = RedisQueue() # redis access self.redisAccess = RedisAccess() # mongodb fs access self.mongofsAccess = MongofsAccess() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = 'ikuai' #self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: print '# To dial router exception :', e def push_back_list(self, L, v): L.extend(v) def push_back_val(self, L, v): L.append(v) # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'groupitemcat': max_time = Config.json_crawl_retry elif _obj == 'groupitem': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg def crawlPage(self, _key, msg, _val): try: if self._obj == 'groupitemcat': self.run_category(msg, _val) else: print '# crawlPage unknown obj = %s' % self._obj except Common.InvalidPageException as e: print '# Invalid page exception:',e self.crawlRetry(_key,msg) 
except Common.DenypageException as e: print '# Deny page exception:',e self.crawlRetry(_key,msg) # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: print '# System busy exception:',e self.crawlRetry(_key,msg) time.sleep(random.uniform(10,30)) except Common.RetryException as e: print '# Retry exception:',e if self.giveup_val: msg['val'] = self.giveup_val self.crawlRetry(_key,msg) time.sleep(random.uniform(20,30)) except Exception as e: print '# exception err:',e self.crawlRetry(_key,msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) Common.traceback_log() def run_category(self, msg, _val): category_val = msg["val"] refers = _val c_url,c_name,c_id = category_val print c_url,c_name,c_id page = self.crawler.getData(c_url, refers) page_val = (page,c_name,c_id) ajax_url_list = self.getAjaxurlList(page_val,c_url) if len(ajax_url_list) > 0: self.get_jsonitems(ajax_url_list) # get json ajax url def getAjaxurlList(self, page_val, refers=''): url_list = [] page, c_name, c_id = page_val p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S) i = 0 for a_info in p.finditer(page): c_subNav = c_name a_url = a_info.group(1).replace('amp;','') info = a_info.group(2) m = re.search(r'<span class="l-f-tbox">(.+?)</span>',info,flags=re.S) if m: c_subNav = m.group(1).strip() a_val = (c_id,c_name,refers,c_subNav) url_list.append((a_url,refers,a_val)) i += 1 return url_list # get item json list in category page from ajax url def get_jsonitems(self, ajax_url_list): # today all items val todayall_item_val = [] # other sub nav items val item_list = [] # process ajax url list item_json_index = 0 # mongo json pages cat_pages = {} for a_url in ajax_url_list: # get json from ajax url Result_list = 
self.jsonpage.get_json([a_url]) # mongo page json _url,_refers,_val = a_url _c_id = _val[0] time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time)) # timeStr_jhstype_webtype_itemgroupcat_catid key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id)) cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list)) if Result_list and len(Result_list) > 0: item_result_valList = self.jsonpage.parser_itemjson(Result_list) if item_result_valList and len(item_result_valList) > 0: item_json_index += 1 # the first item list is all online items if item_json_index == 1: if len(item_result_valList) > 0: print '# all online items.....' todayall_item_val = item_result_valList else: self.push_back_list(item_list, item_result_valList) else: print '# not get itemjson parse val list...' if len(item_list) > 0: self.parseItems(item_list) # cat pages json for key in cat_pages.keys(): _pages = (key,cat_pages[key]) self.mongofsAccess.insertJHSPages(_pages) # 解析从接口中获取的商品数据 def parseItems(self, item_list): print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 附加信息 a_val = (self.begin_time,) # 多线程 控制并发的线程数 max_th = Config.item_max_th if len(item_list) > max_th: m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val) else: m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val) m_itemsObj.createthread() m_itemsObj.putItems(item_list) m_itemsObj.run() _items = m_itemsObj.items self.push_back_list(self.items,_items) print '# queue item num:',len(self.items) print '# parse item num:',len(_items) print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) def process(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) if _obj == 'groupitem': self.processMulti(_val) else: self.processOne(_val) def processOne(self, _val=None): i, M = 0, 10 n = 0 while True: try: if self._crawl_type and self._crawl_type != '': _key = '%s_%s_%s' % 
(self.jhs_type, self._obj, self._crawl_type) else: _key = '%s_%s' % (self.jhs_type, self._obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: print '# all get catQ item num:',n print '# not get catQ of key:',_key break time.sleep(10) continue n += 1 self.crawlPage(_key, _msg, _val) except Exception as e: print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg def processMulti(self, _val=None): if self._crawl_type and self._crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type) else: _key = '%s_%s' % (self.jhs_type, self._obj) try: self.crawlPageMulti(_key, _val) except Exception as e: print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key) # To crawl page def crawlPageMulti(self, _key, _val): if self._obj == 'groupitem': self.run_groupitem(_key, _val) else: print '# crawlPageMulti unknown obj = %s' % self._obj def run_groupitem(self, _key, _val): m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val) m_itemQ.createthread() m_itemQ.run() item_list = m_itemQ.items print '# crawl Items num: %d' % len(item_list) # 删除redis数据库过期商品 def delItem(self, _items): for _item in _items: keys = [self.worker_type, _item["item_juId"]] item = self.redisAccess.read_jhsitem(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.begin_time) # 删除过期的商品 if now_time > end_time: self.redisAccess.delete_jhsitem(keys) # 把商品信息存入redis数据库中 def putItemDB(self, _items): for _item in _items: # 忽略已经存在的商品ID keys = [self.worker_type, _item["item_juId"]] if self.redisAccess.exist_jhsitem(keys): continue # 将商品基础数据写入redis item_val = self.message.itemInfo(_item["r_val"]) val = self.message.itemMsg(item_val) self.redisAccess.write_jhsitem(keys, val) # 更新商品信息 def updateItem(self, _item): keys = [self.worker_type, _item["item_juId"]] item = self.redisAccess.read_jhsitem(keys) if item: item_val = self.message.itemParseInfo(_item["r_val"]) c = False if item["start_time"] != 
item_val["start_time"]: item["start_time"] = item_val["start_time"] c = True if item["end_time"] != item_val["end_time"]: item["end_time"] = item_val["end_time"] c = True if c: self.redisAccess.write_jhsitem(keys, item) # 查找新商品 def selectNewItems(self, _items): new_items = [] for _item in _items: keys = [self.worker_type, _item["item_juId"]] if self.redisAccess.exist_jhsitem(keys): self.updateItem(_item) continue new_items.append(_item["val"]) return new_items def scanEndItems(self): val = (Common.time_s(self.crawling_time),) _items = self.mysqlAccess.selectJhsGroupItemEnd(val) end_items = [] # 遍历商品 for _item in _items: item_juid = _item[0] end_items.append({"item_juId":str(item_juid)}) print '# del item nums:',len(end_items) # 删除已经结束的商品 self.delItem(end_items) def scanEndItemsLasthour(self): val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1)) _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val) end_items = [] # 遍历商品 for _item in _items: item_juid = _item[0] end_items.append({"item_juId":str(item_juid)}) print '# del item nums for last hour end:',len(end_items) # 删除已经结束的商品 self.delItem(end_items) def scanAliveItems(self): # 到结束时间后的一个小时 val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1)) # 查找已经开团但是没有结束的商品 _items = self.mysqlAccess.selectJhsGroupItemAlive(val) print "# hour all item nums:",len(_items) return _items def scanNotEndItems(self): val = (Common.time_s(self.crawling_time),) # 查找没有结束的商品 _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val) i = 1 for _item in _items: print i item_juid = str(_item[1]) keys = [self.worker_type, item_juid] item = self.redisAccess.read_jhsitem(keys) print item #_new_item = 
{"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]} #self.redisAccess.write_jhsitem(keys, _new_item) i += 1 def scanCategories(self): category_list = self.mysqlAccess.selectJhsGroupItemCategory() return category_list
class TCWorker:
    """Worker that drains a TC redis queue and crawls the channels it names.

    ``process`` is the entry point: it polls the queue whose key is built from
    ``Config.TC_TYPE``, the object kind and the crawl type, and hands every
    message to ``crawlPage``.

    NOTE(review): another ``class TCWorker`` appears later in this file; if
    both live in the same module the later definition replaces this one --
    confirm which copy is meant to be live.
    """

    def __init__(self):
        """Create the storage and crawler handles and record the start time."""
        # tc spot type
        self.worker_type = Config.TC_Spot

        # DB
        self.tc_type = Config.TC_TYPE  # queue type (prefix of the queue key)
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # crawl settings
        self.crawler = TCCrawler()

        # message
        self.message = Message()

        # crawl time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        """Reset the per-run state and make sure logging is configured.

        _obj        -- object kind being crawled (e.g. "channel")
        _crawl_type -- crawl type; also used in the queue key and log name
        """
        self._obj = _obj
        self._crawl_type = _crawl_type

        # dial client, local ip and router tag used by dialRouter()
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._router_tag = "ikuai"
        # self._router_tag = 'tpent'

        # collected items / given-up items / replacement val for given-up msgs
        self.items = []
        self.giveup_items = []
        self.giveup_val = None

        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        """Configure the logger once; do nothing if one is already set up."""
        if Logger.logger:
            return

        # Log files are stamped with the hour the worker started.
        hour_stamp = time.strftime("%Y%m%d%H", time.localtime(self.begin_time))
        if _obj == "channel":
            loggername = "channel"
            filename = "add_%s_%s" % (_crawl_type, hour_stamp)
        else:
            loggername = "other"
            filename = "crawler_%s" % hour_stamp
        Logger.config_logging(loggername, filename)

    # To dial router
    def dialRouter(self, _type, _obj):
        """Send a dial request to the router; failures are logged, not raised."""
        try:
            request = ("%s_%s" % (_type, _obj), self._ip, self._router_tag)
            self.dial_client.send(request)
        except Exception as e:
            Common.log("# To dial router exception: %s" % e)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        """Bump ``msg['retry']`` and re-queue it unless the limit is reached.

        The limit depends on ``msg['obj']``: "channel" and "item" have their
        own Config values, anything else uses ``Config.crawl_retry``.
        A falsy ``msg`` is ignored.
        """
        if not msg:
            return

        msg["retry"] += 1
        attempts = msg["retry"]
        kind = msg["obj"]

        limit = Config.crawl_retry
        if kind == "channel":
            limit = Config.channel_crawl_retry
        elif kind == "item":
            limit = Config.item_crawl_retry

        if attempts >= limit:
            # self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)
            return
        self.redisQueue.put_q(_key, msg)

    def _redial_channel(self, pause_on_failure):
        """Ask the router to redial; optionally sleep if the request raises."""
        try:
            self.dialRouter(4, "chn")
        except Exception as dial_err:
            Common.log("# DailClient Exception err: %s" % dial_err)
            if pause_on_failure:
                time.sleep(random.uniform(10, 30))

    # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        """Crawl one queue message and translate failures into retries.

        Only ``_obj == "channel"`` is handled; other kinds are just logged.
        Every handled exception re-queues the message through ``crawlRetry``;
        some additionally redial the router and/or sleep before returning.
        """
        try:
            if _obj != "channel":
                Common.log("# crawlPage unknown obj = %s" % _obj)
            else:
                self.run_channel(msg)
        except Common.InvalidPageException as e:
            Common.log("# Invalid page exception: %s" % e)
            self.crawlRetry(_key, msg)
        except Common.DenypageException as e:
            Common.log("# Deny page exception: %s" % e)
            self.crawlRetry(_key, msg)
            # redial, then back off
            self._redial_channel(True)
            time.sleep(random.uniform(10, 30))
        except Common.SystemBusyException as e:
            Common.log("# System busy exception: %s" % e)
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(10, 30))
        except Common.RetryException as e:
            Common.log("# Retry exception: %s" % e)
            if self.giveup_val:
                msg["val"] = self.giveup_val
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(20, 30))
        except Exception as e:
            Common.log("# exception err: %s" % e)
            self.crawlRetry(_key, msg)
            Common.traceback_log()
            # A read timeout is retried without redialing the router.
            if "Read timed out" not in str(e):
                self._redial_channel(False)
            # NOTE(review): nesting of this back-off was reconstructed from
            # whitespace-collapsed source -- confirm it should run for every
            # generic failure.
            time.sleep(random.uniform(10, 30))

    def run_channel(self, msg):
        """Parse the channel page named by ``msg['val']`` and crawl its items."""
        channel_val = msg["val"]
        channel = Channel()
        channel.antPage(channel_val)
        # self.items = c.channel_items
        self.run_items(channel)

    # crawl the channel's items in parallel
    def run_items(self, chan):
        """Crawl ``chan.channel_items`` with a bounded pool of item threads.

        Raises ``Common.RetryException`` when the pool reports given-up items.
        """
        Common.log("# Items start, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))
        Common.log("# Items num: %d" % len(chan.channel_items))

        # Cap the number of concurrent threads at Config.item_max_th.
        thread_count = min(len(chan.channel_items), Config.item_max_th)
        pool = TCItemM(self._crawl_type, thread_count)
        pool.createthread()
        pool.putItems(chan.channel_items)
        pool.run()

        crawled = pool.items
        Common.log("# find Items num: %d" % len(chan.channel_items))
        Common.log("# crawl Items num: %d" % len(crawled))

        giveup_count = len(pool.giveup_items)
        if giveup_count > 0:
            Common.log("# giveup Items num: %d" % giveup_count)
            raise Common.RetryException("# run_items: some items retry more than max times..")
        Common.log("# Items end, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        """Entry point; currently delegates to the single-threaded loop."""
        # self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def _queue_key(self, _obj, _crawl_type):
        """Build the redis queue key; the crawl type is appended when set."""
        if _crawl_type:
            return "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
        return "%s_%s" % (self.tc_type, _obj)

    def processOne(self, _obj, _crawl_type, _val=None):
        """Poll the queue and crawl messages one by one until it stays empty.

        The loop stops once the number of empty polls exceeds the limit
        (2 for "channel", 20 otherwise); it sleeps 10s between empty polls.
        """
        self.init_crawl(_obj, _crawl_type)

        max_empty_polls = 2 if _obj == "channel" else 20
        empty_polls = 0
        handled = 0
        while True:
            _key = self._queue_key(_obj, _crawl_type)
            _msg = self.redisQueue.get_q(_key)
            if _msg:
                handled += 1
                try:
                    self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
                except Exception as e:
                    Common.log("# exception err in process of TCWorker: %s , key: %s" % (e, _key))
                    Common.log(_msg)
                continue

            # queue is empty
            empty_polls += 1
            if empty_polls > max_empty_polls:
                Common.log("# not get queue of key: %s" % _key)
                Common.log("# all get num of item in queue: %d" % handled)
                break
            time.sleep(10)

    def processMulti(self, _obj, _crawl_type, _val=None):
        """Drain the queue with the multi-threaded redis item crawler."""
        self.init_crawl(_obj, _crawl_type)
        _key = self._queue_key(_obj, _crawl_type)
        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log("# exception err in processMulti of TCWorker: %s, key: %s" % (e, _key))

    # To crawl page
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        """Thin wrapper around ``run_multiitems``."""
        self.run_multiitems(_key, _val)
        # Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        """Run a 20-thread ``TCItemRedisM`` on ``_key`` and log the item count."""
        pool = TCItemRedisM(_key, self._crawl_type, 20, _val)
        pool.createthread()
        pool.run()
        Common.log("# crawl Items num: %d" % len(pool.items))
class TCItemRedisM(MyThread):
    '''Thread pool that crawls TC items read from a redis queue.

    ``crawl`` is the queue-draining loop (presumably executed by the MyThread
    workers -- confirm against MyThread). Crawled rows are collected in
    ``self.items`` and written to MySQL in batches.
    '''

    def __init__(self, key, q_type, thread_num=10, a_val=None):
        '''Set up the pool, the storage handles and the router-dial client.

        key        -- redis queue key to consume
        q_type     -- queue type; ``crawl`` only processes 'spot'
        thread_num -- forwarded to ``MyThread.__init__``
        a_val      -- optional value appended to each message's "val"
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # thread lock guarding the shared result lists
        self.mutex = threading.Lock()

        self.tc_type = Config.TC_TYPE  # tc type
        #self.item_type = q_type # item queue type

        # db
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # tc queue type
        self.tc_queue_type = q_type
        self._key = key  # redis queue key

        # appendix val
        self.a_val = a_val

        # return items
        self.items = []

        # dial client, local ip and router tag used by dialRouter()
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._tag = 'ikuai'
        #self._tag = 'tpent'

        # give up item, retry too many times
        self.giveup_items = []

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a dial request to the router; failures are logged, not raised.'''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        '''Append ``v`` to ``L`` while holding the pool mutex.'''
        # ``with`` releases the lock even if append() raises; the previous
        # acquire()/release() pair would have left it held.
        with self.mutex:
            L.append(v)

    def crawlRetry(self, _key, msg):
        '''Bump ``msg['retry']`` and re-queue it unless the limit is reached.

        "item" messages use ``Config.item_crawl_retry``; everything else uses
        ``Config.crawl_retry``. A falsy ``msg`` is ignored.
        '''
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    # insert item
    def insertIteminfo(self, iteminfosql_list, f=False):
        '''Write the batched item rows to MySQL when the batch is due.

        The batch is due when ``f`` is true (forced flush) or it has reached
        ``Config.item_max_arg`` rows. Returns True when rows were written, so
        the caller knows to start a new batch; False otherwise.
        '''
        # NOTE(review): the nesting of ``return True`` was reconstructed from
        # whitespace-collapsed source; it only matters for an empty batch.
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
                return True
        return False

    # item sql list
    def crawl(self):
        '''Drain the redis queue, crawling and storing one item per message.

        The loop ends after more than two empty polls (10s apart). Messages
        that fail with an invalid page or an unexpected error are re-queued
        through ``crawlRetry``.
        '''
        _iteminfosql_list = []
        i, M = 0, 2  # empty polls seen so far / allowed
        n = 0  # messages taken from the queue
        while True:
            # Reset on every pass: if get_q() itself raises, the handlers
            # below must not re-queue the previous (already handled) message
            # nor trip over an unbound name on the first pass.
            _data = None
            try:
                _data = self.redisQueue.get_q(self._key)
                if not _data:
                    # Queue is empty: flush what is buffered, then either
                    # wait for more work or stop.
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1

                # Only the 'spot' queue type is processed here.
                if self.tc_queue_type != 'spot':
                    continue

                # item instance
                item = Item()
                #_val = _data[1]
                _val = _data["val"]
                if self.a_val:
                    _val = _val + self.a_val
                item.antPage(_val)

                # collect
                self.push_back(self.items, item.outSql())

                # store: tickets immediately, the item row in batches.
                # Was ``len() > 0`` -- a TypeError for every item that had
                # tickets, which sent the message down the retry path.
                tickets = item.item_tickets
                if tickets:
                    self.mysqlAccess.insertTCTicket(tickets)
                iteminfoSql = item.outSql()
                _iteminfosql_list.append(iteminfoSql)
                if self.insertIteminfo(_iteminfosql_list):
                    _iteminfosql_list = []

                # Archiving the raw pages to mongofs is currently disabled:
                #_pages = item.outItemPage('item', self.tc_queue_type)
                #self.mongofsAccess.insertTCPages(_pages)

                # delay between items
                time.sleep(1)
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(self._key, _data)
                # A read timeout is retried without redialing the router.
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as dial_err:
                        Common.log('# DailClient Exception err: %s' % dial_err)
                        time.sleep(10)
                # NOTE(review): nesting of this back-off was reconstructed
                # from whitespace-collapsed source -- confirm it should run
                # for every generic failure.
                time.sleep(random.uniform(10,30))
class JHSGroupItemQM(MyThread): '''A class of jhs Item redis queue''' def __init__(self, itemtype, q_type, thread_num=10, a_val=None): # parent construct MyThread.__init__(self, thread_num) # thread lock self.mutex = threading.Lock() self.jhs_type = Config.JHS_TYPE # jhs type self.item_type = itemtype # item type # db self.mysqlAccess = MysqlAccess() # mysql access self.redisQueue = RedisQueue() # redis queue self.mongofsAccess = MongofsAccess() # mongodb fs access # jhs queue type self.jhs_queue_type = q_type # h:每小时 self._key = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type) # appendix val self.a_val = a_val # activity items self.items = [] # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._tag = 'ikuai' #self._tag = 'tpent' # give up item, retry too many times self.giveup_items = [] # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._tag)) except Exception as e: print '# To dial router exception :', e def push_back(self, L, v): if self.mutex.acquire(1): L.append(v) self.mutex.release() # To crawl retry def crawlRetry(self, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'groupitemcat': max_time = Config.json_crawl_retry elif _obj == 'groupitem': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(self._key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg # insert item info def insertIteminfo(self, iteminfosql_list, f=False): if f or len(iteminfosql_list) >= Config.item_max_arg: if len(iteminfosql_list) > 0: self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list) #print '# insert data to database' return True return False # insert item hour def insertItemhour(self, itemhoursql_list, f=False): if f or len(itemhoursql_list) >= Config.item_max_arg: if 
len(itemhoursql_list) > 0: self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list) #print '# insert hour data to database' return True return False # item sql list def crawl(self): _iteminfosql_list = [] _itemhoursql_list = [] i, M = 0, 10 n = 0 while True: try: _msg = self.redisQueue.get_q(self._key) # 队列为空 if not _msg: # 队列为空,退出 #print '# queue is empty', e # info self.insertIteminfo(_iteminfosql_list, True) _iteminfosql_list = [] # hour self.insertItemhour(_itemhoursql_list, True) _itemhoursql_list = [] i += 1 if i > M: print '# all get itemQ item num:',n print '# not get itemQ of key:',self._key break time.sleep(10) continue n += 1 item = None crawl_type = '' if self.jhs_queue_type == 'hour': # 每小时一次商品实例 item = JHSItem() _val = _msg["val"] if self.a_val: _val = _val + self.a_val item.antPageGroupItemHour(_val) crawl_type = 'groupitem' # 汇聚 self.push_back(self.items, item.outTupleGroupItemHour()) # 入库 update_Sql,hourSql = item.outTupleGroupItemHour() if update_Sql: self.mysqlAccess.updateJhsGroupItem(update_Sql) _itemhoursql_list.append(hourSql) if self.insertItemhour(_itemhoursql_list): _itemhoursql_list = [] elif self.jhs_queue_type == 'new': # 商品信息 item = JHSItem() _val = _msg["val"] if self.a_val: _val = _val + self.a_val item.antPageGroupItem(_val) crawl_type = 'groupitemnew' # 汇聚 self.push_back(self.items, item.outTupleGroupItem()) # 入库 iteminfoSql = item.outTupleGroupItem() _iteminfosql_list.append(iteminfoSql) if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = [] else: continue # 存网页 if item and crawl_type != '': _pages = item.outItemPage(crawl_type) self.mongofsAccess.insertJHSPages(_pages) # 延时 time.sleep(1) except Common.NoItemException as e: print 'Not item exception :', e except Common.NoPageException as e: print 'Not page exception :', e except Common.InvalidPageException as e: self.crawlRetry(_msg) print 'Invalid page exception :', e except Exception as e: print 'Unknown exception crawl item :', e Common.traceback_log() 
self.crawlRetry(_msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'item') except Exception as e: print '# DailClient Exception err:', e time.sleep(10) time.sleep(random.uniform(10,30))
class TCWorker():
    '''Worker that drains a TC redis queue and crawls the channels it names.

    ``process`` is the entry point: it polls the queue whose key is built from
    ``Config.TC_TYPE``, the object kind and the crawl type, and hands every
    message to ``crawlPage``.

    NOTE(review): an earlier ``class TCWorker`` appears in this file as well;
    if both live in the same module this later definition is the one the name
    resolves to -- confirm the duplicate is intentional.
    '''

    def __init__(self):
        '''Create the storage and crawler handles and record the start time.'''
        # tc spot type
        self.worker_type = Config.TC_Spot

        # DB
        self.tc_type = Config.TC_TYPE  # queue type (prefix of the queue key)
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # crawl settings
        self.crawler = TCCrawler()

        # message
        self.message = Message()

        # crawl time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        '''Reset the per-run state and make sure logging is configured.

        _obj        -- object kind being crawled (e.g. 'channel')
        _crawl_type -- crawl type; also used in the queue key and log name
        '''
        self._obj = _obj
        self._crawl_type = _crawl_type

        # dial client, local ip and router tag used by dialRouter()
        self.dial_client = DialClient()
        self._ip = Common.local_ip()
        self._router_tag = 'ikuai'
        #self._router_tag = 'tpent'

        # collected items / given-up items / replacement val for given-up msgs
        self.items = []
        self.giveup_items = []
        self.giveup_val = None

        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        '''Configure the logger once; do nothing if one is already set up.'''
        if Logger.logger:
            return

        # Log files are stamped with the hour the worker started.
        hour_stamp = time.strftime("%Y%m%d%H", time.localtime(self.begin_time))
        if _obj == 'channel':
            loggername = 'channel'
            filename = 'add_%s_%s' % (_crawl_type, hour_stamp)
        else:
            loggername = 'other'
            filename = 'crawler_%s' % hour_stamp
        Logger.config_logging(loggername, filename)

    # To dial router
    def dialRouter(self, _type, _obj):
        '''Send a dial request to the router; failures are logged, not raised.'''
        try:
            request = ('%s_%s' % (_type, _obj), self._ip, self._router_tag)
            self.dial_client.send(request)
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        '''Bump ``msg['retry']`` and re-queue it unless the limit is reached.

        The limit depends on ``msg['obj']``: 'channel' and 'item' have their
        own Config values, anything else uses ``Config.crawl_retry``.
        A falsy ``msg`` is ignored.
        '''
        if not msg:
            return

        msg['retry'] += 1
        attempts = msg['retry']
        kind = msg["obj"]

        limit = Config.crawl_retry
        if kind == 'channel':
            limit = Config.channel_crawl_retry
        elif kind == 'item':
            limit = Config.item_crawl_retry

        if attempts >= limit:
            #self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)
            return
        self.redisQueue.put_q(_key, msg)

    def _redial_channel(self, pause_on_failure):
        '''Ask the router to redial; optionally sleep if the request raises.'''
        try:
            self.dialRouter(4, 'chn')
        except Exception as dial_err:
            Common.log('# DailClient Exception err: %s' % dial_err)
            if pause_on_failure:
                time.sleep(random.uniform(10,30))

    # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        '''Crawl one queue message and translate failures into retries.

        Only ``_obj == 'channel'`` is handled; other kinds are just logged.
        Every handled exception re-queues the message through ``crawlRetry``;
        some additionally redial the router and/or sleep before returning.
        '''
        try:
            if _obj != 'channel':
                Common.log('# crawlPage unknown obj = %s' % _obj)
            else:
                self.run_channel(msg)
        except Common.InvalidPageException as e:
            Common.log('# Invalid page exception: %s' % e)
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            Common.log('# Deny page exception: %s' % e)
            self.crawlRetry(_key,msg)
            # redial, then back off
            self._redial_channel(True)
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            Common.log('# System busy exception: %s' % e)
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            Common.log('# Retry exception: %s' % e)
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            Common.log('# exception err: %s' % e)
            self.crawlRetry(_key,msg)
            Common.traceback_log()
            # A read timeout is retried without redialing the router.
            if 'Read timed out' not in str(e):
                self._redial_channel(False)
            # NOTE(review): nesting of this back-off was reconstructed from
            # whitespace-collapsed source -- confirm it should run for every
            # generic failure.
            time.sleep(random.uniform(10,30))

    def run_channel(self, msg):
        '''Parse the channel page named by ``msg['val']`` and crawl its items.'''
        channel_val = msg["val"]
        channel = Channel()
        channel.antPage(channel_val)
        #self.items = c.channel_items
        self.run_items(channel)

    # crawl the channel's items in parallel
    def run_items(self, chan):
        '''Crawl ``chan.channel_items`` with a bounded pool of item threads.

        Raises ``Common.RetryException`` when the pool reports given-up items.
        '''
        Common.log('# Items start, channel_id:%s, channel_name:%s' % (str(chan.channel_id), chan.channel_name))
        Common.log('# Items num: %d' % len(chan.channel_items))

        # Cap the number of concurrent threads at Config.item_max_th.
        thread_count = min(len(chan.channel_items), Config.item_max_th)
        pool = TCItemM(self._crawl_type, thread_count)
        pool.createthread()
        pool.putItems(chan.channel_items)
        pool.run()

        crawled = pool.items
        Common.log('# find Items num: %d' % len(chan.channel_items))
        Common.log('# crawl Items num: %d' % len(crawled))

        giveup_count = len(pool.giveup_items)
        if giveup_count > 0:
            Common.log('# giveup Items num: %d' % giveup_count)
            raise Common.RetryException('# run_items: some items retry more than max times..')
        Common.log('# Items end, channel_id:%s, channel_name:%s' % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        '''Entry point; currently delegates to the single-threaded loop.'''
        #self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def _queue_key(self, _obj, _crawl_type):
        '''Build the redis queue key; the crawl type is appended when set.'''
        if _crawl_type:
            return '%s_%s_%s' % (self.tc_type,_obj,_crawl_type)
        return '%s_%s' % (self.tc_type,_obj)

    def processOne(self, _obj, _crawl_type, _val=None):
        '''Poll the queue and crawl messages one by one until it stays empty.

        The loop stops once the number of empty polls exceeds the limit
        (2 for 'channel', 20 otherwise); it sleeps 10s between empty polls.
        '''
        self.init_crawl(_obj, _crawl_type)

        max_empty_polls = 2 if _obj == 'channel' else 20
        empty_polls = 0
        handled = 0
        while True:
            _key = self._queue_key(_obj, _crawl_type)
            _msg = self.redisQueue.get_q(_key)
            if _msg:
                handled += 1
                try:
                    self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
                except Exception as e:
                    Common.log('# exception err in process of TCWorker: %s , key: %s' % (e,_key))
                    Common.log(_msg)
                continue

            # queue is empty
            empty_polls += 1
            if empty_polls > max_empty_polls:
                Common.log('# not get queue of key: %s' % _key)
                Common.log('# all get num of item in queue: %d' % handled)
                break
            time.sleep(10)

    def processMulti(self, _obj, _crawl_type, _val=None):
        '''Drain the queue with the multi-threaded redis item crawler.'''
        self.init_crawl(_obj, _crawl_type)
        _key = self._queue_key(_obj, _crawl_type)
        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log('# exception err in processMulti of TCWorker: %s, key: %s' % (e,_key))

    # To crawl page
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        '''Thin wrapper around ``run_multiitems``.'''
        self.run_multiitems(_key, _val)
        #Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        '''Run a 20-thread ``TCItemRedisM`` on ``_key`` and log the item count.'''
        pool = TCItemRedisM(_key, self._crawl_type, 20, _val)
        pool.createthread()
        pool.run()
        Common.log('# crawl Items num: %d' % len(pool.items))