def __init__(self):
    """Set up crawler, redial client and local network identity."""
    self.crawler = TCCrawler()        # crawler settings
    self.dial_client = DialClient()   # client used to ask the router to redial
    self._ip = Common.local_ip()      # this host's local ip
    self._tag = 'ikuai'               # router tag
    self.w_time = 1                   # wait time (seconds)
def __init__(self, jhs_type, thread_num=15, a_val=None):
    """Thread-manager constructor for jhs activity crawling.

    jhs_type queue kinds: 1 = brand channel about to launch,
    2 = re-check activities not yet finished today, 3 = newly added activities.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self.a_val = a_val                    # appendix val
    self.jhs_type = jhs_type              # jhs queue type (see docstring)
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
def __init__(self, jhs_type, thread_num=10, a_val=None):
    """Thread-manager constructor.

    jhs_type queue kinds: h = hourly crawl, i = item info detail.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self.jhs_type = jhs_type              # jhs queue type (see docstring)
    self.a_val = a_val                    # appendix val
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def __init__(self):
    """Set up crawler, redial client and local network identity."""
    self.crawler = TBCrawler()        # crawler settings
    self.dial_client = DialClient()   # client used to ask the router to redial
    self._ip = Common.local_ip()      # this host's local ip
    self._tag = 'ikuai'               # router tag
    self.w_time = 1                   # wait time (seconds)
def __init__(self):
    """Set up crawler, a value queue, and the redial client."""
    self.crawler = TBCrawler()        # crawler settings
    self.val_queue = Queue.Queue()    # in-process value queue
    self.dial_client = DialClient()   # client used to ask the router to redial
    self._ip = Common.local_ip()      # this host's local ip
    self._tag = 'ikuai'               # router tag
def init_crawl(self, _obj, _crawl_type):
    """Reset per-run crawl state for the (_obj, _crawl_type) pair and open the log."""
    self._obj = _obj
    self._crawl_type = _crawl_type
    self.dial_client = DialClient()     # dial client
    self._ip = Common.local_ip()        # local ip
    self._router_tag = 'ikuai'          # router tag
    #self._router_tag = 'tpent'
    self.items = []                     # collected items
    self.giveup_items = []              # items given up after retries
    self.giveup_val = None              # payload carried by a giveup retry message
    self.init_log(_obj, _crawl_type)
def __init__(self, _q_type, thread_num=10, a_val=None):
    """Thread-manager constructor for xc spot crawling.

    _q_type queue kinds: new = newly added items.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.worker_type = Config.XC_Spot
    self.message = Message()              # message builder
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self._q_type = _q_type                # xc queue type (see docstring)
    self.a_val = a_val                    # appendix val
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def __init__(self, key, q_type, thread_num=10, a_val=None):
    """Redis-queue thread-manager constructor for xc crawling."""
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.xc_type = Config.XC_TYPE         # xc type
    #self.item_type = q_type              # item queue type
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.redisQueue = RedisQueue()        # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self.xc_queue_type = q_type           # xc queue type: new...
    self._key = key                       # redis queue key
    self.a_val = a_val                    # appendix val
    self.items = []                       # return items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def init_crawl(self, _obj, _crawl_type):
    """Reset per-run crawl state for the (_obj, _crawl_type) pair."""
    self._obj = _obj
    self._crawl_type = _crawl_type
    self.dial_client = DialClient()     # dial client
    self._ip = Common.local_ip()        # local ip
    self._router_tag = 'ikuai'          # router tag
    #self._router_tag = 'tpent'
    self.items = []                     # collected items
    self.giveup_items = []              # items given up after retries
    self.giveup_val = None              # payload carried by a giveup retry message
def __init__(self, _q_type, thread_num=10, a_val=None):
    """Thread-manager constructor for jhs brand crawling.

    _q_type queue kinds: main = newly added items, day = once-a-day items,
    hour = hourly items, update = refresh.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.worker_type = Config.JHS_Brand
    self.message = Message()              # message builder
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.redisAccess = RedisAccess()      # redis db
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self._q_type = _q_type                # jhs queue type (see docstring)
    self.a_val = a_val                    # appendix val
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def __init__(self, _q_type, thread_num=10, a_val=None):
    """Thread-manager constructor for tc spot crawling.

    _q_type queue kinds: new = newly added items.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.worker_type = Config.TC_Spot
    self.message = Message()              # message builder
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self._q_type = _q_type                # tc queue type (see docstring)
    self.a_val = a_val                    # appendix val
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
    """Redis-queue thread-manager constructor for jhs item crawling.

    q_type queue kinds: h = hourly. The redis key is built from
    jhs type, item type and queue type.
    """
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.jhs_type = Config.JHS_TYPE       # jhs type
    self.item_type = itemtype             # item type
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.redisQueue = RedisQueue()        # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self.jhs_queue_type = q_type          # jhs queue type (see docstring)
    self._key = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type)
    self.a_val = a_val                    # appendix val
    self.items = []                       # activity items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
def __init__(self, key, q_type, thread_num=10, a_val=None):
    """Redis-queue thread-manager constructor for tc crawling."""
    MyThread.__init__(self, thread_num)   # parent construct
    self.mutex = threading.Lock()         # thread lock
    self.tc_type = Config.TC_TYPE         # tc type
    #self.item_type = q_type              # item queue type
    self.mysqlAccess = MysqlAccess()      # mysql access
    self.redisQueue = RedisQueue()        # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    self.tc_queue_type = q_type           # tc queue type: new...
    self._key = key                       # redis queue key
    self.a_val = a_val                    # appendix val
    self.items = []                       # return items
    self.dial_client = DialClient()       # dial client
    self._ip = Common.local_ip()          # local ip
    self._tag = 'ikuai'                   # router tag
    #self._tag = 'tpent'
    self.giveup_items = []                # items given up after too many retries
class JHSWorker(): '''A class of jhs worker''' def __init__(self): # jhs brand type self.worker_type = Config.JHS_Brand # DB self.jhs_type = Config.JHS_TYPE # queue type self.mysqlAccess = MysqlAccess() # mysql access self.redisQueue = RedisQueue() # redis queue self.redisAccess = RedisAccess() # redis db self.mongofsAccess = MongofsAccess() # mongodb fs access # 获取Json数据 self.jsonpage = Jsonpage() # 抓取设置 self.crawler = TBCrawler() # 页面模板解析 self.brand_temp = JHSBrandTEMP() # message self.message = Message() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = 'ikuai' #self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: print '# To dial router exception :', e # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'cat': max_time = Config.json_crawl_retry elif _obj == 'act': max_time = Config.act_crawl_retry elif _obj == 'item': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg # To crawl page def crawlPage(self, _obj, _crawl_type, _key, msg, _val): try: if _obj == 'cat': if _crawl_type == 'home' or _crawl_type == 'homeposition': self.run_cat_home(msg, _val) else: self.run_cat(msg, _val) elif _obj == 'act': self.run_act(msg) elif _obj == 'item': self.run_item(msg, _val) else: 
print '# crawlPage unknown obj = %s' % _obj except Common.InvalidPageException as e: print '# Invalid page exception:',e self.crawlRetry(_key,msg) except Common.DenypageException as e: print '# Deny page exception:',e self.crawlRetry(_key,msg) # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: print '# System busy exception:',e self.crawlRetry(_key,msg) time.sleep(random.uniform(10,30)) except Common.RetryException as e: print '# Retry exception:',e if self.giveup_val: msg['val'] = self.giveup_val self.crawlRetry(_key,msg) time.sleep(random.uniform(20,30)) except Exception as e: print '# exception err:',e self.crawlRetry(_key,msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) Common.traceback_log() def run_cat_home(self, msg, _val): msg_val = msg["val"] _url, refers = msg_val print '# brand home:',_url page = self.crawler.getData(_url, refers) # save to mongo # timeStr_jhstype_webtype_obj_crawltype time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time)) key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type) p_content = '<!-- url=%s --> %s' % (_url,page) self.mongofsAccess.insertJHSPages((key,p_content)) c_url_val_list = self.brand_temp.temp(page) for c_url_val in c_url_val_list: c_url, c_name, c_id = c_url_val self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand)) if self._crawl_type == 'homeposition': top_acts = self.brand_temp.activityTopbrandTemp(page) print top_acts self.save_top_acts(top_acts) def save_top_acts(self, top_acts): if top_acts: for key in top_acts.keys(): act = top_acts[key] c_time, act_id, act_name, act_position, act_url, f_id, f_name, sub_nav = Common.now(), -1, '', -1, '', 0, '', '' 
c_date, c_hour = time.strftime("%Y-%m-%d", time.localtime(c_time)), time.strftime("%H", time.localtime(c_time)) if act.has_key('act_id'): act_id = act["act_id"] if act.has_key('position'): act_position = act["position"] if act.has_key('url'): act_url = act["url"] if act.has_key('datatype'): f_name = act["datatype"] val = (Common.time_s(c_time),act_id,act_name,act_url,Config.JHS_Brand,sub_nav,act_position,f_id,f_name,'',c_date,c_hour) self.mysqlAccess.insertJhsActPosition_hour(val) def run_cat(self, msg, _val): msg_val = msg["val"] c_url, c_id, c_name, refers, pagetype = msg_val print '# category',c_name,c_id if pagetype == Config.JHS_Brand: a_val = (c_id, c_name) self.get_actjson(c_url, refers, a_val, _val, pagetype) elif pagetype == Config.JHS_GroupItem: self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype) else: print '# not get category pagetype...' def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype): a_val = (c_id, c_name) page = self.crawler.getData(c_url, refers) page_val = (page,c_id,c_name) ajax_url_list = self.getAjaxurlList(page_val) if len(ajax_url_list) > 0: # process ajax url list for url_val in ajax_url_list: c_url,c_subNav = url_val self.get_actjson(c_url, refers, a_val, _val, pagetype, c_subNav) def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''): if self._crawl_type == 'position': _val = (pagetype,c_subNav) + _val Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val) if Result_list and len(Result_list) > 0: # parser act result act_valList = self.jsonpage.parser_brandjson(Result_list,_val) if act_valList and len(act_valList) > 0: print '# get brand act num:',len(act_valList) self.items.extend(act_valList) else: print '# not get brandjson parse val list...' 
# get json ajax url def getAjaxurlList(self, page_val): url_list = [] page, c_id, c_name = page_val p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S) i = 0 for a_info in p.finditer(page): c_subNav = '' f_id = a_info.group(1) a_url = a_info.group(2).replace('amp;','') info = a_info.group(3) m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S) if m: c_subNav = m.group(1).strip() if c_subNav == '': m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % f_id, page, flags=re.S) if m: c_subNav = re.sub(r'<.+?>','',m.group(1)) #url_list.append((a_url,refers,a_val)) url_list.append((a_url,c_subNav)) i += 1 return url_list # ACT queue def run_act(self, msg): # 默认数据 msg_val = msg["val"] print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) act_obj = None if self._crawl_type == 'main': act_obj = JHSAct() act_obj.antPageMain(msg_val) elif self._crawl_type == 'check': act_obj = JHSAct() act_obj.antPageCheck(msg_val) elif self._crawl_type == 'position': act_obj = JHSAct() act_obj.antPageParser(msg_val) print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if self._crawl_type == 'position': brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition() if int(brandact_sign) != 3: if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time): print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) elif brandact_status != '' and brandact_status != 'blank': print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) else: act_keys = [self.worker_type, str(act_obj.brandact_id)] prev_act = self.redisAccess.read_jhsact(act_keys) # 是否需要抓取商品 if act_obj and 
act_obj.crawling_confirm != 2: # 保存的活动信息 self.putActDB(act_obj, prev_act) # 活动中的商品 items_list = [] # 只取非俪人购商品 if int(act_obj.brandact_sign) != 3: if act_obj.crawling_confirm == 0: #更新马上开团活动中商品位置 self.update_actItems_position(act_obj) # 多线程抓商品 items_list = self.run_actItems(act_obj, prev_act) else: print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 处理活动信息 #self.procAct(act_obj, prev_act, items_list) # 处理活动redis信息 self.procActRedis(act_obj, prev_act, items_list) #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) else: self.update_startact(act_obj, prev_act) print '# Already start activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) # 更新开团后活动 def update_startact(self, act, prev_act): if act.brandact_endtime and act.brandact_endtime != 0.0: end_time_s = Common.time_s(float(act.brandact_endtime)/1000) if prev_act and end_time_s != prev_act['end_time']: prev_act['end_time'] = end_time_s # redis keys = [self.worker_type, str(act.brandact_id)] self.redisAccess.write_jhsact(keys, prev_act) self.mysqlAccess.updateJhsActEndtime((end_time_s,str(act.brandact_id))) #更新马上开团活动中商品位置 def update_actItems_position(self, act): update_val_list = [] act_id = act.brandact_id for item in act.brandact_itemVal_list: if str(item[7]) != '': update_val_list.append((str(item[7]),str(act_id),item[4])) self.mysqlAccess.updateJhsItemPosition(update_val_list) # 并行获取品牌团商品 def run_actItems(self, act, prev_act): print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 需要抓取的item item_val_list = [] # 过滤已经抓取过的商品ID列表 item_ids = act.brandact_itemids if prev_act: prev_item_ids = prev_act["item_ids"] item_ids = Common.diffSet(item_ids, prev_item_ids) # 如果已经抓取过的活动没有新上线商品,则退出 if len(item_ids) == 0: print '# Activity no new Items' print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', 
time.localtime(time.time())), act.brandact_id, act.brandact_name return None for item in act.brandact_itemVal_list: if str(item[6]) in item_ids or str(item[7]) in item_ids: item_val_list.append(item) else: item_val_list = act.brandact_itemVal_list # 如果活动没有商品, 则退出 if len(item_ids) == 0: print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name) return None print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name # 多线程 控制并发的线程数 if len(item_val_list) > Config.item_max_th: m_itemsObj = JHSItemM('main', Config.item_max_th) else: m_itemsObj = JHSItemM('main', len(item_val_list)) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity find new Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name return item_list # To merge activity def mergeAct(self, act, prev_act): if prev_act: # 合并本次和上次抓取的商品ID列表 prev_item_ids = prev_act["item_ids"] act.brandact_itemids = Common.unionSet(act.brandact_itemids, prev_item_ids) # 取第一次的活动抓取时间 act.crawling_time = Common.str2timestamp(prev_act["crawl_time"]) if not act.brandact_name or act.brandact_name == '': act.brandact_name = prev_act["act_name"] if not act.brandact_url or act.brandact_url == '': act.brandact_url = prev_act["act_url"] if not act.brandact_position or str(act.brandact_position) == '0': act.brandact_position = prev_act["act_position"] if not act.brandact_enterpic_url or act.brandact_enterpic_url == '': 
act.brandact_enterpic_url = prev_act["act_enterpic_url"] if not act.brandact_remindNum or str(act.brandact_remindNum) == '0': act.brandact_remindNum = prev_act["act_remindnum"] if not act.brandact_coupons or act.brandact_coupons == []: act.brandact_coupon = prev_act["act_coupon"] act.brandact_coupons = prev_act["act_coupons"].split(Config.sep) if not act.brandact_starttime or act.brandact_starttime == 0.0: act.brandact_starttime = Common.str2timestamp(prev_act["start_time"]) if not act.brandact_endtime or act.brandact_endtime == 0.0: act.brandact_endtime = Common.str2timestamp(prev_act["end_time"]) if not act.brandact_other_ids or act.brandact_other_ids == '': act.brandact_other_ids = prev_act["_act_ids"] # To put act db def putActDB(self, act, prev_act): # 预热信息 if self._crawl_type == 'main': self.mysqlAccess.insertJhsActComing(act.outSql()) # redis self.mergeAct(act, prev_act) if self._crawl_type == 'main': # mysql if prev_act: print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.updateJhsAct(act.outSqlForUpdate()) else: print '# insert activity, id:%s name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.insertJhsAct(act.outSql()) # mongo # 存网页 _pages = act.outItemPage(self._crawl_type) self.mongofsAccess.insertJHSPages(_pages) # To process activity in redis def procActRedis(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) # item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # redis self.mergeAct(act, prev_act) keys = [self.worker_type, str(act.brandact_id)] val = act.outTupleForRedis() self.redisAccess.write_jhsact(keys, val) # To process activity def procAct(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) 
# item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # 将抓取的活动信息存入redis self.putActDB(act, prev_act) # ITEM queue def run_item(self, msg, _val): # 默认数据 msg_val = msg["val"] brandact_id, brandact_name, item_val_list = msg_val print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name # 多线程 控制并发的线程数 max_th = Config.item_max_th if len(item_val_list) > max_th: m_itemsObj = JHSItemM(self._crawl_type, max_th, _val) else: m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) self.giveup_val = (brandact_id, brandact_name, giveup_items) raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name def process(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) i, M = 0, 20 if _obj == 'cat': M = 10 n = 0 while True: if _crawl_type and _crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type) else: _key = '%s_%s' % (self.jhs_type,_obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) print '# all get num of item in queue:',n break time.sleep(10) continue n += 1 try: self.crawlPage(_obj, _crawl_type, _key, _msg, _val) except Exception as e: print '# exception err in process of JHSWorker:',e,_key,_msg # 删除redis数据库过期活动 def delAct(self, _acts): i = 0 for _act in _acts: keys = [self.worker_type, str(_act[0])] 
item = self.redisAccess.read_jhsact(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的活动 if now_time > end_time: i += 1 self.redisAccess.delete_jhsact(keys) print '# delete acts num:',i def delItem(self, _items): i = 0 for _item in _items: keys = [self.worker_type, str(_item[0])] item = self.redisAccess.read_jhsitem(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的商品 if now_time > end_time: i += 1 self.redisAccess.delete_jhsitem(keys) print '# delete items num:',i # 查找结束的活动 def scanEndActs(self, val): _acts = self.mysqlAccess.selectJhsActEnd(val) print '# end acts num:',len(_acts) # 删除已经结束的活动 self.delAct(_acts) # 查找结束的商品 def scanEndItems(self, val): _items = self.mysqlAccess.selectJhsItemEnd(val) print '# end items num:',len(_items) # 删除已经结束的商品 self.delItem(_items) # acts redis def actsRedis(self): _acts = self.mysqlAccess.selectActsRedisdata() print '# acts num:',len(_acts) i = 0 for _act in _acts: act_id = _act[2] #_itemids = self.mysqlAccess.selectItemsids(str(act_id)) #item_ids = [] #for _itemid in _itemids: # item_ids.append(str(_itemid[0])) # item_ids.append(str(_itemid[1])) #act_val = _act + (item_ids,) #print act_val #keys = [self.worker_type, str(act_id)] #print keys #if self.redisAccess.exist_jhsact(keys): #act_redis = self.redisAccess.read_jhsact(keys) #if len(act_redis) != 15: # print act_redis # i += 1 #print self.redisAccess.read_jhsact(keys) #self.redisAccess.delete_jhsact(keys) #self.redisAccess.write_jhsact(keys, act_val) #i += 1 #break print '# redis acts num:',i # items redis def itemsRedis(self): _items = self.mysqlAccess.selectItemRedisdata() print '# items num:', len(_items) i = 0 #for _item in _items: #msg = self.message.jhsitemMsg(_item) #print msg #keys = [self.worker_type, str(_item[0])] #print keys #if self.redisAccess.exist_jhsitem(keys): #print self.redisAccess.read_jhsitem(keys) #self.redisAccess.delete_jhsitem(keys) 
#self.redisAccess.write_jhsitem(keys, msg) #i += 1 #break print '# redis items num:',i
class XCItemM(MyThread):
    '''Thread manager that crawls xc items from the in-process queue.'''

    def __init__(self, _q_type, thread_num=10, a_val=None):
        MyThread.__init__(self, thread_num)   # parent construct
        self.mutex = threading.Lock()         # thread lock
        self.worker_type = Config.XC_Spot
        self.message = Message()              # message builder
        self.mysqlAccess = MysqlAccess()      # mysql access
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        self._q_type = _q_type                # xc queue type; new = newly added items
        self.a_val = a_val                    # appendix val
        self.items = []                       # crawled items
        self.dial_client = DialClient()       # dial client
        self._ip = Common.local_ip()          # local ip
        self._tag = 'ikuai'                   # router tag
        #self._tag = 'tpent'
        self.giveup_items = []                # items given up after too many retries

    def dialRouter(self, _type, _obj):
        """Ask the dial server to redial the router (switch external ip)."""
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        """Thread-safely append v to shared list L."""
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        # retry counter starts at 0
        self.put_q((0, _item))

    def putItems(self, _items):
        for _item in _items:
            self.put_q((0, _item))

    def crawlRetry(self, _data):
        """Requeue (_retry, _val) with an incremented count, or give up."""
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            Common.log('# retry too many times, no get item:')
            Common.log(_val)

    def insertIteminfo(self, iteminfosql_list, f=False):
        """Batch-insert item rows when forced (f) or when the batch is full."""
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertXCItem(iteminfosql_list)
            return True
        return False

    def crawl(self):
        """Worker loop: pop items off the queue, crawl each, batch-insert rows."""
        # pending sql batches
        _iteminfosql_list = []
        _itemdaysql_list = []
        _itemhoursql_list = []
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    _data = self.get_q()
                except Empty as e:
                    # queue drained: flush pending rows and exit
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    break
                item = None
                obj = 'item'
                if self._q_type == 'spot':
                    # crawl a fresh item
                    item = Item()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # collect result
                    self.push_back(self.items, item.outSql())
                    # persist tickets and item info
                    tickets = item.item_tickets
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertXCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                # page archiving disabled:
                #if item:
                #    _pages = item.outItemPage(obj, self._q_type)
                #    self.mongofsAccess.insertXCPages(_pages)
                time.sleep(1)
                # tell the queue this task is done
                self.queue.task_done()
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
                self.queue.task_done()
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
                self.queue.task_done()
            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                Common.log('# Invalid page exception: %s' % e)
                self.queue.task_done()
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(_data)
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    Common.log(_data)
                if str(e).find('Read timed out') == -1:
                    # redial unless it was a plain read timeout
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                        time.sleep(10)
                time.sleep(random.uniform(10,40))
class TCItemRedisM(MyThread):
    '''Redis-queue driven thread manager that crawls tc items.'''

    def __init__(self, key, q_type, thread_num=10, a_val=None):
        MyThread.__init__(self, thread_num)   # parent construct
        self.mutex = threading.Lock()         # thread lock
        self.tc_type = Config.TC_TYPE         # tc type
        #self.item_type = q_type              # item queue type
        self.mysqlAccess = MysqlAccess()      # mysql access
        self.redisQueue = RedisQueue()        # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        self.tc_queue_type = q_type           # tc queue type: new...
        self._key = key                       # redis queue key
        self.a_val = a_val                    # appendix val
        self.items = []                       # return items
        self.dial_client = DialClient()       # dial client
        self._ip = Common.local_ip()          # local ip
        self._tag = 'ikuai'                   # router tag
        #self._tag = 'tpent'
        self.giveup_items = []                # items given up after too many retries

    def dialRouter(self, _type, _obj):
        """Ask the dial server to redial the router (switch external ip)."""
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        """Thread-safely append v to shared list L."""
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def crawlRetry(self, _key, msg):
        """Requeue msg with an incremented retry count, or drop it past the limit."""
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    def insertIteminfo(self, iteminfosql_list, f=False):
        """Batch-insert item rows when forced (f) or when the batch is full."""
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False

    def crawl(self):
        """Worker loop: pop messages off the redis queue, crawl each item,
        batch-insert rows; exits after M consecutive empty polls."""
        _iteminfosql_list = []
        i, M = 0, 2
        n = 0
        while True:
            try:
                _data = self.redisQueue.get_q(self._key)
                if not _data:
                    # queue empty: flush pending rows, back off, exit after M misses
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                obj = 'item'
                if self.tc_queue_type == 'spot':
                    # crawl a fresh item
                    item = Item()
                    #_val = _data[1]
                    _val = _data["val"]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # collect result
                    self.push_back(self.items, item.outSql())
                    # persist tickets and item info
                    tickets = item.item_tickets
                    # BUGFIX: original read `len() > 0` — len() with no argument
                    # raises TypeError; the intended check is len(tickets) > 0
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertTCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    continue
                # page archiving disabled:
                #if item and obj != '':
                #    _pages = item.outItemPage(obj, self.tc_queue_type)
                #    self.mongofsAccess.insertTCPages(_pages)
                time.sleep(1)
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # redial unless it was a plain read timeout
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                        time.sleep(10)
                time.sleep(random.uniform(10,30))
class XCItemRedisM(MyThread):
    '''A class of xc Item redis queue.

    Worker thread pool that drains item messages from a redis queue,
    parses each item page and batch-inserts the rows into MySQL.
    '''

    def __init__(self, key, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting the shared result list
        self.mutex = threading.Lock()
        self.xc_type = Config.XC_TYPE  # xc type
        #self.item_type = q_type # item queue type
        # db accessors
        self.mysqlAccess = MysqlAccess()      # mysql access
        self.redisQueue = RedisQueue()        # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # xc queue type
        self.xc_queue_type = q_type # new...
        self._key = key  # redis queue key
        # appendix val, appended to every message val before parsing
        self.a_val = a_val
        # return items
        self.items = []
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # give up item, retry too many times
        self.giveup_items = []

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    # Thread-safe append of v onto list L.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # Requeue msg for another attempt unless it exhausted its retry budget.
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    # insert item rows; flush when forced (f=True) or the batch is full
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertXCItem(iteminfosql_list)
            return True
        return False

    # Main worker loop: drain the redis queue and persist items.
    def crawl(self):
        _iteminfosql_list = []
        i, M = 0, 2  # consecutive empty-poll counter and its limit
        n = 0        # number of messages processed
        while True:
            # BUGFIX: pre-bind so the except-handlers below never see an
            # unbound _data if get_q itself raises on the first iteration.
            _data = None
            try:
                _data = self.redisQueue.get_q(self._key)
                # queue empty
                if not _data:
                    # flush any pending batch; exit after M consecutive misses
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                obj = 'item'
                if self.xc_queue_type == 'spot':
                    # item instance
                    item = Item()
                    #_val = _data[1]
                    _val = _data["val"]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # aggregate result for callers
                    self.push_back(self.items, item.outSql())
                    # persist tickets
                    tickets = item.item_tickets
                    # BUGFIX: was `len() > 0` — len() without an argument is a
                    # TypeError; the sibling TCItemM.crawl uses len(tickets).
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertXCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    continue
                # store pages
                #if item and obj != '':
                #    _pages = item.outItemPage(obj, self.xc_queue_type)
                #    self.mongofsAccess.insertXCPages(_pages)
                # throttle
                time.sleep(1)
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # redial for a fresh IP
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                        time.sleep(10)
                    time.sleep(random.uniform(10,30))
class TCWorker():
    '''A class of tc worker.

    Orchestrates the TC crawl: pulls messages from a redis queue keyed by
    object/crawl type, crawls channel pages, and fans item crawling out to a
    TCItemM thread pool (or TCItemRedisM in the multi variant).
    '''
    def __init__(self):
        # tc spot type
        self.worker_type = Config.TC_Spot
        # DB
        self.tc_type = Config.TC_TYPE # queue type
        self.mysqlAccess = MysqlAccess() # mysql access
        self.redisQueue = RedisQueue() # redis queue
        self.mongofsAccess = MongofsAccess() # mongodb fs access
        # crawler settings
        self.crawler = TCCrawler()
        # message
        self.message = Message()
        # crawl time bookkeeping
        self.crawling_time = Common.now() # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        '''Prepare per-run state (dialer, result buffers, logging).'''
        self._obj = _obj
        self._crawl_type = _crawl_type
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._router_tag = 'ikuai'
        #self._router_tag = 'tpent'
        # items
        self.items = []
        # giveup items
        self.giveup_items = []
        # giveup msg val
        self.giveup_val = None
        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        '''Configure the shared logger once, named by object/crawl type.'''
        if not Logger.logger:
            loggername = 'other'
            filename = 'crawler_%s' % (time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            if _obj == 'channel':
                loggername = 'channel'
                filename = 'add_%s_%s' % (_crawl_type,time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            #elif _obj == 'item':
            Logger.config_logging(loggername, filename)

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    # To crawl retry: requeue msg unless its retry budget is exhausted
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'channel':
            max_time = Config.channel_crawl_retry
        elif _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)

    # To crawl page: dispatch one message, mapping failures to retry/redial
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == 'channel':
                self.run_channel(msg)
            else:
                Common.log('# crawlPage unknown obj = %s' % _obj)
        except Common.InvalidPageException as e:
            Common.log('# Invalid page exception: %s' % e)
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            Common.log('# Deny page exception: %s' % e)
            self.crawlRetry(_key,msg)
            # redial for a fresh IP
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                Common.log('# DailClient Exception err: %s' % e)
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            Common.log('# System busy exception: %s' % e)
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            Common.log('# Retry exception: %s' % e)
            # giveup_val carries a replacement payload for the retried message
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            Common.log('# exception err: %s' % e)
            self.crawlRetry(_key,msg)
            Common.traceback_log()
            if str(e).find('Read timed out') == -1:
                # redial for a fresh IP
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    Common.log('# DailClient Exception err: %s' % e)
                time.sleep(random.uniform(10,30))

    def run_channel(self, msg):
        '''Parse one channel page, then crawl its items in parallel.'''
        msg_val = msg["val"]
        c = Channel()
        c.antPage(msg_val)
        #self.items = c.channel_items
        self.run_items(c)

    # Crawl the channel's items with a bounded thread pool
    def run_items(self, chan):
        Common.log('# Items start, channel_id:%s, channel_name:%s' % (str(chan.channel_id), chan.channel_name))
        # cap the number of concurrent threads
        Common.log('# Items num: %d' % len(chan.channel_items))
        if len(chan.channel_items) > Config.item_max_th:
            m_itemsObj = TCItemM(self._crawl_type, Config.item_max_th)
        else:
            m_itemsObj = TCItemM(self._crawl_type, len(chan.channel_items))
        m_itemsObj.createthread()
        m_itemsObj.putItems(chan.channel_items)
        m_itemsObj.run()
        item_list = m_itemsObj.items
        Common.log('# find Items num: %d' % len(chan.channel_items))
        Common.log('# crawl Items num: %d' % len(item_list))
        giveup_items = m_itemsObj.giveup_items
        # any given-up item means the whole channel should be retried
        if len(giveup_items) > 0:
            Common.log('# giveup Items num: %d' % len(giveup_items))
            raise Common.RetryException('# run_items: some items retry more than max times..')
        Common.log('# Items end, channel_id:%s, channel_name:%s' % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        '''Entry point: single-threaded queue consumption by default.'''
        #self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def processOne(self, _obj, _crawl_type, _val=None):
        '''Poll the redis queue and crawl messages one at a time.'''
        self.init_crawl(_obj, _crawl_type)
        i, M = 0, 20  # consecutive empty-poll counter and its limit
        if _obj == 'channel':
            M = 2
        n = 0  # number of messages processed
        while True:
            if _crawl_type and _crawl_type != '':
                _key = '%s_%s_%s' % (self.tc_type,_obj,_crawl_type)
            else:
                _key = '%s_%s' % (self.tc_type,_obj)
            _msg = self.redisQueue.get_q(_key)
            # queue empty: give up after M consecutive misses
            if not _msg:
                i += 1
                if i > M:
                    Common.log('# not get queue of key: %s' % _key)
                    Common.log('# all get num of item in queue: %d' % n)
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                Common.log('# exception err in process of TCWorker: %s , key: %s' % (e,_key))
                Common.log(_msg)

    def processMulti(self, _obj, _crawl_type, _val=None):
        '''Alternative entry point: consume the queue with a thread pool.'''
        self.init_crawl(_obj, _crawl_type)
        if _crawl_type and _crawl_type != '':
            _key = '%s_%s_%s' % (self.tc_type,_obj,_crawl_type)
        else:
            _key = '%s_%s' % (self.tc_type,_obj)
        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log('# exception err in processMulti of TCWorker: %s, key: %s' % (e,_key))

    # To crawl page (multi-threaded variant)
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        self.run_multiitems(_key, _val)
        #Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        '''Drain the item queue with a 20-thread TCItemRedisM pool.'''
        mitem = TCItemRedisM(_key, self._crawl_type, 20, _val)
        mitem.createthread()
        mitem.run()
        item_list = mitem.items
        Common.log('# crawl Items num: %d' % len(item_list))
class RetryCrawler():
    '''A class of retry crawl data.

    Wraps TCCrawler.getData with a bounded retry loop that backs off on
    transient failures and redials the router on deny/DNS errors.
    '''

    def __init__(self):
        # crawler settings
        self.crawler = TCCrawler()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        # base wait time (seconds), scaled by the retry count
        self.w_time = 1

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def getData(self, url, refers='', max_retry=20):
        '''Fetch url (with optional referer), retrying up to max_retry times.

        Returns the page text, or '' if every attempt failed.
        '''
        page = ''
        retry = 1
        while True:
            try:
                page = self.crawler.getData(url, refers)
                break
            except Common.InvalidPageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Invalid page exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Common.DenypageException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# Deny page exception: %s' % e)
                # redial for a fresh IP
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    Common.log('# DailClient Exception err: %s' % e)
                time.sleep(random.uniform(10, 30))
            except Common.SystemBusyException as e:
                if retry >= max_retry:
                    break
                retry += 1
                Common.log('# System busy exception: %s' % e)
                time.sleep(self.w_time * retry)
            except Exception as e:
                Common.log('# exception err in retry crawler: %s' % e)
                if str(e).find('Read timed out') != -1:
                    if retry >= max_retry:
                        break
                    retry += 1
                    time.sleep(random.uniform(1, 3))
                # BUGFIX: the second find() was missing `!= -1`; find returns -1
                # (truthy) when the substring is absent, so this DNS branch fired
                # for nearly every unknown exception and redialed spuriously.
                elif str(e).find('Name or service not known') != -1 or str(
                        e).find('Temporary failure in name resolution') != -1:
                    if retry >= max_retry:
                        break
                    retry += 1
                    # redial for a fresh IP
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                    time.sleep(random.uniform(10, 30))
                else:
                    break
        return page
class JHSGroupItemQM(MyThread):
    '''A class of jhs Item redis queue.

    Worker thread pool that drains JHS group-item messages from a redis
    queue ('hour' or 'new' flavors), parses them and batch-inserts rows
    into MySQL; fetched pages are archived in MongoDB GridFS.
    '''
    def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting the shared result list
        self.mutex = threading.Lock()
        self.jhs_type = Config.JHS_TYPE # jhs type
        self.item_type = itemtype # item type
        # db accessors
        self.mysqlAccess = MysqlAccess() # mysql access
        self.redisQueue = RedisQueue() # redis queue
        self.mongofsAccess = MongofsAccess() # mongodb fs access
        # jhs queue type
        self.jhs_queue_type = q_type # h: hourly
        self._key = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type)
        # appendix val, appended to every message val before parsing
        self.a_val = a_val
        # activity items
        self.items = []
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # give up item, retry too many times
        self.giveup_items = []

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    # Thread-safe append of v onto list L.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # To crawl retry: requeue msg unless its retry budget is exhausted
    def crawlRetry(self, msg):
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(self._key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    # insert item info rows; flush when forced (f=True) or the batch is full
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # insert hourly item rows; flush when forced (f=True) or the batch is full
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # Main worker loop: drain the redis queue and persist items.
    def crawl(self):
        _iteminfosql_list = []
        _itemhoursql_list = []
        i, M = 0, 10  # consecutive empty-poll counter and its limit
        n = 0         # number of messages processed
        while True:
            try:
                _msg = self.redisQueue.get_q(self._key)
                # queue empty
                if not _msg:
                    # flush pending batches; exit after M consecutive misses
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []
                    i += 1
                    if i > M:
                        print '# all get itemQ item num:',n
                        print '# not get itemQ of key:',self._key
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                crawl_type = ''
                if self.jhs_queue_type == 'hour':
                    # hourly snapshot of a group item
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageGroupItemHour(_val)
                    crawl_type = 'groupitem'
                    # aggregate result for callers
                    self.push_back(self.items, item.outTupleGroupItemHour())
                    # persist: update the item row, append the hourly row
                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list):
                        _itemhoursql_list = []
                elif self.jhs_queue_type == 'new':
                    # full item info for a newly seen group item
                    item = JHSItem()
                    _val = _msg["val"]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageGroupItem(_val)
                    crawl_type = 'groupitemnew'
                    # aggregate result for callers
                    self.push_back(self.items, item.outTupleGroupItem())
                    # persist
                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    continue
                # archive the fetched pages
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)
                # throttle
                time.sleep(1)
            except Common.NoItemException as e:
                print 'Not item exception :', e
            except Common.NoPageException as e:
                print 'Not page exception :', e
            except Common.InvalidPageException as e:
                self.crawlRetry(_msg)
                print 'Invalid page exception :', e
            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_msg)
                # redial for a fresh IP
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                    time.sleep(random.uniform(10,30))
class Jsonpage():
    '''A class of json page.

    Fetches paginated JSON endpoints (retrying via an internal queue) and
    parses brand-activity / item lists out of the responses, which may be
    either decoded dicts or raw JSON strings.
    '''
    def __init__(self):
        # crawler settings
        self.crawler = TBCrawler()
        # pending (retry_count, val) work items
        self.val_queue = Queue.Queue()
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'

    def putVal(self, _val):
        '''Enqueue one value with a zero retry count.'''
        self.val_queue.put((0,_val),block=False)

    def putVals(self, _vals):
        '''Enqueue many values, each with a zero retry count.'''
        for _val in _vals:
            self.val_queue.put((0, _val),block=False)

    # To crawl retry: requeue _data unless its retry budget is exhausted
    def crawlRetry(self, _data):
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.json_crawl_retry:
            _data = (_retry, _val)
            self.val_queue.put(_data,block=False)
        else:
            print "# retry too many times, no get json:", _val

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    def get_json(self, json_valList):
        '''Fetch every (url, refers, a_val) in json_valList, with retries.

        Returns the concatenated per-page results from get_jsonPage.
        '''
        bResult_list = []
        if json_valList and json_valList != []:
            self.putVals(json_valList)
        while True:
            _data = None
            try:
                try:
                    # take the next queued work item; empty queue means done
                    _data = self.val_queue.get(block=False)
                except Empty as e:
                    break
                _val = _data[1]
                a_url, refers, a_val = _val
                bResult_list += self.get_jsonPage(a_url,refers,a_val)
                # notify queue: task finished
                self.val_queue.task_done()
            except Common.InvalidPageException as e:
                print '# Invalid page exception:',e
                # notify queue: task finished
                self.val_queue.task_done()
                self.crawlRetry(_data)
            except Common.DenypageException as e:
                print '# Deny page exception:',e
                # notify queue: task finished
                self.val_queue.task_done()
                self.crawlRetry(_data)
                time.sleep(60)
                # redial for a fresh IP
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
                    time.sleep(random.uniform(10,30))
                time.sleep(random.uniform(10,30))
            except Common.SystemBusyException as e:
                print '# System busy exception:',e
                # notify queue: task finished
                self.val_queue.task_done()
                self.crawlRetry(_data)
                time.sleep(random.uniform(10,30))
            except Exception as e:
                print '# exception err:',e
                # notify queue: task finished
                self.val_queue.task_done()
                # only timeouts and DNS failures are worth retrying
                if str(e).find('Read timed out') != -1:
                    self.crawlRetry(_data)
                elif str(e).find('Name or service not known') != -1:
                    self.crawlRetry(_data)
                time.sleep(random.uniform(10,30))
        return bResult_list

    # Fetch every page of a paginated JSON endpoint
    def get_jsonPage(self, url, refers='', a_val=()):
        bResult_list = []
        # cache-busting timestamp parameter
        ts = str(int(time.time()*1000)) + '_' + str(random.randint(0,9999))
        p_url = url + '&_ksTS=%s'%ts
        #print p_url
        result = self.get_jsonData(p_url, refers)
        bResult_list.append((result,)+a_val)
        # read totalPage from the first response, then walk pages 2..N
        totalPage = 1
        if type(result) is dict and result.has_key('totalPage'):
            totalPage = int(result['totalPage'])
        elif type(result) is str:
            m = re.search(r'"totalPage":(\d+),', result, flags=re.S)
            if m:
                totalPage = int(m.group(1))
        if totalPage > 1:
            for page_i in range(2, totalPage+1):
                ts = str(int(time.time()*1000)) + '_' + str(random.randint(0,9999))
                #p_url = re.sub('page=\d+&', 'page=%d&'%page_i, p_url)
                m = re.search(r'page=\d+&',p_url)
                if m:
                    p_url = re.sub('page=\d+&', 'page=%d&'%page_i, p_url)
                else:
                    p_url = p_url + '&page=%d'%page_i
                p_url = re.sub('&_ksTS=\d+_\d+', '&_ksTS=%s'%ts, p_url)
                result = self.get_jsonData(p_url, refers)
                if result:
                    bResult_list.append((result,)+a_val)
        return bResult_list

    # Fetch one page; returns the decoded dict, or the raw string if
    # json.loads fails. Raises InvalidPageException on an empty response.
    def get_jsonData(self, url, refers=''):
        result = None
        b_page = self.crawler.getData(url, refers)
        if not b_page or b_page == '':
            raise Common.InvalidPageException("# Jsonpage get_jsonData: not get jsondata url:%s."%(url))
        try:
            # strip a stray leading ']' some endpoints prepend
            b_page = re.sub('^]', '', b_page)
            result = json.loads(b_page)
        except Exception as e:
            print '# exception err in get_jsonData load json:',e
            print '# return string:',b_page
            return b_page
        return result

    # Parse brand activities out of each page's payload
    def parser_brandjson(self, bResult_list, a_val=None):
        print '# brand activities parse json start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # vals handed to the downstream multi-threaded crawler
        act_valList = []
        # previous page's row count, used to compute each activity's position
        prepage_count = 0
        for page in bResult_list:
            page_info = page[0]
            activities = []
            currentPage = 1
            if type(page_info) is dict and page_info.has_key('brandList') and page_info['brandList'] != []:
                activities = page_info['brandList']
                if page_info.has_key('currentPage'):
                    currentPage = int(page_info['currentPage'])
            elif type(page_info) is str:
                # raw-string fallback: pull brand objects out with regexes
                m = re.search(r'"brandList":\[(.+?}})\]', page_info, flags=re.S)
                if m:
                    brandlist_info = m.group(1)
                    p = re.compile(r'({"baseInfo":.+?}})')
                    for brand_info in p.finditer(brandlist_info):
                        brand = brand_info.group(1)
                        activities.append(brand)
                    m = re.search(r'"currentPage":(\d+),', page_info, flags=re.S)
                    if m:
                        currentPage = int(m.group(1))
                else:
                    continue
            else:
                continue
            print '# brand every page num:',len(activities)
            b_position_start = 0
            if currentPage > 1:
                b_position_start = (currentPage - 1) * prepage_count
            else:
                # remember the first page's row count
                prepage_count = len(activities)
            for i in range(0,len(activities)):
                activity = activities[i]
                if a_val:
                    val = (activity, page[1], page[2], (b_position_start+i+1)) + a_val
                else:
                    val = (activity, page[1], page[2], (b_position_start+i+1))
                act_valList.append(val)
        print '# brand activities parse json end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return act_valList

    # Parse items out of each page's payload
    def parser_itemjson(self, iResult_list):
        print '# items parse json start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # vals handed to the downstream multi-threaded crawler
        item_valList = []
        # previous page's row count, used to compute each item's position
        prepage_count = 0
        for page in iResult_list:
            page_info = page[0]
            a_val = page[1:]
            items = []
            currentPage = 1
            if type(page_info) is dict and page_info.has_key('itemList') and page_info['itemList'] != []:
                items = page_info['itemList']
                if page_info.has_key('currentPage'):
                    currentPage = int(page_info['currentPage'])
            elif type(page_info) is str:
                # raw-string fallback: pull item objects out with regexes
                m = re.search(r'"itemList":\[(.+?}})\]', page_info, flags=re.S)
                if m:
                    itemlist_info = m.group(1)
                    p = re.compile(r'({"baseinfo":.+?}})',re.I)
                    for item_info in p.finditer(itemlist_info):
                        item = item_info.group(1)
                        items.append(item)
                    m = re.search(r'"currentPage":(\d+),', page_info, flags=re.S)
                    if m:
                        currentPage = int(m.group(1))
                else:
                    continue
            else:
                continue
            print '# item every page num:',len(items)
            i_position_start = 0
            if currentPage > 1:
                i_position_start = (currentPage - 1) * prepage_count
            else:
                # remember the first page's row count
                prepage_count = len(items)
            for i in range(0,len(items)):
                item = items[i]
                if a_val:
                    item_val = (item,) + (a_val + ((i_position_start+i+1),))
                else:
                    item_val = (item, (i_position_start+i+1))
                item_valList.append(item_val)
        print '# items parse json end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return item_valList
class RetryCrawler(): '''A class of retry crawl data''' def __init__(self): # 抓取设置 self.crawler = TBCrawler() # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._tag = 'ikuai' # wait time self.w_time = 1 # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._tag)) except Exception as e: print '# To dial router exception :', e def getData(self, url, refers='', max_retry=20): page = '' retry = 1 while True: try: page = self.crawler.getData(url, refers) break except Common.InvalidPageException as e: if retry >= max_retry: break retry += 1 print '# Invalid page exception:',e time.sleep(self.w_time*retry) except Common.DenypageException as e: if retry >= max_retry: break retry += 1 print '# Deny page exception:',e # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: if retry >= max_retry: break retry += 1 print '# System busy exception:',e time.sleep(self.w_time*retry) except Exception as e: print '# exception err in retry crawler:',e if str(e).find('Read timed out') != -1: if retry >= max_retry: break retry += 1 time.sleep(self.w_time*retry) elif str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution'): if retry >= max_retry: break retry += 1 # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) else: break return page
class TCWorker:
    """A class of tc worker.

    Orchestrates the TC crawl: pulls messages from a redis queue keyed by
    object/crawl type, crawls channel pages, and fans item crawling out to a
    TCItemM thread pool (or TCItemRedisM in the multi variant).
    """

    def __init__(self):
        # tc spot type
        self.worker_type = Config.TC_Spot
        # DB
        self.tc_type = Config.TC_TYPE  # queue type
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # crawler settings
        self.crawler = TCCrawler()
        # message
        self.message = Message()
        # crawl time bookkeeping
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        """Prepare per-run state (dialer, result buffers, logging)."""
        self._obj = _obj
        self._crawl_type = _crawl_type
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._router_tag = "ikuai"
        # self._router_tag = 'tpent'
        # items
        self.items = []
        # giveup items
        self.giveup_items = []
        # giveup msg val
        self.giveup_val = None
        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        """Configure the shared logger once, named by object/crawl type."""
        if not Logger.logger:
            loggername = "other"
            filename = "crawler_%s" % (time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            if _obj == "channel":
                loggername = "channel"
                filename = "add_%s_%s" % (_crawl_type, time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            # elif _obj == 'item':
            Logger.config_logging(loggername, filename)

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = "%s_%s" % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            Common.log("# To dial router exception: %s" % e)

    # To crawl retry: requeue msg unless its retry budget is exhausted
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg["retry"] += 1
        _retry = msg["retry"]
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == "channel":
            max_time = Config.channel_crawl_retry
        elif _obj == "item":
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            # self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)

    # To crawl page: dispatch one message, mapping failures to retry/redial
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == "channel":
                self.run_channel(msg)
            else:
                Common.log("# crawlPage unknown obj = %s" % _obj)
        except Common.InvalidPageException as e:
            Common.log("# Invalid page exception: %s" % e)
            self.crawlRetry(_key, msg)
        except Common.DenypageException as e:
            Common.log("# Deny page exception: %s" % e)
            self.crawlRetry(_key, msg)
            # redial for a fresh IP
            try:
                self.dialRouter(4, "chn")
            except Exception as e:
                Common.log("# DailClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))
            time.sleep(random.uniform(10, 30))
        except Common.SystemBusyException as e:
            Common.log("# System busy exception: %s" % e)
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(10, 30))
        except Common.RetryException as e:
            Common.log("# Retry exception: %s" % e)
            # giveup_val carries a replacement payload for the retried message
            if self.giveup_val:
                msg["val"] = self.giveup_val
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(20, 30))
        except Exception as e:
            Common.log("# exception err: %s" % e)
            self.crawlRetry(_key, msg)
            Common.traceback_log()
            if str(e).find("Read timed out") == -1:
                # redial for a fresh IP
                try:
                    self.dialRouter(4, "chn")
                except Exception as e:
                    Common.log("# DailClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))

    def run_channel(self, msg):
        """Parse one channel page, then crawl its items in parallel."""
        msg_val = msg["val"]
        c = Channel()
        c.antPage(msg_val)
        # self.items = c.channel_items
        self.run_items(c)

    # Crawl the channel's items with a bounded thread pool
    def run_items(self, chan):
        Common.log("# Items start, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))
        # cap the number of concurrent threads
        Common.log("# Items num: %d" % len(chan.channel_items))
        if len(chan.channel_items) > Config.item_max_th:
            m_itemsObj = TCItemM(self._crawl_type, Config.item_max_th)
        else:
            m_itemsObj = TCItemM(self._crawl_type, len(chan.channel_items))
        m_itemsObj.createthread()
        m_itemsObj.putItems(chan.channel_items)
        m_itemsObj.run()
        item_list = m_itemsObj.items
        Common.log("# find Items num: %d" % len(chan.channel_items))
        Common.log("# crawl Items num: %d" % len(item_list))
        giveup_items = m_itemsObj.giveup_items
        # any given-up item means the whole channel should be retried
        if len(giveup_items) > 0:
            Common.log("# giveup Items num: %d" % len(giveup_items))
            raise Common.RetryException("# run_items: some items retry more than max times..")
        Common.log("# Items end, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        """Entry point: single-threaded queue consumption by default."""
        # self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def processOne(self, _obj, _crawl_type, _val=None):
        """Poll the redis queue and crawl messages one at a time."""
        self.init_crawl(_obj, _crawl_type)
        i, M = 0, 20  # consecutive empty-poll counter and its limit
        if _obj == "channel":
            M = 2
        n = 0  # number of messages processed
        while True:
            if _crawl_type and _crawl_type != "":
                _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
            else:
                _key = "%s_%s" % (self.tc_type, _obj)
            _msg = self.redisQueue.get_q(_key)
            # queue empty: give up after M consecutive misses
            if not _msg:
                i += 1
                if i > M:
                    Common.log("# not get queue of key: %s" % _key)
                    Common.log("# all get num of item in queue: %d" % n)
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                Common.log("# exception err in process of TCWorker: %s , key: %s" % (e, _key))
                Common.log(_msg)

    def processMulti(self, _obj, _crawl_type, _val=None):
        """Alternative entry point: consume the queue with a thread pool."""
        self.init_crawl(_obj, _crawl_type)
        if _crawl_type and _crawl_type != "":
            _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
        else:
            _key = "%s_%s" % (self.tc_type, _obj)
        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log("# exception err in processMulti of TCWorker: %s, key: %s" % (e, _key))

    # To crawl page (multi-threaded variant)
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        self.run_multiitems(_key, _val)
        # Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        """Drain the item queue with a 20-thread TCItemRedisM pool."""
        mitem = TCItemRedisM(_key, self._crawl_type, 20, _val)
        mitem.createthread()
        mitem.run()
        item_list = mitem.items
        Common.log("# crawl Items num: %d" % len(item_list))
class TCItemM(MyThread):
    '''A class of tc item thread manager.

    Thread pool fed through the in-process MyThread queue: each worker
    parses item pages and batch-inserts rows (and tickets) into MySQL.
    '''
    def __init__(self, _q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting shared result lists
        self.mutex = threading.Lock()
        self.worker_type = Config.TC_Spot
        # message
        self.message = Message()
        # db accessors
        self.mysqlAccess = MysqlAccess() # mysql access
        self.mongofsAccess = MongofsAccess() # mongodb fs access
        # tc queue type
        self._q_type = _q_type # new: newly added items
        # appendix val, appended to every queued val before parsing
        self.a_val = a_val
        # activity items
        self.items = []
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # give up item, retry too many times
        self.giveup_items = []

    # To dial router (request a fresh outbound IP)
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    # Thread-safe append of v onto list L.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def putItem(self, _item):
        '''Enqueue one item with a zero retry count.'''
        self.put_q((0, _item))

    def putItems(self, _items):
        '''Enqueue many items, each with a zero retry count.'''
        for _item in _items:
            self.put_q((0, _item))

    # To crawl retry: requeue _data unless its retry budget is exhausted,
    # otherwise record it in giveup_items.
    def crawlRetry(self, _data):
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            Common.log('# retry too many times, no get item:')
            Common.log(_val)

    # insert item info rows; flush when forced (f=True) or the batch is full
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False

    # To crawl item: worker loop draining the in-process queue
    def crawl(self):
        # item sql batches
        _iteminfosql_list = []
        _itemdaysql_list = []
        _itemhoursql_list = []
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    # take the next queued work item; empty queue means done
                    _data = self.get_q()
                except Empty as e:
                    # queue empty: flush the pending batch and exit
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    break
                item = None
                obj = 'item'
                if self._q_type == 'spot':
                    # new item instance
                    item = Item()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # aggregate result for callers
                    self.push_back(self.items, item.outSql())
                    # persist tickets
                    tickets = item.item_tickets
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertTCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                # store pages
                #if item:
                #    _pages = item.outItemPage(obj, self._q_type)
                #    self.mongofsAccess.insertTCPages(_pages)
                # throttle
                time.sleep(1)
                # notify queue: task finished
                self.queue.task_done()
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
                # notify queue: task finished
                self.queue.task_done()
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
                # notify queue: task finished
                self.queue.task_done()
            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                Common.log('# Invalid page exception: %s' % e)
                # notify queue: task finished
                self.queue.task_done()
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(_data)
                # notify queue: task finished
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    Common.log(_data)
                if str(e).find('Read timed out') == -1:
                    # redial for a fresh IP
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                        time.sleep(10)
                    time.sleep(random.uniform(10,40))
class JHSGroupItemWorker():
    '''A class of JHS group item channel worker.

    Drives the group-item crawl: reads category messages from a redis queue,
    fetches category pages, extracts AJAX item-JSON URLs, and fans item
    parsing out to JHSGroupItemParserM / JHSGroupItemQM thread managers.
    '''
    def __init__(self):
        # jhs group item worker type
        self.worker_type = Config.JHS_GroupItem
        self.jhs_type = Config.JHS_TYPE  # queue type prefix for redis keys
        # message helper
        self.message = Message()
        # JSON fetching helper
        self.jsonpage = Jsonpage()
        # crawler setup
        self.crawler = TBCrawler()
        # crawl timing
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()
        # DB
        # mysql access
        self.mysqlAccess = MysqlAccess()
        # redis queue
        self.redisQueue = RedisQueue()
        # redis access
        self.redisAccess = RedisAccess()
        # mongodb fs access
        self.mongofsAccess = MongofsAccess()

    # Prepare per-run crawl state for the given object and crawl type.
    def init_crawl(self, _obj, _crawl_type):
        self._obj = _obj
        self._crawl_type = _crawl_type
        # dial client used to re-dial the router (switch IP) on failures
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._router_tag = 'ikuai'
        #self._router_tag = 'tpent'
        # collected items
        self.items = []
        # giveup items
        self.giveup_items = []
        # giveup msg val, substituted into a retried message on RetryException
        self.giveup_val = None

    # Ask the dial client to re-dial the router (identified by module tag).
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception :', e

    # Extend list L with values v (no locking; single-threaded caller assumed).
    def push_back_list(self, L, v):
        L.extend(v)

    # Append value v to list L.
    def push_back_val(self, L, v):
        L.append(v)

    # Re-queue a failed redis message, with a per-object retry limit;
    # drop the message when the limit is exceeded.
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    # Crawl one queued message, retrying (and re-dialing) on known failures.
    def crawlPage(self, _key, msg, _val):
        try:
            if self._obj == 'groupitemcat':
                self.run_category(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % self._obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # re-dial the router (switch IP)
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # re-dial the router unless the error was a read timeout
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DailClient Exception err:', e
                time.sleep(random.uniform(10,30))
            Common.traceback_log()

    # Fetch one category page and crawl the item JSON it links to via AJAX.
    def run_category(self, msg, _val):
        category_val = msg["val"]
        refers = _val
        c_url,c_name,c_id = category_val
        print c_url,c_name,c_id
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_name,c_id)
        ajax_url_list = self.getAjaxurlList(page_val,c_url)
        if len(ajax_url_list) > 0:
            self.get_jsonitems(ajax_url_list)

    # Extract (ajax_url, refers, (cat_id, cat_name, refers, sub_nav)) tuples
    # from the data-ajaxurl attributes in a category page.
    def getAjaxurlList(self, page_val, refers=''):
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = c_name
            a_url = a_info.group(1).replace('amp;','')
            info = a_info.group(2)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>',info,flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            a_val = (c_id,c_name,refers,c_subNav)
            url_list.append((a_url,refers,a_val))
            i += 1
        return url_list

    # Fetch item JSON from each AJAX url; the first parsed list is treated as
    # "all online items", the rest are collected and parsed via parseItems.
    # Raw JSON pages are archived to mongodb keyed by time/type/category.
    def get_jsonitems(self, ajax_url_list):
        # today's "all items" values (first JSON list only)
        todayall_item_val = []
        # other sub-nav items values
        item_list = []
        # index of the JSON list being processed
        item_json_index = 0
        # mongo json pages to archive, keyed by time/category
        cat_pages = {}
        for a_url in ajax_url_list:
            # get json from ajax url
            Result_list = self.jsonpage.get_json([a_url])
            # mongo page json
            _url,_refers,_val = a_url
            _c_id = _val[0]
            time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
            # key format: timeStr_jhstype_webtype_itemgroupcat_catid
            key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id))
            cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list))
            if Result_list and len(Result_list) > 0:
                item_result_valList = self.jsonpage.parser_itemjson(Result_list)
                if item_result_valList and len(item_result_valList) > 0:
                    item_json_index += 1
                    # the first item list is all online items
                    if item_json_index == 1:
                        if len(item_result_valList) > 0:
                            print '# all online items.....'
                            todayall_item_val = item_result_valList
                    else:
                        self.push_back_list(item_list, item_result_valList)
                else:
                    print '# not get itemjson parse val list...'
        if len(item_list) > 0:
            self.parseItems(item_list)
        # archive category pages json to mongodb
        for key in cat_pages.keys():
            _pages = (key,cat_pages[key])
            self.mongofsAccess.insertJHSPages(_pages)

    # Parse item data fetched from the AJAX API using a pool of parser threads.
    def parseItems(self, item_list):
        print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # appendix info forwarded to every parser thread
        a_val = (self.begin_time,)
        # cap the number of concurrent parser threads
        max_th = Config.item_max_th
        if len(item_list) > max_th:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_list)
        m_itemsObj.run()
        _items = m_itemsObj.items
        self.push_back_list(self.items,_items)
        print '# queue item num:',len(self.items)
        print '# parse item num:',len(_items)
        print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    # Entry point: dispatch to single-threaded or multi-threaded processing.
    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _obj == 'groupitem':
            self.processMulti(_val)
        else:
            self.processOne(_val)

    # Drain the redis queue in this thread; give up after M consecutive
    # empty polls (10s apart).
    def processOne(self, _val=None):
        i, M = 0, 10
        n = 0
        while True:
            try:
                if self._crawl_type and self._crawl_type != '':
                    _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
                else:
                    _key = '%s_%s' % (self.jhs_type, self._obj)
                _msg = self.redisQueue.get_q(_key)
                # queue empty: count the miss and back off
                if not _msg:
                    i += 1
                    if i > M:
                        print '# all get catQ item num:',n
                        print '# not get catQ of key:',_key
                        break
                    time.sleep(10)
                    continue
                n += 1
                self.crawlPage(_key, _msg, _val)
            except Exception as e:
                print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg

    # Process the queue with a multi-threaded manager.
    def processMulti(self, _val=None):
        if self._crawl_type and self._crawl_type != '':
            _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
        else:
            _key = '%s_%s' % (self.jhs_type, self._obj)
        try:
            self.crawlPageMulti(_key, _val)
        except Exception as e:
            print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key)

    # Dispatch multi-threaded crawl by object type.
    def crawlPageMulti(self, _key, _val):
        if self._obj == 'groupitem':
            self.run_groupitem(_key, _val)
        else:
            print '# crawlPageMulti unknown obj = %s' % self._obj

    # Run the group-item queue manager with 20 threads.
    def run_groupitem(self, _key, _val):
        m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val)
        m_itemQ.createthread()
        m_itemQ.run()
        item_list = m_itemQ.items
        print '# crawl Items num: %d' % len(item_list)

    # Delete expired items from the redis database.
    def delItem(self, _items):
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.begin_time)
                # delete the item once its end time has passed
                if now_time > end_time:
                    self.redisAccess.delete_jhsitem(keys)

    # Store item info into the redis database.
    def putItemDB(self, _items):
        for _item in _items:
            # skip item IDs that already exist
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys):
                continue
            # write base item data to redis
            item_val = self.message.itemInfo(_item["r_val"])
            val = self.message.itemMsg(item_val)
            self.redisAccess.write_jhsitem(keys, val)

    # Update an item's start/end time in redis if they changed.
    def updateItem(self, _item):
        keys = [self.worker_type, _item["item_juId"]]
        item = self.redisAccess.read_jhsitem(keys)
        if item:
            item_val = self.message.itemParseInfo(_item["r_val"])
            c = False
            if item["start_time"] != item_val["start_time"]:
                item["start_time"] = item_val["start_time"]
                c = True
            if item["end_time"] != item_val["end_time"]:
                item["end_time"] = item_val["end_time"]
                c = True
            if c:
                self.redisAccess.write_jhsitem(keys, item)

    # Return values of items not yet in redis; refresh the times of known ones.
    def selectNewItems(self, _items):
        new_items = []
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys):
                self.updateItem(_item)
                continue
            new_items.append(_item["val"])
        return new_items

    # Scan mysql for ended items and purge them from redis.
    def scanEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        _items = self.mysqlAccess.selectJhsGroupItemEnd(val)
        end_items = []
        # iterate items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums:',len(end_items)
        # delete ended items from redis
        self.delItem(end_items)

    # Scan mysql for items that ended in the last hour and purge them from redis.
    def scanEndItemsLasthour(self):
        val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1))
        _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val)
        end_items = []
        # iterate items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums for last hour end:',len(end_items)
        # delete ended items from redis
        self.delItem(end_items)

    # Select items that have started but not ended (up to one hour past end).
    def scanAliveItems(self):
        # window extends to one hour after the end time
        val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1))
        # select items started but not yet ended
        _items = self.mysqlAccess.selectJhsGroupItemAlive(val)
        print "# hour all item nums:",len(_items)
        return _items

    # Debug scan: print the redis record for every item not yet ended.
    def scanNotEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        # select items not yet ended
        _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val)
        i = 1
        for _item in _items:
            print i
            item_juid = str(_item[1])
            keys = [self.worker_type, item_juid]
            item = self.redisAccess.read_jhsitem(keys)
            print item
            #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]}
            #self.redisAccess.write_jhsitem(keys, _new_item)
            i += 1

    # Return the list of group-item categories from mysql.
    def scanCategories(self):
        category_list = self.mysqlAccess.selectJhsGroupItemCategory()
        return category_list
class JHSGroupItemCrawlerM(MyThread):
    '''A class of jhs item thread manager.

    Consumes (retry, val) messages from the inherited MyThread queue and
    crawls group items either hourly ('hour') or as new items ('new'),
    batching SQL rows and archiving pages to mongodb.
    '''
    def __init__(self, jhs_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting shared result lists
        self.mutex = threading.Lock()
        # db accessors
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # jhs queue type ('hour': hourly pass, 'new': item info detail)
        self.jhs_type = jhs_type
        # appendix val appended to every queued value before crawling
        self.a_val = a_val
        # collected item results
        self.items = []
        # dial client used to re-dial the router (switch IP) on failures
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # items given up after too many retries
        self.giveup_items = []

    # Ask the dial client to re-dial the router (identified by module tag).
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    # Append v to shared list L under the mutex.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # Enqueue a single work item with retry counter 0.
    def putItem(self, _item):
        self.put_q((0, _item))

    # Enqueue many work items with retry counter 0.
    def putItems(self, _items):
        for _item in _items:
            self.put_q((0, _item))

    # Re-queue a failed item, or record it as given up after too many retries.
    def crawlRetry(self, _data):
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # Flush batched item-info SQL when forced or full; True = reset the batch.
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # Flush batched hourly SQL when forced or full; True = reset the batch.
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemForHour(itemhoursql_list)
                #print '# insert hour data to database'
            return True
        return False

    # Flush batched "coming" SQL when forced or full; True = reset the batch.
    def insertItemComing(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemComing(itemsql_list)
                #print '# insert item coming data to database'
            return True
        return False

    # Flush batched position SQL when forced or full; True = reset the batch.
    def insertItemPosition(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.insertJhsGroupItemPosition(itemsql_list)
                #print '# insert position data to database'
            return True
        return False

    # Worker loop: consume the queue until empty, crawling one item per message.
    def crawl(self):
        # item sql batch lists
        _iteminfosql_list = []
        _itemhoursql_list = []
        _itemcomingsql_list = []
        _itempositionsql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch a message from the queue
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush all pending batches and exit
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []
                    # coming
                    self.insertItemComing(_itemcomingsql_list, True)
                    _itemcomingsql_list = []
                    # position
                    self.insertItemPosition(_itempositionsql_list, True)
                    _itempositionsql_list = []
                    break
                item = None
                crawl_type = ''
                if self.jhs_type == 'hour':
                    # hourly item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageGroupItemHour(_val)
                    #print '# Hour To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitem'
                    # collect result
                    #self.push_back(self.items, item.outTupleGroupItemHour())
                    update_Sql,hourSql = item.outTupleGroupItemHour()
                    if update_Sql:
                        self.mysqlAccess.updateJhsGroupItem(update_Sql)
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list):
                        _itemhoursql_list = []
                elif self.jhs_type == 'new':
                    # item info detail
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageGroupItem(_val)
                    #print '# To crawl item val : ', Common.now_s(), _val[0], _val[4], _val[5]
                    crawl_type = 'groupitemnew'
                    # collect result
                    self.push_back(self.items, item.outTupleGroupItem())
                    iteminfoSql = item.outTupleGroupItem()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    # unknown type: notify queue that this task is done
                    self.queue.task_done()
                    continue
                # archive the crawled page to mongodb
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)
                # delay between items
                time.sleep(1)
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoItemException as e:
                print 'Not item exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoPageException as e:
                print 'Not page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Exception as e:
                print 'Unknown exception crawl item :', e
                #traceback.print_exc()
                print '#####--Traceback Start--#####'
                tp,val,td = sys.exc_info()
                for file, lineno, function, text in traceback.extract_tb(td):
                    print "exception traceback err:%s,line:%s,in:%s"%(file, lineno, function)
                    print text
                print "exception traceback err:%s,%s,%s"%(tp,val,td)
                print '#####--Traceback End--#####'
                self.crawlRetry(_data)
                # notify queue that this task is done
                self.queue.task_done()
                if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                    print _data
                # re-dial the router unless the error was a read timeout
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,30))
class JHSItemM(MyThread):
    '''A class of jhs item thread manager.

    Consumes (retry, val) messages from the inherited MyThread queue and
    crawls brand items in one of several modes (_q_type): 'main' new items,
    'day' daily pass, 'hour' hourly pass, 'update'/'check' refreshes.
    Results are merged into redis, batched into mysql, pages archived to mongo.
    '''
    def __init__(self, _q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting shared result lists
        self.mutex = threading.Lock()
        self.worker_type = Config.JHS_Brand
        # message helper
        self.message = Message()
        # db accessors
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisAccess = RedisAccess()  # redis db
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # jhs queue type (main: new items, day: once per day,
        # hour: hourly, update: refresh existing)
        self._q_type = _q_type
        # appendix val appended to every queued value before crawling
        self.a_val = a_val
        # collected item results
        self.items = []
        # dial client used to re-dial the router (switch IP) on failures
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # items given up after too many retries
        self.giveup_items = []

    # Ask the dial client to re-dial the router (identified by module tag).
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    # Append v to shared list L under the mutex.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    # Enqueue a single work item with retry counter 0.
    def putItem(self, _item):
        self.put_q((0, _item))

    # Enqueue many work items with retry counter 0.
    def putItems(self, _items):
        for _item in _items:
            self.put_q((0, _item))

    # Backfill empty fields of a freshly-crawled item from its previous
    # redis record, so partial crawls don't erase known data.
    def mergeAct(self, item, prev_item):
        if prev_item:
            if not item.item_position or item.item_position == 0:
                item.item_position = prev_item["item_position"]
            if not item.item_juName or item.item_juName == '':
                item.item_juName = prev_item["item_juname"]
            if not item.item_juDesc or item.item_juDesc == '':
                item.item_juDesc = prev_item["item_judesc"]
            if not item.item_juPic_url or item.item_juPic_url == '':
                item.item_juPic_url = prev_item["item_jupic_url"]
            if not item.item_url or item.item_url == '':
                item.item_url = prev_item["item_url"]
            if not item.item_oriPrice or item.item_oriPrice == '':
                item.item_oriPrice = prev_item["item_oriprice"]
            if not item.item_actPrice or item.item_actPrice == '':
                item.item_actPrice = prev_item["item_actprice"]
            if not item.item_discount or item.item_discount == '':
                item.item_discount = prev_item["item_discount"]
            if not item.item_coupons or item.item_coupons == []:
                item.item_coupons = prev_item["item_coupons"].split(Config.sep)
            if not item.item_promotions or item.item_promotions == []:
                item.item_promotions = prev_item["item_promotions"].split(Config.sep)
            if not item.item_remindNum or item.item_remindNum == '':
                item.item_remindNum = prev_item["item_remindnum"]
            if not item.item_isLock_time or item.item_isLock_time == '':
                if prev_item["item_islock_time"] and prev_item["item_islock_time"] != '':
                    item.item_isLock_time = Common.str2timestamp(prev_item["item_islock_time"])
                # NOTE(review): lock flag is restored whenever lock time was
                # empty — nesting reconstructed from collapsed source; confirm.
                item.item_isLock = prev_item["item_islock"]
            if not item.item_starttime or item.item_starttime == 0.0:
                if prev_item["start_time"] and prev_item["start_time"] != '':
                    item.item_starttime = Common.str2timestamp(prev_item["start_time"])
            if not item.item_endtime or item.item_endtime == 0.0:
                if prev_item["end_time"] and prev_item["end_time"] != '':
                    item.item_endtime = Common.str2timestamp(prev_item["end_time"])

    # Merge with the previous redis record and write the item back to redis.
    def putItemDB(self, item):
        # redis
        keys = [self.worker_type, str(item.item_juId)]
        prev_item = self.redisAccess.read_jhsitem(keys)
        self.mergeAct(item, prev_item)
        val = item.outTupleForRedis()
        msg = self.message.jhsitemMsg(val)
        self.redisAccess.write_jhsitem(keys, msg)

    # Re-queue a failed item, or record it as given up after too many retries.
    def crawlRetry(self, _data):
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.item_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            self.push_back(self.giveup_items, _val)
            print "# retry too many times, no get item:", _val

    # Flush batched item-info SQL when forced or full; True = reset the batch.
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertJhsItemInfo(iteminfosql_list)
                #print '# insert data to database'
            return True
        return False

    # Flush batched daily SQL when forced or full; True = reset the batch.
    def insertItemday(self, itemdaysql_list, f=False):
        if f or len(itemdaysql_list) >= Config.item_max_arg:
            if len(itemdaysql_list) > 0:
                self.mysqlAccess.insertJhsItemForDay(itemdaysql_list)
                #print '# day insert data to database'
            return True
        return False

    # Flush batched hourly SQL when forced or full; True = reset the batch.
    def insertItemhour(self, itemhoursql_list, f=False):
        if f or len(itemhoursql_list) >= Config.item_max_arg:
            if len(itemhoursql_list) > 0:
                self.mysqlAccess.insertJhsItemForHour(itemhoursql_list)
                #print '# hour insert data to database'
            return True
        return False

    # Update an item's lock start/end time row in mysql.
    def updateItemLockStartEndtime(self, itemsql):
        if itemsql:
            self.mysqlAccess.updateJhsItemLockStartEndtime(itemsql)
            #print '# update data to database'

    # Flush batched update SQL when forced or full; True = reset the batch.
    def updateItems(self, itemsql_list, f=False):
        if f or len(itemsql_list) >= Config.item_max_arg:
            if len(itemsql_list) > 0:
                self.mysqlAccess.updateJhsItems(itemsql_list)
                #print '# update data to database'
            return True
        return False

    # Worker loop: consume the queue until empty, crawling one item per message.
    def crawl(self):
        # item sql batch lists
        _iteminfosql_list = []
        _itemdaysql_list = []
        _itemhoursql_list = []
        _itemupdatesql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch a message from the queue
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush all pending batches and exit
                    #print '# queue is empty', e
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    # day
                    self.insertItemday(_itemdaysql_list, True)
                    _itemdaysql_list = []
                    # hour
                    self.insertItemhour(_itemhoursql_list, True)
                    _itemhoursql_list = []
                    # update
                    #self.updateItems(_itemupdatesql_list, True)
                    #_itemupdatesql_list = []
                    break
                item = None
                if self._q_type == 'main':
                    # new item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # collect result; write to redis first
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTuple())
                    # persist to mysql
                    iteminfoSql = item.outTuple()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                elif self._q_type == 'day':
                    # daily item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageDay(_val)
                    # collect result
                    self.push_back(self.items, item.outSqlForDay())
                    # persist to mysql
                    daySql = item.outSqlForDay()
                    _itemdaysql_list.append(daySql)
                    if self.insertItemday(_itemdaysql_list):
                        _itemdaysql_list = []
                elif self._q_type == 'hour':
                    # hourly item instance
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageHour(_val)
                    # collect result; write to redis first
                    self.putItemDB(item)
                    self.push_back(self.items, item.outTupleHour())
                    # persist to mysql: immediate update plus hourly batch
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)
                    hourSql = item.outSqlForHour()
                    _itemhoursql_list.append(hourSql)
                    if self.insertItemhour(_itemhoursql_list):
                        _itemhoursql_list = []
                elif self._q_type == 'update':
                    # refresh an existing item
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageUpdate(_val)
                    # collect result; write to redis first
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # persist to mysql
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)
                elif self._q_type == 'check':
                    # check item instance (same flow as 'update')
                    item = JHSItem()
                    _val = _data[1]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPageUpdate(_val)
                    # collect result; write to redis first
                    self.putItemDB(item)
                    self.push_back(self.items, item.outSqlForUpdate())
                    # persist to mysql
                    updateSql = item.outSqlForUpdate()
                    if updateSql:
                        self.mysqlAccess.updateJhsItem(updateSql)
                # archive the crawled page to mongodb
                if item:
                    _pages = item.outItemPage(self._q_type)
                    self.mongofsAccess.insertJHSPages(_pages)
                # delay between items
                time.sleep(1)
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoItemException as e:
                print 'Not item exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoPageException as e:
                print 'Not page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_data)
                # notify queue that this task is done
                self.queue.task_done()
                #if str(e).find('Name or service not known') != -1 or str(e).find('Temporary failure in name resolution') != -1:
                #    print _data
                # re-dial the router unless the error was a read timeout
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,40))
class JHSActM(MyThread):
    '''A class of jhs activity item thread manager.

    Consumes (retry, val) messages from the inherited MyThread queue and
    crawls brand-sale activities; the numeric jhs_type selects the mode.
    '''
    def __init__(self, jhs_type, thread_num = 15, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock protecting shared result lists
        self.mutex = threading.Lock()
        # db accessors
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # appendix val (not applied in crawl(); kept for interface parity)
        self.a_val = a_val
        # jhs queue type: 1: upcoming brand-sale channel page,
        # 2: re-check activities not yet ended today, 3: new activities
        # (4: not-yet-started activities, 5: parse-only — see crawl())
        self.jhs_type = jhs_type
        # collected activity results
        self.items = []
        # dial client used to re-dial the router (switch IP) on failures
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'

    # Ask the dial client to re-dial the router (identified by module tag).
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            print '# To dial router exception :', e

    # Append v to shared list L under the mutex.
    def push_back(self, L, v):
        if self.mutex.acquire(1):
            #self.items.append(v)
            L.append(v)
            self.mutex.release()

    # Enqueue a single work item with retry counter 0.
    def putItem(self, _item):
        self.put_q((0, _item))

    # Enqueue many work items with retry counter 0.
    def putItems(self, _items):
        for _item in _items:
            self.put_q((0, _item))

    # Re-queue a failed item; drop it after too many retries (no giveup list).
    def crawlRetry(self, _data):
        if not _data:
            return
        _retry, _val = _data
        _retry += 1
        if _retry < Config.act_crawl_retry:
            _data = (_retry, _val)
            self.put_q(_data)
        else:
            print "# retry too many times, no get item:", _val

    # Flush batched activity SQL when forced or full; True = reset the batch.
    def insertAct(self, actsql_list, f=False):
        if f or len(actsql_list) >= Config.act_max_arg:
            if len(actsql_list) > 0:
                self.mysqlAccess.insertJhsAct(actsql_list)
            return True
        return False

    # Flush batched daily-alive SQL when forced or full; True = reset the batch.
    def insertActday(self, actdaysql_list, f=False):
        if f or len(actdaysql_list) >= Config.act_max_arg:
            if len(actdaysql_list) > 0:
                self.mysqlAccess.insertJhsActDayalive(actdaysql_list)
            return True
        return False

    # Flush batched hourly-alive SQL when forced or full; True = reset the batch.
    def insertActhour(self, acthoursql_list, f=False):
        if f or len(acthoursql_list) >= Config.act_max_arg:
            if len(acthoursql_list) > 0:
                self.mysqlAccess.insertJhsActHouralive(acthoursql_list)
            return True
        return False

    # Flush batched "coming" SQL when forced or full; True = reset the batch.
    def insertActcoming(self, actcomingsql_list, f=False):
        if f or len(actcomingsql_list) >= Config.act_max_arg:
            if len(actcomingsql_list) > 0:
                self.mysqlAccess.insertJhsActComing(actcomingsql_list)
            return True
        return False

    # Worker loop: consume the queue until empty, crawling one activity
    # per message according to jhs_type.
    def crawl(self):
        # sql batch lists
        #_actsql_list, _actdaysql_list, _acthoursql_list = [], [], []
        _actcomingsql_list = []
        while True:
            _data = None
            try:
                try:
                    # fetch a message from the queue
                    _data = self.get_q()
                except Empty as e:
                    # queue is empty: flush pending batch and exit
                    #print '# queue is empty', e
                    self.insertActcoming(_actcomingsql_list, True)
                    _actcomingsql_list = []
                    break
                item = None
                crawl_type = ''
                if self.jhs_type == 1:
                    # brand-sale activity instance: coming online soon
                    item = JHSAct()
                    # process info
                    _val = _data[1]
                    item.antPageComing(_val)
                    print '# To crawl coming activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'coming'
                    # collect result
                    self.push_back(self.items, item.outTupleForComing())
                    crawling_confirm,sql = item.outTupleForComing()
                    # persist to mysql only when the crawl is confirmed
                    if crawling_confirm == 1:
                        _actcomingsql_list.append(sql)
                        if self.insertActcoming(_actcomingsql_list):
                            _actcomingsql_list = []
                elif self.jhs_type == 2:
                    # brand-sale activity instance: check for newly added items
                    item = JHSAct()
                    # process info
                    _val = _data[1]
                    item.antPageHourcheck(_val)
                    #print '# To check activity val : ', Common.now_s(), _val[0], _val[1]
                    crawl_type = 'hourcheck'
                    # collect result
                    self.push_back(self.items, item.outTupleForHourcheck())
                elif self.jhs_type == 3:
                    # brand-sale activity instance
                    item = JHSAct()
                    # process info
                    _val = _data[1]
                    item.antPage(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'brand'
                    # collect result
                    self.push_back(self.items, item.outTuple())
                elif self.jhs_type == 4:
                    # brand-sale activity instance not yet started
                    item = JHSAct()
                    # process info
                    _val = _data[1]
                    item.antPageMain(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'main'
                    # collect result
                    self.push_back(self.items, item.outTuple())
                elif self.jhs_type == 5:
                    # parse brand-sale data only
                    item = JHSAct()
                    # process info
                    _val = _data[1]
                    item.antPageParser(_val)
                    #print '# To crawl activity val : ', Common.now_s(), _val[1], _val[2], _val[3]
                    crawl_type = 'parser'
                    # collect result
                    self.push_back(self.items, item.outTupleParse())
                else:
                    # unknown type: notify queue that this task is done
                    self.queue.task_done()
                    continue
                # archive the crawled page to mongodb
                if item and crawl_type != '':
                    _pages = item.outItemPage(crawl_type)
                    self.mongofsAccess.insertJHSPages(_pages)
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoActivityException as e:
                print 'Not activity exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.NoPageException as e:
                print 'Not page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Common.InvalidPageException as e:
                self.crawlRetry(_data)
                print 'Invalid page exception :', e
                # notify queue that this task is done
                self.queue.task_done()
            except Exception as e:
                print 'Unknown exception crawl item :', e
                Common.traceback_log()
                self.crawlRetry(_data)
                # notify queue that this task is done
                self.queue.task_done()
                # re-dial the router unless the error was a read timeout
                if str(e).find('Read timed out') == -1:
                    try:
                        self.dialRouter(4, 'chn')
                    except Exception as e:
                        print '# DailClient Exception err:', e
                        time.sleep(10)
                time.sleep(random.uniform(10,30))