class TCQ():
    '''Redis-backed message queue for one tc crawler object.

    The redis key is "<Config.TC_TYPE>_<_obj>" with "_<_q_type>" appended
    when a queue type is supplied.
    '''

    def __init__(self, _obj, _q_type=None):
        '''Store the queue identity, create the helpers and build the key.

        _obj    -- object name; second component of the redis key
        _q_type -- optional queue type; third component of the key when truthy
        '''
        self._obj = _obj
        self._q_type = _q_type
        # type prefix shared by every tc queue key
        self.tc_type = Config.TC_TYPE
        # redis access
        self.redisQueue = RedisQueue()
        # message builder
        self.message = Message()
        # redis key for this queue
        self._key = (
            '%s_%s_%s' % (self.tc_type, self._obj, self._q_type)
            if self._q_type
            else '%s_%s' % (self.tc_type, self._obj)
        )

    def clearQ(self):
        '''Clear the redis queue stored under this instance's key.'''
        self.redisQueue.clear_q(self._key)

    def putQ(self, _msg):
        '''Write a single message to the redis queue.'''
        self.redisQueue.put_q(self._key, _msg)

    def putlistQ(self, item_list):
        '''Convert each item tuple into a queue message and enqueue it.

        Every item is prefixed with (0, _obj, _q_type) and handed to
        Message.QueueMsg; falsy results are skipped.
        '''
        head = (0, self._obj, self._q_type)
        for entry in item_list:
            queue_msg = self.message.QueueMsg(self._obj, head + entry)
            if not queue_msg:
                continue
            self.putQ(queue_msg)
class JHSQ():
    '''Redis-backed message queue for one jhs crawler object.

    The redis key is "<Config.JHS_TYPE>_<_obj>" with "_<_q_type>" appended
    when a queue type is supplied.
    '''

    def __init__(self, _obj, _q_type=None):
        '''Store the queue identity, create the helpers and build the key.

        _obj    -- object name; second component of the redis key
        _q_type -- optional queue type; third component of the key when truthy
        '''
        self._obj = _obj
        self._q_type = _q_type
        # type prefix shared by every jhs queue key
        self.jhs_type = Config.JHS_TYPE
        # redis access
        self.redisQueue = RedisQueue()
        # message builder
        self.message = Message()
        # redis key for this queue
        self._key = (
            '%s_%s_%s' % (self.jhs_type, self._obj, self._q_type)
            if self._q_type
            else '%s_%s' % (self.jhs_type, self._obj)
        )

    def clearQ(self):
        '''Clear the redis queue stored under this instance's key.'''
        self.redisQueue.clear_q(self._key)

    def putQ(self, _msg):
        '''Write a single message to the redis queue.'''
        self.redisQueue.put_q(self._key, _msg)

    def putlistQ(self, item_list):
        '''Convert each item tuple into a jhs queue message and enqueue it.

        Every item is prefixed with (0, _obj, jhs_type) and handed to
        Message.jhsQueueMsg; falsy results are skipped.
        '''
        head = (0, self._obj, self.jhs_type)
        for entry in item_list:
            queue_msg = self.message.jhsQueueMsg(self._obj, head + entry)
            if not queue_msg:
                continue
            self.putQ(queue_msg)
class JMItemRedisM(MyThread):
    '''Worker that drains a jm item redis queue.

    Messages are read from the redis queue named by ``key``. For the 'main'
    queue type each message is crawled with ``Item``; the resulting SQL rows
    are collected in ``self.items`` and written to MySQL in batches.
    '''

    def __init__(self, key, q_type, thread_num=10, a_val=None):
        '''Create the storage handles and remember the queue settings.

        key        -- redis queue key this worker reads from and writes to
        q_type     -- jm queue type; crawl() only processes 'main'
        thread_num -- forwarded unchanged to MyThread.__init__
        a_val      -- optional value concatenated (+) to each message's "val"
        '''
        # parent construct
        MyThread.__init__(self, thread_num)

        # lock guarding appends made through push_back
        self.mutex = threading.Lock()

        # jm type
        self.jm_type = Config.JM_TYPE

        # db
        self.mysqlAccess = MysqlAccess()        # mysql access
        self.redisQueue = RedisQueue()          # redis queue
        self.mongofsAccess = MongofsAccess()    # mongodb fs access

        # jm queue type
        self.jm_queue_type = q_type
        # redis queue key
        self._key = key

        # appendix val
        self.a_val = a_val

        # SQL rows produced by crawl()
        self.items = []

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._tag = 'ikuai'

        # items given up after too many retries
        # NOTE(review): nothing in this class appends to this list.
        self.giveup_items = []

    def dialRouter(self, _type, _obj):
        '''Ask the dial client to redial for module "<_type>_<_obj>".

        Any exception raised while sending is logged and swallowed.
        '''
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def clearItemQ(self):
        '''Clear the redis queue stored under this worker's key.'''
        self.redisQueue.clear_q(self._key)

    def putItemQ(self, _msg):
        '''Write one message to the redis queue, wrapped as (0, _msg).

        NOTE(review): crawl() reads messages with _data["val"] while this
        method enqueues a tuple; confirm RedisQueue reconciles the formats.
        '''
        _data = (0, _msg)
        self.redisQueue.put_q(self._key, _data)

    def putItemlistQ(self, item_list):
        '''Enqueue every element of item_list through putItemQ.'''
        for _item in item_list:
            self.putItemQ(_item)

    def push_back(self, L, v):
        '''Append v to list L while holding the instance mutex.'''
        # "with" releases the lock even if append raises; the former manual
        # acquire/release pair would have left it held forever in that case.
        with self.mutex:
            L.append(v)

    def crawlRetry(self, _key, msg):
        '''Re-enqueue msg under _key unless its retry budget is used up.

        Falsy messages are ignored. msg["retry"] is incremented first; the
        limit is Config.item_crawl_retry when msg["obj"] is 'globalitem' and
        Config.crawl_retry otherwise. Exhausted messages are only logged.
        '''
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'globalitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    def insertGlobalItemHour(self, iteminfosql_list, f=False):
        '''Write the batch to MySQL when it is full or when f forces it.

        Returns True when the batch threshold (Config.item_max_arg) was
        reached or f is true, meaning the caller should start a new batch;
        an empty list is never sent to MySQL. Returns False otherwise.
        '''
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if iteminfosql_list:
                self.mysqlAccess.insertJMGlobalitemHour(iteminfosql_list)
            return True
        return False

    def crawl(self):
        '''Consume the redis queue until it has been found empty three times.

        Each empty poll flushes the pending batch and, except for the last
        one, sleeps 10 seconds; the empty-poll counter is never reset. For
        the 'main' queue type every message is crawled, its SQL row is added
        to self.items and to the pending batch, and the loop sleeps 1 second.
        Invalid-page and unexpected errors send the message to crawlRetry.
        '''
        pending_sql = []
        empty_polls, max_empty_polls = 0, 2
        handled = 0
        while True:
            # Reset on every iteration: if get_q itself fails, the handlers
            # below must not retry the previous message or hit an unbound
            # name (crawlRetry ignores falsy messages).
            _data = None
            try:
                _data = self.redisQueue.get_q(self._key)

                # queue is empty
                if not _data:
                    # flush whatever is still pending
                    self.insertGlobalItemHour(pending_sql, True)
                    pending_sql = []

                    empty_polls += 1
                    if empty_polls > max_empty_polls:
                        Common.log('# all get itemQ item num: %d' % handled)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue

                handled += 1
                if self.jm_queue_type != 'main':
                    continue

                # item instance
                item = Item()
                _val = _data["val"]
                if self.a_val:
                    _val = _val + self.a_val

                item.antPageGlobal(_val)
                # collect the result for the caller
                self.push_back(self.items, item.outGlobalSql())

                pending_sql.append(item.outGlobalSql())
                if self.insertGlobalItemHour(pending_sql):
                    pending_sql = []

                # delay between items
                time.sleep(1)

            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)

            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)

            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)

            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()

                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # redial the router
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as dial_err:
                        Common.log('# DailClient Exception err: %s' % dial_err)
                    time.sleep(10)
                time.sleep(random.uniform(10,30))