def queue_items(self):
    """Claim a batch of pending ``hainiu_queue`` rows and wrap them as consumers.

    Selects up to ``self.limit`` rows with ``type=1``, ``is_work=0`` and
    ``fail_times <= self.fail_times`` using ``SELECT ... FOR UPDATE``, builds
    one ``OGCConsumer`` per row, then sets ``is_work=1`` on those rows.

    Returns:
        list: one ``OGCConsumer`` per claimed row. Empty when no row matched
        or when any step failed; a failure is logged through ``self.rl`` and
        rolled back, so unclaimed rows are never handed out.
    """
    select_queue_sql = (
        " select id,action,params from hainiu_queue"
        " where type=1 and is_work=0 and fail_times<=%s"
        " limit 0,%s for update; "
    )
    update_queue_sql = " update hainiu_queue set is_work=1 where id in (%s); "

    return_list = []
    d = None    # stays None when the connection itself cannot be created
    sql = None  # last statement built, kept for the error log
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_queue_sql % (self.fail_times, self.limit)
        rows = d.read_dict(sql)

        query_ids = []
        for record in rows:
            row_id = record['id']
            action = record['action']
            params = record['params']
            query_ids.append(str(row_id))
            return_list.append(OGCConsumer(row_id, action, params))

        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % ids
            d.execute(sql)
    except Exception:
        self.rl.exception()
        if sql is not None:
            # Log the offending statement, as the sibling producers do.
            self.rl.error(sql)
        if d is not None:
            d.rollback()
        # The claim was rolled back, so the rows are still pending in the
        # table; returning them would let them be processed twice.
        return_list = []
    finally:
        if d is not None:
            d.close()
    return return_list
def queue_items(self):
    """Claim crawlable seeds from ``hainiu_web_seed`` and wrap them as consumers.

    Selects up to ``self.limit`` seeds with ``status=0``,
    ``fail_times <= self.fail_times`` and whose ``fail_ip`` does not contain
    the value returned by ``Util().get_local_ip()``, using
    ``SELECT ... FOR UPDATE``. A seed is kept only if it has never been
    crawled (``last_crawl_time`` is NULL) or its last crawl hour is not later
    than the hour returned by ``TimeUtil.get_dif_time(hour=-1)``. Kept seeds
    get ``status=1`` and a fresh ``last_crawl_time``.

    Returns:
        list: one ``NewsFindActionConsumer(id, url, category)`` per claimed
        seed. Empty when nothing qualified or when any step failed; a failure
        is logged through ``self.rl`` and rolled back.
    """
    ip = Util().get_local_ip()
    select_seed_sql = (
        " select id,url,category,domain,host,last_crawl_time from hainiu_web_seed"
        " where fail_times<=%s and locate('%s',fail_ip)=0 and status=0"
        " limit 0,%s for update; "
    )
    update_queue_sql = (
        " update hainiu_web_seed set status=1,last_crawl_time='%s'"
        " where id in (%s); "
    )

    return_list = []
    d = None    # stays None when the connection itself cannot be created
    sql = None  # last statement built, kept for the error log
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_seed_sql % (self.fail_times, ip, self.limit)
        rows = d.read_dict(sql)

        t = TimeUtil()
        # Hour-resolution cutoff; presumably "one hour ago" -- depends on
        # TimeUtil.get_dif_time, which is not visible here. It is the same
        # for every row, so compute it once.
        cutoff = int(t.str2timestamp(
            t.get_dif_time(hour=-1, format='%Y-%m-%d %H'),
            format='%Y-%m-%d %H'))

        query_ids = []
        for each in rows:
            last_crawl_time = each['last_crawl_time']
            # Test for NULL on the raw value: str(None) is the string 'None',
            # which would make an "is None" check useless and is not a
            # parseable timestamp.
            if last_crawl_time is not None:
                crawled_hour = int(t.str2timestamp(
                    str(last_crawl_time)[:13], '%Y-%m-%d %H'))
                if crawled_hour > cutoff:
                    continue  # crawled too recently, leave it for later

            # Seeds reaching this point were last crawled at least an hour
            # ago, or have never been crawled.
            seed_id = each['id']
            query_ids.append(str(seed_id))
            return_list.append(
                NewsFindActionConsumer(seed_id, each['url'], each['category']))

        if query_ids:
            ids = ','.join(query_ids)
            # Take the timestamp once so the printed value is the stored one.
            now = t.now_time()
            sql = update_queue_sql % (now, ids)
            print('%s %s' % (now, ids))
            d.execute(sql)
    except Exception:
        self.rl.exception()
        if sql is not None:
            self.rl.error(sql)
        if d is not None:
            d.rollback()
        # The claim was rolled back, so the seeds are still unclaimed in the
        # table; returning them would let them be crawled twice.
        return_list = []
    finally:
        if d is not None:
            d.close()
    return return_list
def queue_items(self):
    """Claim unassigned ``hainiu_queue`` rows and wrap them as consumers.

    Selects up to ``self.limit`` rows with ``type=0``,
    ``fail_times <= self.fail_times`` and whose ``fail_ip`` does not contain
    the value returned by ``Util().get_local_ip()``, using
    ``SELECT ... FOR UPDATE``, then sets ``type=1`` on them.

    Returns:
        list: one ``NewsFindQueueConsumer(id, action, params)`` per claimed
        row. Empty when no row matched or when any step failed; a failure is
        logged through ``self.rl`` and rolled back.
    """
    ip = Util().get_local_ip()
    select_queue_sql = (
        " select id,action,params from hainiu_queue"
        " where type=0 and fail_times<=%s and locate('%s',fail_ip)=0"
        " limit 0,%s for update; "
    )
    # type=1 means the URL has already been handed to a consumer.
    update_queue_sql = " update hainiu_queue set type=1 where id in (%s); "

    return_list = []
    d = None    # stays None when the connection itself cannot be created
    sql = None  # last statement built, kept for the error log
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_queue_sql % (self.fail_times, ip, self.limit)
        rows = d.read_dict(sql)
        print(rows)

        query_ids = []
        for each in rows:
            row_id = each['id']
            url = each['action']
            category = each['params']
            query_ids.append(str(row_id))
            return_list.append(NewsFindQueueConsumer(row_id, url, category))

        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % ids
            d.execute(sql)
    except Exception:
        self.rl.exception()
        if sql is not None:
            self.rl.error(sql)
        if d is not None:
            d.rollback()
        # The claim was rolled back, so the rows are still type=0 in the
        # table; returning them would let them be processed twice.
        return_list = []
    finally:
        if d is not None:
            d.close()
    return return_list
def queue_items(self):
    """Claim download-ready ``hainiu_queue`` rows and wrap them as consumers.

    Selects up to ``self.limit`` rows with ``type=2``,
    ``fail_times <= self.fail_times`` and whose ``fail_ip`` does not contain
    the value returned by ``Util().get_local_ip()``, using
    ``SELECT ... FOR UPDATE``, then sets ``type=3`` on them.

    Returns:
        list: one ``DownloadActionConsumer(id, action, params)`` per claimed
        row. Empty when no row matched or when any step failed; a failure is
        logged through ``self.rl`` and rolled back.
    """
    ip = Util().get_local_ip()
    select_queue_sql = (
        " select id,action,params from hainiu_queue"
        " where fail_times<=%s and locate('%s',fail_ip)=0 and type=2"
        " limit 0,%s for update; "
    )
    # type=3 means the row has already been picked up by a consumer process.
    update_queue_sql = " update hainiu_queue set type=3 where id in (%s); "

    return_list = []
    d = None    # stays None when the connection itself cannot be created
    sql = None  # last statement built, kept for the error log
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_queue_sql % (self.fail_times, ip, self.limit)
        rows = d.read_dict(sql)

        query_ids = []
        for each in rows:
            row_id = each['id']
            action = each['action']
            params = each['params']
            query_ids.append(str(row_id))
            return_list.append(DownloadActionConsumer(row_id, action, params))

        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % ids
            d.execute(sql)
    except Exception:
        self.rl.exception()
        if sql is not None:
            self.rl.error(sql)
        if d is not None:
            d.rollback()
        # The claim was rolled back, so the rows are still type=2 in the
        # table; returning them would let them be downloaded twice.
        return_list = []
    finally:
        if d is not None:
            d.close()
    return return_list