def queue_items(self):
    '''
    Take pending messages from the queue, wrap each one as a consumer
    action, then update the queue status of the rows that were handed out.
    :return: list of wrapped consumer actions
    '''
    # Variant that also skips rows this machine already failed on,
    # filtering by the machine's IP (fail_ip):
    # select_queue_sql = """
    #     select id,action,params from hainiu_queue
    #     where type=1 and fail_ip<>'%s' and fail_times<=%s
    #     limit 0,%s for update;
    # """
    select_queue_sql = """
        select id,action,params from hainiu_queue
        where type=1 and fail_times<=%s
        limit 0,%s for update;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    return_list = []
    d = None
    sql = ""
    try:
        d = DBUtil(config._HAINIU_DB)
        # u = Util()
        # ip = u.get_local_ip()
        # sql = select_queue_sql % (ip, self.fail_times, self.limit)
        sql = select_queue_sql % (self.fail_times, self.limit)
        select_dict = d.read_dict(sql)
        if len(select_dict) == 0:
            return return_list
        query_ids = []
        for record in select_dict:
            id = record["id"]
            action = record["action"]
            params = record["params"]
            query_ids.append(str(id))
            c = HainiuConsumer(id, action, params)
            return_list.append(c)
        # mark the taken rows so other machines will not pick them up
        ids = ",".join(query_ids)
        sql = update_queue_sql % ids
        d.execute(sql)
    except Exception:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        if d is not None:
            d.close()
    return return_list
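# Every snippet in this section leans on the project's DBUtil helper, whose
# source is outside this section. Below is a minimal sketch of the interface
# the calls above assume, backed by MySQLdb; the internals illustrate the
# assumed semantics (execute commits, execute_no_commit leaves the
# transaction open), not the project's actual implementation.
import MySQLdb
import MySQLdb.cursors

class DBUtilSketch(object):
    def __init__(self, db_conf):
        # db_conf is assumed to be a dict such as _HAINIU_DB / _ZZ_DB
        # holding host/user/passwd/db/port keys
        self.conn = MySQLdb.connect(charset='utf8', **db_conf)

    def read_one(self, sql, params=None):
        cur = self.conn.cursor()
        cur.execute(sql, params)
        return cur.fetchone()            # a tuple, e.g. (count,)

    def read_dict(self, sql, params=None):
        cur = self.conn.cursor(MySQLdb.cursors.DictCursor)
        cur.execute(sql, params)
        return cur.fetchall()            # a tuple of dicts

    def execute(self, sql, params=None):
        # run and commit immediately
        self.conn.cursor().execute(sql, params)
        self.conn.commit()

    def execute_no_commit(self, sql, params=None):
        # run inside the open transaction; the caller commits or rolls back
        self.conn.cursor().execute(sql, params)

    def executemany(self, sql, seq_of_params):
        cur = self.conn.cursor()
        cur.executemany(sql, seq_of_params)
        self.conn.commit()

    def commit(self):
        self.conn.commit()

    def rollback(self):
        self.conn.rollback()

    def close(self):
        self.conn.close()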
def queue_items(self):
    '''
    Use a pessimistic lock plus a transaction plus a status update so that
    multiple machines take rows from the queue serially, and return the rows
    wrapped as a list of NewsFindConsumerAction instances.
    '''
    select_sql = """
        select id,action,params from hainiu_queue
        where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s
        limit %s for update;
    """
    # the update SQL splices the id list in by string interpolation
    update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
    """
    c_actions = []
    # collects the ids of the taken rows, for the status update
    ids = []
    db_util = DBUtil(_HAINIU_DB)
    try:
        # variant without the fail_ip filter:
        # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
                      _QUEUE_NEWS_FIND['LIMIT_NUM']]
        # a tuple of dicts, e.g. ({...}, {...})
        res1 = db_util.read_dict(select_sql, sql_params)
        for row in res1:
            id = row['id']
            ids.append(str(id))
            act = row['action']
            params = row['params']
            c_action = NewsFindConsumerAction(id, act, params)
            c_actions.append(c_action)
        if len(ids) > 0:
            # mark the taken rows, then commit to release the row locks
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
    return c_actions
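# Why "for update" serializes the machines: inside a transaction, InnoDB
# locks the selected rows until commit/rollback, so a second worker running
# the same select blocks until the first one finishes. A standalone sketch
# of the same pattern with raw MySQLdb (connection values are placeholders):
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='secret',
                       db='hainiu', charset='utf8')
try:
    cur = conn.cursor()
    # lock up to 5 pending rows; a concurrent identical select now blocks
    cur.execute("select id from hainiu_queue "
                "where type=1 and is_work=0 limit 5 for update;")
    ids = [str(r[0]) for r in cur.fetchall()]
    if ids:
        # claim the locked rows, then commit to release the row locks
        cur.execute("update hainiu_queue set is_work=1 where id in (%s);"
                    % ",".join(ids))
    conn.commit()
except Exception:
    conn.rollback()
    raise
finally:
    conn.close()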
def queue_items(self):
    # query that skips rows this machine already failed on (fail_ip filter)
    select_sql = '''
        select id, action, params from web_queue
        where type=%s and is_work=%s and fail_ip != %s and fail_times < %s
        limit 0, %s for update;
    '''
    update_sql = '''
        update web_queue set is_work=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    actions = []
    try:
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]
        res = db_util.read_dict(select_sql, sql_params)
        ids = []
        for row in res:
            id = row["id"]
            ids.append(str(id))
            action = row["action"]
            params = row["params"]
            # wrap the row as a consumer action
            c_action = WebConsumerAction(id, action, params)
            actions.append(c_action)
        if len(actions) != 0:
            # mark the taken rows with is_work=1
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception:
        actions = []
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
    return actions
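# The "where id in (%s)" update above splices the id list into the SQL with
# string interpolation, which is tolerable only because the ids come straight
# from our own select. A sketch of a fully parameterized alternative that
# generates one placeholder per id (function name is illustrative):
def update_taken_rows(db_util, ids):
    # ids is a list like ['3', '7', '19']
    placeholders = ",".join(["%s"] * len(ids))
    sql = "update web_queue set is_work=1 where id in (%s);" % placeholders
    # e.g. "... where id in (%s,%s,%s);" executed with params ['3','7','19']
    db_util.execute_no_commit(sql, ids)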
def put_queue(page_show_num):
    db_util = DBUtil(_ZZ_DB)
    # count the queue rows that are still waiting to be worked
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the matching rows in web_seed
    count_seed_sql = '''
        select count(*) from web_seed where status=0;
    '''
    # page through web_seed
    select_seed_limit_sql = '''
        select id,url,category from web_seed where status=0 limit %s,%s;
    '''
    # insert rows into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # flip the status of the imported web_seed rows
    update_sql = '''
        update web_seed set status=1 where id in(%s);
    '''
    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None
        start_time = time.time()
        # count the matching web_seed rows
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]
        # work out how many pages to read
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 \
            else total_num2 / page_show_num + 1
        # page through the seed table
        ids = []
        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))
            # batch-insert this page into the queue
            db_util.executemany(insert_queue_sql, list1)
        # flip status=1 on everything that was imported
        if ids:
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
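# The page_num conditional above is just integer ceiling division. Two
# equivalent spellings (Python 2 integer division assumed):
total_num2, page_show_num = 105, 10

page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 \
    else total_num2 / page_show_num + 1                        # -> 11

page_num = (total_num2 + page_show_num - 1) / page_show_num    # -> 11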
def put_seed():
    # count the web_seed rows that are still pending
    count_queue_sql = '''
        select count(*) from web_seed where status=%s and fail_times < %s;
    '''
    # count the matching rows in web_seed_externally
    count_exter_sql = '''
        select count(*) from web_seed_externally where status=0;
    '''
    # page through web_seed_externally
    select_exter_limit_sql = '''
        select id,a_url,a_md5,a_host,param from web_seed_externally
        where status=0 limit %s,%s;
    '''
    # insert rows into the seed table
    insert_seed_sql = '''
        insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''
    # flip the status of the imported web_seed_externally rows
    update_sql = '''
        update web_seed_externally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
    res1 = db_util.read_one(count_queue_sql, sql_params)
    total_num1 = res1[0]
    if total_num1 != 0:
        print "queue has %d records,not insert!" % total_num1
        return None
    start_time = time.time()
    res2 = db_util.read_one(count_exter_sql)
    total_num2 = res2[0]
    # work out how many pages to read
    page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ["LIMIT_NUM"] == 0 \
        else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1
    # hu = HtmlUtil()
    # u = Util()
    # page through and insert into the seed table
    try:
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_exter_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                url = row["a_url"]
                domain = get_tld(url)
                # host = hu.get_url_host(url)
                # md5 = u.get_md5(url)
                host = row["a_host"]
                md5 = row["a_md5"]
                category = row["param"]
                list1.append((url, md5, domain, host, category))
            # batch-insert this page into web_seed
            db_util.executemany(insert_seed_sql, list1)
        # flip status=1 (execute commits on its own)
        if ids:
            db_util.execute(update_sql % ",".join(ids))
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
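# get_tld here presumably comes from the third-party "tld" package
# (pip install tld). In the older releases this Python 2 code would use,
# get_tld returned the registered domain of the URL; newer releases moved
# that behavior to get_fld and made get_tld return only the suffix. The URL
# below is a made-up example:
from tld import get_tld

print get_tld('http://www.news.example.co.uk/a/b.html')
# old tld releases: 'example.co.uk' (the value stored in the domain column)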
def put_queue_inner():
    # count the queue rows that are still waiting to be worked
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the matching rows in web_seed_internally
    count_inner_sql = '''
        select count(*) from web_seed_internally where status=0;
    '''
    # page through web_seed_internally
    select_inner_limit_sql = '''
        select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''
    # insert rows into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # flip the status of the imported web_seed_internally rows
    update_sql = '''
        update web_seed_internally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None
        start_time = time.time()
        res2 = db_util.read_one(count_inner_sql)
        total_num2 = res2[0]
        # work out how many pages to read
        page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ["LIMIT_NUM"] == 0 \
            else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1
        # page through and insert into the queue table
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["a_url"]
                params1 = row["param"]
                type = 2
                list1.append((type, action, params1))
            # batch-insert this page into the queue
            db_util.executemany(insert_queue_sql, list1)
        # flip status=1; execute commits on its own, so the trailing commit
        # only matters for the execute_no_commit variant
        if ids:
            db_util.execute(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and bulk-load the rows into hainiu_queue.
    :param page_show_num: number of rows to fetch per page
    '''
    # count the unprocessed hainiu_queue rows
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching rows in the seed table
    select_seed_count_sql = """
        select count(*) from hainiu_web_seed where status=0;
    """
    # page through the seed table
    select_seed_limit_sql = """
        select url, md5, domain, host, category from hainiu_web_seed
        where status=0 limit %s,%s;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) count the unprocessed hainiu_queue rows
        sql_params = [1]
        # res1 is a tuple like (count,)
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue still has %d unprocessed records, skipping the import!" % queue_count)
            return None
        start_time = time.time()
        # 2) count the matching rows in the seed table
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]
        # work out how many pages to read
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1
        # page through the seed table
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # a tuple of dicts, e.g. ({...}, {...})
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # rows to insert into the queue table
            insert_queue_values = []
            for row in res3:
                # url, md5, domain, host, category
                act = row['url']
                # build a fresh params dict per row
                params_dict = {
                    'md5': row['md5'],
                    'domain': row['domain'],
                    'host': row['host'],
                    'category': row['category'],
                }
                params_json = json.dumps(params_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_values.append((1, act, params_json))
            # batch-insert this page into the queue table
            db_util.executemany(insert_queue_sql, insert_queue_values)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records in %.2f seconds" % (seed_count, run_time))
    except Exception as e:
        logger.exception(e)
    finally:
        db_util.close()
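# What one hainiu_queue row produced above looks like: type=1, action is the
# seed URL, and params is the JSON-serialized metadata. The field values
# below are made-up placeholders for illustration:
import json

params_dict = {
    'md5': 'd41d8cd98f00b204e9800998ecf8427e',
    'domain': 'example.com',
    'host': 'news.example.com',
    'category': 'news',
}
params_json = json.dumps(params_dict, ensure_ascii=False)
row = (1, 'http://news.example.com/', params_json)
print row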
def put_inner_to_queue():
    '''
    Page through hainiu_web_seed_internally, bulk-load the unprocessed rows
    into hainiu_queue (type=2), and flip their status as each batch is imported.
    '''
    page_show_num = 10
    # count the unprocessed hainiu_queue rows
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching rows in the internal-link table
    select_inner_count_sql = """
        select count(*) from hainiu_web_seed_internally where status=0;
    """
    # page through the internal-link table; always "limit 0,%s" because each
    # batch is flipped to status=1 before the next select runs
    select_inner_limit_sql = """
        select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally
        where status=0 limit 0,%s;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # flip the status of the imported internal-link rows
    update_inner_status_sql = """
        update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s;
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # count the unprocessed hainiu_queue rows
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue still has %d unprocessed records, skipping the import!" % queue_count)
            return None
        # count the matching rows in the internal-link table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]
        # work out how many pages to read
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # page through the internal-link table
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # rows to insert into the queue table
            insert_queue_record = []
            # (a_md5, md5) pairs whose status must be flipped
            update_inner_status_record = []
            for row in res3:
                # md5,a_url,a_md5,domain,a_host,a_title
                a_url = row['a_url']
                # build a fresh params dict per row
                param_dict = {
                    'md5': row['md5'],
                    'a_md5': row['a_md5'],
                    'domain': row['domain'],
                    'a_host': row['a_host'],
                    'a_title': row['a_title'],
                }
                param_json = json.dumps(param_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_record.append((2, a_url, param_json))
                update_inner_status_record.append((row['a_md5'], row['md5']))
            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql, update_inner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records in %.2f seconds" % (inner_count, run_time))
    except Exception:
        traceback.print_exc()
        db_util.rollback()
    finally:
        db_util.close()
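# A minimal way to drive the two producers by hand, assuming they live in a
# module with the imports used above; in the real project a scheduler (for
# example a cron job or a timed thread) would call them instead of __main__:
if __name__ == '__main__':
    put_seed_to_queue(page_show_num=10)
    put_inner_to_queue()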