def __init__(self, queue, queue_name, p_action, p_sleep_time, c_max_num, c_max_sleep_time, c_max_retry_num):
    '''
    Initialize the producer's state.

    :param queue:            Queue object that produced items are put into
    :param queue_name:       queue name; each business flow has its own queue,
                             so the name identifies the business flow
    :param p_action:         the business flow's ProducerAction object
    :param p_sleep_time:     sleep interval between one production run and the next
    :param c_max_num:        maximum number of consumer threads; determines how
                             many consumer threads get created
    :param c_max_sleep_time: consumer threads' sleep interval between consuming runs
    :param c_max_retry_num:  maximum number of retries allowed when a
                             ConsumerAction instance fails to consume
    '''
    # 1) explicitly call the parent class's __init__()
    super(self.__class__, self).__init__()
    # 2) initialize parameters
    self.queue = queue
    self.queue_name = queue_name
    self.p_action = p_action
    self.p_sleep_time = p_sleep_time
    self.c_max_num = c_max_num
    self.c_max_sleep_time = c_max_sleep_time
    self.c_max_try_num = c_max_retry_num
    # 3) validate p_action
    if not isinstance(p_action, ProducerAction):
        raise Exception("%s is not ProducerAction instance!" % p_action)
    # 4) initialize the logger
    self.thread_name = '%s_producer' % self.queue_name
    self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)
def __init__(self, queue, q_name, p_action, p_sleep_time, c_max_num, c_max_sleep_time, c_max_retry_num):
    '''
    :param queue:            queue object
    :param q_name:           queue name
    :param p_action:         producer action object
    :param p_sleep_time:     sleep time after each production run
    :param c_max_num:        maximum number of consumer threads
    :param c_max_sleep_time: maximum sleep time after each consuming run
    :param c_max_retry_num:  maximum number of retries after a failed run
    :return:
    '''
    super(self.__class__, self).__init__()
    self.queue = queue
    self.q_name = q_name
    self.p_action = p_action
    self.p_sleep_time = p_sleep_time
    self.c_max_num = c_max_num
    self.c_max_sleep_time = c_max_sleep_time
    self.c_max_retry_num = c_max_retry_num
    # verify that p_action is an instance of a ProducerAction subclass
    if not isinstance(self.p_action, ProducerAction):
        raise Exception("%s is not ProducerAction instance" % self.p_action)
    # initialize the logger
    self.logger = LogUtil().get_logger('producer_%s' % self.q_name, 'producer_%s' % self.q_name)
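# A minimal wiring sketch for the producer initializers above. It assumes the
# class is named Producer and subclasses threading.Thread (so start()/join()
# are inherited), and that ProducerAction subclasses expose a queue_items()
# hook -- all of that is an assumption, not confirmed by these snippets.
from Queue import Queue

class DemoProducerAction(ProducerAction):   # hypothetical action class
    def queue_items(self):
        # hypothetical hook: return the list of items to enqueue
        return []

queue = Queue()
producer = Producer(queue, 'download_news', DemoProducerAction(),
                    p_sleep_time=5,      # produce every 5 seconds
                    c_max_num=5,         # up to 5 consumer threads
                    c_max_sleep_time=2,  # consumers rest 2s between runs
                    c_max_retry_num=3)   # retry each failed action up to 3 times
producer.start()
producer.join()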
def __init__(self, url, param, queue_id, pro_flag):
    ConsumerAction.__init__(self)
    # strip a trailing '/' so downstream URL joins stay consistent
    self.url = url[:-1] if url.endswith('/') else url
    self.param = param
    self.queue_id = queue_id
    self.pro_flag = pro_flag
    # queue_name is not a parameter here; it is expected to be a
    # module-level name in the original source
    self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
def __init__(self, kafka_conf):
    # shuffle the broker list so connections spread across hosts
    host_list = [host for host in kafka_conf['HOST'].split(',')]
    random.shuffle(host_list)
    host_str = ','.join(host_list)
    self.cache_key = '_'.join((host_str, kafka_conf['TOPIC']))
    self.host = host_str
    self.topic = kafka_conf['TOPIC']
    self.rl = LogUtil().get_logger('consumer', 'consumer_kafka')
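# The dict shape kafka_conf must have, read off the lookups above; the broker
# addresses and topic name below are placeholders.
kafka_conf = {
    'HOST': 'broker1:9092,broker2:9092,broker3:9092',  # comma-separated broker list
    'TOPIC': 'hainiu_news',                            # placeholder topic name
}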
def __init__(self, text):
    '''
    Initialize the consumer implementation class.

    :param text: the data this consumer will process
    :return:
    '''
    super(self.__class__, self).__init__()
    self.text = text
    self.rl = LogUtil().get_base_logger()
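# Sketch of a complete ConsumerAction subclass built around this pattern.
# The convention that action() returns a success flag is inferred from the
# retry parameters elsewhere in these snippets, not confirmed; TextPrintAction
# is a hypothetical name.
class TextPrintAction(ConsumerAction):

    def __init__(self, text):
        super(self.__class__, self).__init__()
        self.text = text
        self.rl = LogUtil().get_base_logger()

    def action(self):
        is_success = True
        try:
            self.rl.info("consuming: %s" % self.text)
        except:
            is_success = False
            self.rl.exception()
        return is_success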
def put_inner_to_queue():
    redis_util = RedisUtill()
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # insert into the hainiu_queue table
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    db_util.execute_no_commit("set NAMES utf8mb4;")
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, skipping import!" % queue_count)
            return None
        inner_count = 0
        # ips, port and scan_limit_to_queue_table are expected to be
        # module-level names in the original source
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            inner_count = inner_count + len(key_list)
            # fetch the value list from Redis for the collected keys
            values = redis_util.get_values_batch_keys(key_list)
            # import into the hainiu_queue table
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                a_url = queue_param['a_url']
                insert_queue_record.append((2, a_url, value))
            db_util.executemany_no_commit(insert_queue_sql, insert_queue_record)
            db_util.commit()
            # delete the imported keys from Redis
            redis_util.delete_batch(key_list)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (inner_count, run_time))
    except Exception:
        traceback.print_exc()
        db_util.rollback()
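# Sketch of the module-level scan helper that put_inner_to_queue calls; it is
# adapted from the nested version inside put_queue_inner further down. The
# key_list output parameter matches the call site above but is an assumption.
import redis

def scan_limit_to_queue_table(host, port, cursor, match, count, key_list):
    r = redis.Redis(host, port)
    while True:
        cursor, keys = r.scan(cursor, match, count)
        key_list.extend(keys)
        if cursor == 0:   # Redis signals a finished scan with cursor 0
            break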
def __init__(self, limit, fail_times):
    '''
    Initialize the queue producer.

    :param limit:      how many records to take from the queue per run
    :param fail_times: fail-count threshold that fetched records must satisfy
    '''
    super(self.__class__, self).__init__()
    self.limit = limit
    self.fail_times = fail_times
    # queue_name is expected to be a module-level name in the original source
    self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)
def __init__(self, id, ac, params):
    '''
    Initialize the queue consumer.

    :param id:     message ID, i.e. the id column of the database table
    :param ac:     message action, i.e. the action column of the database table
    :param params: extra parameters for the message action
    '''
    super(self.__class__, self).__init__()
    self.id = id
    self.ac = ac
    self.params = params
    # queue_name is expected to be a module-level name in the original source
    self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    select_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            # status flips to 1 after each batch, so always read from offset 0
            sql = select_news_seed_internally_sql % (0, page_size)
            rows = d.read_tuple(sql)
            values = []
            id_values = []
            for row in rows:
                url = row[0]
                url = url if url is not None else ''
                param = row[1]
                param = param if param is not None else ''
                values.append((url, param))
                id = row[2]
                id_values.append(str(id))
            if len(id_values) != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # publisher is the first label of the registered domain,
                # e.g. 'sina' from 'sina.com.cn'
                publisher = get_tld(url)
                publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = row[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if len(values) != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def __init__(self, queue, name, sleep_time, work_try_num):
    '''
    Initialize the consumer thread.

    :param queue:        the queue to consume from
    :param name:         the consumer thread's name, used to identify the consumer
    :param sleep_time:   how long to rest before the next consuming run
    :param work_try_num: how many failures each consuming action is allowed
    '''
    super(self.__class__, self).__init__()
    self.queue = queue
    self.name = name
    self.sleep_time = sleep_time
    self.work_try_num = work_try_num
    Consumer._WORK_TRY_NUM = work_try_num
    # log file is keyed by the thread-name prefix before the first underscore
    self.rl = LogUtil().get_logger('consumer', 'consumer' + self.name[:self.name.find("_")])
def send_sms(self, content, phone=config._ALERT_PHONE):
    """Send an alert SMS with the given content to the given phone number."""
    l = LogUtil().get_base_logger()
    try:
        send_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (phone)
        # the SMS gateway expects GBK-encoded content
        send_url += urllib.urlencode({'content': content.decode('utf-8').encode('gbk')})
        r = urllib2.urlopen(send_url).read()
        if '0-OK' != r:
            l.error("SMS send failed, server returned status: %s, phone: %s, content: %s" % (r, phone, content))
            return False
    except:
        l.exception()
        return False
    return True
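# Hypothetical usage; the class that owns send_sms is not shown in these
# snippets, so AlertUtil is a placeholder name.
alert = AlertUtil()
if not alert.send_sms('hainiu_queue backlog over threshold'):
    LogUtil().get_base_logger().error('alert sms failed, check the sms gateway')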
def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
    '''
    :param queue:          queue object
    :param thread_name:    consumer thread name
    :param max_sleep_time: sleep time after each consuming run
    :param max_retry_num:  maximum number of retries after each failure
    :return:
    '''
    # call the parent class's initializer so the run method can work
    super(self.__class__, self).__init__()
    self.queue = queue
    self.thread_name = thread_name
    self.max_sleep_time = max_sleep_time
    self.max_retry_num = max_retry_num
    # initialize the logger
    self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)
def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
    '''
    Initialize the consumer thread's state.

    :param queue:          Queue object to take items to consume from
    :param thread_name:    thread name, used when logging from the thread
    :param max_sleep_time: sleep interval between consuming runs
    :param max_retry_num:  maximum number of retries allowed when a
                           ConsumerAction instance fails to consume
    '''
    # 1) explicitly call the parent class's __init__()
    super(self.__class__, self).__init__()
    # 2) initialize parameters
    self.queue = queue
    self.thread_name = thread_name
    self.max_sleep_time = max_sleep_time
    self.max_retry_num = max_retry_num
    # 3) initialize the logger
    self.logger = LogUtil().get_logger(thread_name, thread_name)
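# A consumer-side wiring sketch, assuming the class above is named Consumer
# and subclasses threading.Thread; thread names and counts are illustrative.
from Queue import Queue

queue = Queue()
for i in range(3):   # c_max_num consumers, here 3 for illustration
    c = Consumer(queue, 'download_news_consumer_%d' % i,
                 max_sleep_time=2, max_retry_num=3)
    c.start()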
def push_queue_items():
    insert_sql = """
        insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
        select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
        select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = insert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total

        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page
        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def __init__(self, queue, action, name, max_num, sleep_time, work_sleep_time, work_try_num):
    '''
    Initialize the producer thread.

    :param queue:           the queue to use
    :param action:          the producer action
    :param name:            the producer's name
    :param max_num:         how many consumers to start
    :param sleep_time:      how long to rest before the next production run
    :param work_sleep_time: each consumer's rest time
    :param work_try_num:    how many failures each consuming action is allowed
    '''
    super(self.__class__, self).__init__()
    self.queue = queue
    self.action = action
    self.name = name
    self.max_num = max_num
    self.sleep_time = sleep_time
    self.work_sleep_time = work_sleep_time
    self.work_try_num = work_try_num
    self.rl = LogUtil().get_logger('producer', 'producer' + self.name)
    if not isinstance(self.action, base_producer_action.ProducerAction):
        raise Exception('Action not Producer base')
def create_seed():
    sql = """
        insert into web_seed (url,md5,domain,host,category,status) values ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def create_seed():
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
        insert into hainiu_web_seed (url,md5,domain,host,category,status) values ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def action(self):
    logger = LogUtil().get_logger("download_action", "download_action")
    # 1) download the HTML content of the queued URL into a file; each consumer
    #    thread rolls over to a new file every 5 minutes.
    r = RequestUtil()
    u = Util()
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()

    # request the URL via phantomjs so the page's ajax requests are included
    html = r.http_get_phandomjs(self.act)
    # build the record to write: newlines in the HTML are escaped as \002,
    # fields are joined with \001, and the record is prefixed with its md5
    html = html.replace("\r", "").replace("\n", "\002")
    str1 = self.act + "\001" + html
    str2 = u.get_md5(str1) + "\001" + str1

    # success/failure flag
    is_success = True

    # current time, split into day / hour / minute
    now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",")
    day = now_time[0]
    hour = now_time[1]
    minute = int(now_time[2])
    # round the minute down to the nearest 5-minute boundary
    for i in range(60, -5, -5):
        if minute < i:
            continue
        minute = i
        break
    minute = '0%s' % minute if minute < 10 else minute
    now_minute = '%s%s%s' % (day, hour, minute)

    file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp'))
    logger.info("file_names:%s" % file_names)
    thread_name = self.consumer_thread_name
    logger.info("thread_name:%s" % thread_name)
    # find this thread's current tmp file, if any
    last_file_name = ''
    for file_name in file_names:
        tmp = file_name.split("#")[0]
        if tmp == thread_name:
            last_file_name = file_name
            break
    now_file_name = "%s#%s" % (thread_name, now_minute)
    try:
        if last_file_name == '' or last_file_name != now_file_name:
            # move the old file to the done directory
            oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
            logger.info("oldPath:%s" % oldPath)
            if last_file_name != '':
                done_file_name = last_file_name + "#" + str(TimeUtil().get_timestamp())
                logger.info("last_file_name:%s" % last_file_name)
                newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name
                logger.info("newPath:%s" % newPath)
                shutil.move(oldPath, newPath)
            # write to a new file
            now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name
            logger.info("now_file_name:%s" % now_file_name)
            f = open(now_file_name, 'a+')
            f.write(str2)
            f.close()
        else:
            last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name
            logger.info("last_file_name:%s" % last_file_name)
            # prepend a newline when appending to the existing file
            insert_str = "\n" + str2
            f = open(last_file_name, 'a+')
            f.write(insert_str)
            f.close()
    except Exception:
        is_success = False
        traceback.print_exc()
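# A small check of the record framing used above: newlines in the HTML become
# \002, fields are joined by \001, and the md5 of the joined fields leads the
# record. The URL and HTML values are placeholders; Util().get_md5 is the
# project helper used elsewhere in these snippets.
url = 'http://example.com/a'
html = '<html>\nline2</html>'.replace('\r', '').replace('\n', '\002')
body = url + '\001' + html
record = Util().get_md5(body) + '\001' + body
# parsing the record back:
md5, a_url, page = record.split('\001')
assert page.replace('\002', '\n') == '<html>\nline2</html>'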
def put_inner_to_queue():
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching records in the internal-link table
    select_inner_count_sql = """
        select count(*) from hainiu_web_seed_internally where status=0;
    """
    # page through the internal-link table
    select_inner_limit_sql = """
        select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally
        where status=0 limit 0,%s;
    """
    # insert into the hainiu_queue table
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # update the internal-link table's status
    update_inner_status_sql = """
        update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s;
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, skipping import!" % queue_count)
            return None

        # count the matching records in the internal-link table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]
        # work out the number of pages
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # page through the records; status flips to 1 after each batch,
        # so the query always reads from offset 0
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # records to insert into the queue table
            insert_queue_record = []
            # records in the inner table whose status needs updating
            update_inner_status_record = []
            for row in res3:
                md5 = row['md5']
                a_url = row['a_url']
                a_md5 = row['a_md5']
                domain = row['domain']
                a_host = row['a_host']
                a_title = row['a_title']
                # build the params payload; a fresh dict per row
                param_dict = {
                    'md5': md5,
                    'a_md5': a_md5,
                    'domain': domain,
                    'a_host': a_host,
                    'a_title': a_title,
                }
                param_json = json.dumps(param_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_record.append((2, a_url, param_json))
                update_inner_status_record.append((a_md5, md5))
            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql, update_inner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (inner_count, run_time))
    except Exception:
        traceback.print_exc()
        db_util.rollback()
def put_queue_inner():
    # count the matching records in the queue
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # insert a queue record
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # update web_seed_internally status
    update_sql = '''
        update web_seed_internally set status=1 where md5=%s and a_md5=%s;
    '''
    try:
        # the staged data lives in redis_tmp
        redis_d = RedisUtill()
        db_util = DBUtil(_ZZ_DB)
        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        key_list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()

        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records,not insert!" % total_num1)
            return None

        start_time = time.time()
        logger.info("acquiring lock...")
        if is_get_lock:
            logger.info("lock acquired")

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # new cursor
                next_num = rs[0]
                for i in rs[1]:
                    if 'a_url' in i:
                        key_list.append(i)
                # recursion terminates when the cursor wraps to 0
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)

            # insert into the queue table in batches
            redis_result = []
            up_inner = []
            delete_list = []
            for k in key_list:
                if 'a_url' in k:
                    # derive the sibling keys that share this record's md5
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')
                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))
                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # queue the keys for deletion
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1
                    # flush to the queue table every 5 records
                    if len(redis_result) == 5:
                        db_util.executemany(insert_queue_sql, redis_result)
                        db_util.executemany(update_sql, up_inner)
                        redis_result = []
                        up_inner = []
            # flush the final batch of fewer than 5 records
            db_util.executemany(insert_queue_sql, redis_result)
            db_util.executemany(update_sql, up_inner)
            # remove the imported keys from redis_tmp
            redis_d.delete_batch(delete_list)
            redis_d.release('seed_lock')
            logger.info("lock released")
        else:
            logger.info('another worker holds the lock and the wait timed out, exiting')
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))
    except Exception:
        db_util.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc()
def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and bulk-import the rows into hainiu_queue.

    :param page_show_num: how many rows to fetch per query
    '''
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the matching records in the seed table
    select_seed_count_sql = """
        select count(*) from hainiu_web_seed where status=0;
    """
    # page through the seed table
    select_seed_limit_sql = """
        select url, md5, domain, host, category from hainiu_web_seed
        where status=0 limit %s,%s;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 1) count the unprocessed records in hainiu_queue
        sql_params = [1]
        # res1 is a tuple
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, skipping import!" % queue_count)
            return None

        start_time = time.time()
        # 2) count the matching records in the seed table
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]
        # work out the number of pages
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1
        # page through the seed table
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # res3 is a tuple of dicts, one per row
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # rows to insert into the queue table
            insert_queue_values = []
            for row in res3:
                act = row['url']
                md5 = row['md5']
                domain = row['domain']
                host = row['host']
                category = row['category']
                # build the params payload; a fresh dict per row
                params_dict = {
                    'md5': md5,
                    'domain': domain,
                    'host': host,
                    'category': category,
                }
                params_json = json.dumps(params_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_values.append((1, act, params_json))
            # bulk-insert this page of rows into the queue table
            db_util.executemany(insert_queue_sql, insert_queue_values)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (seed_count, run_time))
    except Exception, e:
        logger.exception(e)
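# The two-branch page_num computation above is ceiling division; this Python 2
# one-liner is equivalent, and the call below uses an illustrative page size.
seed_count, page_show_num = 23, 10
page_num = (seed_count + page_show_num - 1) / page_show_num   # -> 3 pages
put_seed_to_queue(10)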
#-*- encoding: utf-8 -*-
'''
log_demo.py
Created on 21-1-30 11:23 AM
Copyright (c) 21-1-30, Hainiu College. All rights reserved.
@author: Pan Niu
'''
from commons.util.log_util import LogUtil

logger1 = LogUtil().get_logger("log_name", "log_file")
logger2 = LogUtil().get_logger("log_name", "log_file")
# both names point at the same object in memory
print logger1 is logger2

logger1.info("testing the info level")
logger1.error("testing the error level")
try:
    1 / 0
except Exception, e:
    logger1.exception(e)
def xpath_config_file():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule
        where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str

        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()
        while is_finish:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)
                # top two xpaths by score, in descending order
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # initialize so the runner-up check below never hits an
                    # unbound name when only one xpath exists
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                # all recorded fxpaths for the host
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                        host_set.add(host)
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload to the HDFS directory for xpath config files
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def redis2Hdfs():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule
        where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(_ZZ_DB)
        start = 0
        is_finish = True
        host_set = set()
        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str
        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        while is_finish:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # new cursor
            start = rs[0]
            if start == 0:
                is_finish = False
            for i in rs[1]:
                host = i.split(":")[1]
                total_key = i
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)
                # top two xpaths by score, in descending order
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # initialize so the runner-up check below never hits an
                    # unbound name when only one xpath exists
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        # score of the runner-up xpath
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                # all members of the fxpath set
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                        host_set.add(host)
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload to the HDFS directory for xpath config files
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
def __init__(self, limit, fail_times):
    # same producer initializer as above, minus the docstring; note it does
    # not call super(self.__class__, self).__init__() like its siblings do
    self.limit = limit
    self.fail_times = fail_times
    # queue_name is expected to be a module-level name in the original source
    self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)