class Consumer(threading.Thread):
    '''
    Consumer thread: takes ConsumerAction objects from the shared queue,
    runs their action() method, and re-queues failed actions until the
    retry budget is exhausted.
    '''

    def __init__(self, queue, thread_name, max_sleep_time, max_retry_num):
        '''
        :param queue: queue object shared with the producer
        :param thread_name: name of this consumer thread
        :param max_sleep_time: upper bound of the random sleep after each action
        :param max_retry_num: maximum retry count for a failed action
        :return:
        '''
        # Use the explicit class name in super(): super(self.__class__, ...)
        # recurses infinitely as soon as this class is subclassed.
        super(Consumer, self).__init__()
        self.queue = queue
        self.thread_name = thread_name
        self.max_sleep_time = max_sleep_time
        self.max_retry_num = max_retry_num
        # one dedicated logger per consumer thread
        self.logger = LogUtil().get_logger(self.thread_name, self.thread_name)

    def run(self):
        '''
        Thread body: loop forever, consuming one action per iteration.
        :return:
        '''
        while True:
            try:
                # random sleep applied after each consumed action
                random_sleep_time = round(
                    random.uniform(0.2, self.max_sleep_time))
                start_time = time.time()
                # blocking take of the next consumer action
                c_action = self.queue.get()
                try:
                    if not isinstance(c_action, ConsumerAction):
                        raise Exception(
                            "%s is not ConsumerAction instance" % c_action)
                    # run the action; result[0] is the success flag, the
                    # remaining elements are detail payload for logging
                    result = c_action.action(self.thread_name)
                    run_time = time.time() - start_time
                    success_flag = result[0]
                    success_str = "SUCCESS" if result[0] else "FAIL"
                    self.logger.info(
                        "thread.name=[%s], run_time=%.2f s, sleep_time=%.2f s, retry_times=%d, "
                        "result=%s, detail=%s" %
                        (self.thread_name, run_time, random_sleep_time,
                         c_action.current_retry_num + 1, success_str,
                         result[1:]))
                    # on failure, put the action back for another attempt
                    # while retries remain
                    if not success_flag and \
                            c_action.current_retry_num < self.max_retry_num - 1:
                        c_action.current_retry_num += 1
                        self.queue.put(c_action)
                finally:
                    # Always acknowledge the item we took, even when the
                    # action raised; the original skipped task_done() on
                    # error, so unfinished_tasks grew forever and the
                    # producer stopped feeding the queue.
                    self.queue.task_done()
                # random nap before the next item
                time.sleep(random_sleep_time)
            except Exception as err:
                # print_exc() takes a traceback limit, not an exception --
                # passing err was a misuse of the API
                traceback.print_exc()
                self.logger.exception(err)
class Producer(threading.Thread):
    '''
    Producer thread: generates ConsumerAction objects via p_action and
    drip-feeds them into the queue, throttled by the consumers' backlog.
    '''

    def __init__(self, queue, q_name, p_action, p_sleep_time, c_max_num,
                 c_max_sleep_time, c_max_retry_num):
        '''
        :param queue: queue object shared with the consumers
        :param q_name: queue name (used in logger names and log messages)
        :param p_action: producer action object (ProducerAction instance)
        :param p_sleep_time: sleep time after each production round
        :param c_max_num: maximum number of consumer threads
        :param c_max_sleep_time: maximum sleep time per consumer run
        :param c_max_retry_num: maximum retries after a consumer failure
        :return:
        '''
        # Explicit class name in super(): super(self.__class__, ...)
        # recurses infinitely when this class is subclassed.
        super(Producer, self).__init__()
        self.queue = queue
        self.q_name = q_name
        self.p_action = p_action
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_max_retry_num = c_max_retry_num
        # fail fast if p_action is not a ProducerAction instance
        if not isinstance(self.p_action, ProducerAction):
            raise Exception(
                "%s is not ProducerAction instance" % self.p_action)
        # dedicated logger for this producer
        self.logger = LogUtil().get_logger('producer_%s' % self.q_name,
                                           'producer_%s' % self.q_name)

    def run(self):
        '''
        Thread body: produce a batch of actions, hand them to the queue
        while the consumers' in-flight count stays below c_max_num, then
        sleep and repeat.
        :return:
        '''
        actions = []
        while True:
            try:
                start_time = time.time()
                # produce a fresh batch only once the previous one drained
                if len(actions) == 0:
                    actions = self.p_action.queue_items()
                # size of the current batch
                total_num = len(actions)
                self.logger.info(
                    'queue.name = [producer_%s], current time produce %d actions'
                    % (self.q_name, total_num))
                # hand the actions to the queue one by one
                while True:
                    if len(actions) == 0:
                        break
                    # enqueue only while the in-flight task count is below
                    # the number of consumer threads
                    if self.queue.unfinished_tasks < self.c_max_num:
                        c_action = actions.pop()
                        self.queue.put(c_action)
                    else:
                        # queue saturated: yield the CPU instead of
                        # busy-spinning (the original looped at 100% CPU)
                        time.sleep(0.1)
                end_time = time.time()
                # seconds from production to fully enqueued
                run_time = end_time - start_time
                # guard against a zero-length round (ZeroDivisionError in
                # the original when run_time was 0)
                rate = int(float(total_num) * 60 / run_time) \
                    if run_time > 0 else total_num
                self.logger.info(
                    "queue.name=[producer_%s], total_num=%d, "
                    "producer %d actions/min, sleep_time=%d" %
                    (self.q_name, total_num, rate, self.p_sleep_time))
                # rest before the next production round
                time.sleep(self.p_sleep_time)
            except Exception as err:
                # print_exc() takes a traceback limit, not an exception
                traceback.print_exc()
                self.logger.exception(err)
class Producer(threading.Thread):
    '''
    Producer thread: generates consumer actions and dispatches them to a
    pool of consumer threads through a shared queue.
    '''

    def __init__(self, queue, action, name, max_num, sleep_time,
                 work_sleep_time, work_try_num):
        '''
        Initialize the producer thread.
        :param queue: the queue shared with the consumers
        :param action: the producer action object
        :param name: producer name
        :param max_num: number of consumer threads to start
        :param sleep_time: sleep time between two production rounds
        :param work_sleep_time: sleep time of each consumer
        :param work_try_num: allowed failure count per consumer action
        '''
        # Explicit class name in super(): super(self.__class__, ...)
        # recurses infinitely when this class is subclassed.
        super(Producer, self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger('producer', 'producer' + self.name)
        # fail fast on a wrong action type
        if not isinstance(self.action, base_producer_action.ProducerAction):
            raise Exception('Action not Producer base')

    def run(self):
        # cache of produced consumer actions, used to refill idle consumers
        action_list = []
        while True:
            try:
                # time.time() gives wall-clock time; time.clock() measured
                # CPU time on Unix and was removed in Python 3.8
                start_time = time.time()
                # when the cache is empty, ask the producer action for a
                # new batch of consumer actions
                if len(action_list) == 0:
                    action_list = self.action.queue_items()
                # log how many consumer actions this round produced
                total_times = len(action_list)
                self.rl.info('get queue %s total items is %s'
                             % (self.name, total_times))
                while True:
                    # every action handed over to the consumers: stop
                    if len(action_list) == 0:
                        break
                    # number of actions currently in "work" state
                    unfinished_tasks = self.queue.unfinished_tasks
                    # dispatch one action whenever the in-flight count is
                    # not above the consumer thread count
                    if unfinished_tasks <= self.max_num:
                        action = action_list.pop()
                        self.queue.put(action)
                    else:
                        # queue saturated: yield the CPU instead of
                        # busy-spinning
                        time.sleep(0.1)
                end_time = time.time()
                # elapsed seconds and per-minute throughput of this round
                sec = int(round(end_time - start_time))
                minutes = int(round(sec / float(60)))  # renamed: 'min' shadowed the builtin
                self.rl.info("put queue %s total items is %s,total time is %s\'s,(at %s items/min)" % \
                             (self.name, total_times, sec,
                              int(total_times) if minutes == 0
                              else round(float(total_times / float(minutes)), 2)))
                time.sleep(self.sleep_time)
            except Exception as err:
                # Logger.exception() requires a message argument; calling
                # it bare raised TypeError and killed the thread. Also
                # narrowed the bare except so KeyboardInterrupt escapes.
                self.rl.exception(err)

    def start_work(self):
        '''
        Start max_num consumer threads, then (after a grace period) start
        this producer thread itself.
        '''
        for i in range(0, self.max_num):
            qc = queue_consumer.Consumer(self.queue,
                                         self.name + '_' + str(i),
                                         self.work_sleep_time,
                                         self.work_try_num)
            qc.start()
        # give the consumers time to come up before producing
        time.sleep(5)
        self.start()
def xpath_config_file():
    '''
    Generate the xpath configuration file: merge per-host xpath statistics
    kept in redis (total_z:*, txpath_z:*, fxpath_z:* keys) with manual
    rules from the MySQL table stream_extract_xpath_rule, and append the
    merged rows to a local file (optionally uploaded to HDFS afterwards).
    '''
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    # bind before the try block so except/finally can reference d safely
    # even when DBUtil() itself raises (the original hit NameError there)
    d = None
    try:
        d = DBUtil(config._ZZ_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        local_xpath_file_path = 'E:/python_workspaces/data/xpath_file' + time_str
        start_cursor = 0
        is_finish = True
        # wall-clock timing; time.clock() measured CPU time on Unix and
        # was removed in Python 3.8
        starttime = time.time()
        host_set = set()
        while is_finish:
            values = set()
            # walk the keyspace in batches of ~10 entries with SCAN
            limit = r.scan(start_cursor, 'total_z:*', 10)
            if limit[0] == 0:
                # cursor 0 means this is the final batch
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)
                # top two xpaths (by score) seen for this host
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    # Initialize so the reference below is always bound;
                    # the original raised NameError when only one xpath
                    # existed for the host.
                    txpath_num_1 = None
                    if txpath.__len__() == 2:
                        # fetch the score first: int(None) would raise, so
                        # the original's None-check after int() was dead
                        score_1 = r.zscore(txpath_key, txpath[1])
                        txpath_num_1 = int(score_1) if score_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        # dominant xpath (>= 80% of the host's pages)
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        # no dominant xpath: accept any with >= 100 hits
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                # xpaths explicitly marked as failing for this host
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                        host_set.add(host)
                # merge the manual rules stored in MySQL
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    rule_type = rule[2]  # renamed: 'type' shadowed the builtin
                    if rule_type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif rule_type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            # append this batch of rows to the local config file
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload to the HDFS xpath config file directory
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.time()
        worksec = int(round(endtime - starttime))
        rl.info('total host %s,action time %s\'s' % (host_set.__len__(), worksec))
    except Exception as err:
        # Logger.exception() requires a message argument; the bare call in
        # the original raised TypeError inside the handler
        rl.exception(err)
        if d is not None:
            d.rollback()
    finally:
        if d is not None:
            d.close()
def action(self): logger = LogUtil().get_logger("download_action", "download_action") #1)把队列中的url的HTML内容下载到文件中,每个消费线程每隔5分钟生成一个新的文件。 r = RequestUtil() # hu = HtmlUtil() u = Util() db_util = DBUtil(_HAINIU_DB) time_util = TimeUtil() # 通过phandomjs 请求url,返回网页,包括网页的ajax请求 html = r.http_get_phandomjs(self.act) # 拼接要写入的内容 html = html.replace("\r", "").replace("\n", "\002") str1 = self.act + "\001" + html str2 = u.get_md5(str1) + "\001" + str1 # 成功失败标记 is_success = True # 获取时间 # now_time====>年月日时分秒 now_time = time.strftime("%Y%m%d,%H,%M,%S").split(",") day = now_time[0] hour = now_time[1] minute = int(now_time[2]) for i in range(60, -5, -5): if minute < i: continue minute = i break minute = '0%s' % minute if minute < 10 else minute now_minute = '%s%s%s' % (day, hour, minute) file_names = os.listdir(_LOCAL_DATA_DIR % ('tmp')) logger.info("file_names:%s" % file_names) thread_name = self.consumer_thread_name logger.info("thread_name:%s" % thread_name) last_file_name = '' for file_name in file_names: tmp = file_name.split("#")[0] if tmp == thread_name: last_file_name = file_name break now_file_name = "%s#%s" % (thread_name, now_minute) try: if last_file_name == '' or last_file_name != now_file_name: # 移动老文件 # if last_file_name != '': oldPath = _LOCAL_DATA_DIR % ("tmp/") + last_file_name logger.info("oldPath:%s" % oldPath) # if os.path.exists(oldPath) and os.path.getsize(oldPath) > 0: if last_file_name != '': done_file_name = last_file_name + "#" + str( TimeUtil().get_timestamp()) logger.info("last_file_name:%s" % last_file_name) newPath = _LOCAL_DATA_DIR % ("done/") + done_file_name logger.info("newPath:%s" % newPath) shutil.move(oldPath, newPath) # 写入新文件 now_file_name = _LOCAL_DATA_DIR % ("tmp/") + now_file_name # if not os.path.exists(_LOCAL_DATA_DIR+'tmp2/'): # os.mkdir(_LOCAL_DATA_DIR+'tmp2/') logger.info("now_file_name:%s" % now_file_name) f = open(now_file_name, 'a+') f.write(str2) f.close() else: last_file_name = _LOCAL_DATA_DIR % ("tmp/") + last_file_name 
logger.info("last_file_name:%s" % last_file_name) # 写入老文件时进行换行 insert_str = "\n" + str2 f = open(last_file_name, 'a+') f.write(insert_str) f.close() except Exception, e: is_success = False traceback.print_exc(e)