def __init__(self, config_file='config/cmb_gsxt.conf', province=None):
    """Set up a crawl consumer bound to a single province.

    Args:
        config_file: Path of the configuration file passed to
            ``load_config``. Must be a non-empty value.
        province: Key of the province/site to crawl. Must be a non-empty
            value.

    Raises:
        StandardError: If ``province`` or ``config_file`` is ``None`` or
            the empty string.
    """
    self.worker_list = {}
    self.config_list = {}
    self.thread_num = 8
    self.province = province
    self.pool = None
    self.beanstalk_consumer_conf = beanstalk_consumer_conf
    self.crawl_flag = 'crawl_online'
    self.source_table = 'online_all_list'
    self.tube = ''

    # Refuse to start when no target site was given.
    if province is None or province == '':
        raise StandardError('province error...')

    # Refuse to start when no configuration file was given. The message
    # names the offending argument (it used to say 'province error...').
    if config_file is None or config_file == '':
        raise StandardError('config_file error...')

    # Load the configuration.
    self.load_config(config_file)

    # Logger.
    self.log = global_log

    # Open the beanstalk connection; this runs after load_config so the
    # current value of self.beanstalk_consumer_conf is used.
    self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                 self.beanstalk_consumer_conf['port'])

    # MongoDB handle.
    self.source_db = source_db

    # Build the workers.
    self.init_worker(self.config_list)
def __init__(self, server_conf=None, log=None, is_open=True):
    """Create the sender thread and, when enabled, its beanstalk client.

    Every instance attribute is assigned before the ``is_open`` check so
    that a disabled instance is still fully formed: previously the early
    return left ``queue``, ``beanstalk`` and ``output_tube`` undefined and
    any later access to them raised ``AttributeError``.

    Args:
        server_conf: Mapping with ``'host'``, ``'port'`` and ``'tube'``
            entries. Required when ``is_open`` is true.
        log: Logger object stored on ``self.log``.
        is_open: Whether the message queue should be used at all.

    Raises:
        StandardError: If ``is_open`` is true and ``server_conf`` is None.
    """
    threading.Thread.__init__(self)
    self.daemon = True
    self.log = log

    # Whether the message queue is enabled.
    self.is_open = is_open

    # Whether the connection to the message queue is currently healthy.
    self.is_connect = True

    # Whether sending should be paused, and for how long.
    self.is_pause = False
    self.pause_time = self.PAUSE_TIME_LV1

    # Hand-over queue between producers and this thread.
    self.queue = Queue()

    # Message queue configuration and client; filled in below when enabled.
    self.server_conf = server_conf
    self.beanstalk = None
    self.output_tube = None

    if not self.is_open:
        return

    if server_conf is None:
        raise StandardError('没有消息队列配置信息...')

    self.beanstalk = PyBeanstalk(self.server_conf['host'], self.server_conf['port'])
    self.output_tube = self.server_conf['tube']
def parse_task():
    """Put one parse job per entry of ``data_list`` onto the parse tube.

    Each job is a JSON object with the keys ``company`` and ``province``;
    the province is always ``'hunan'``.
    """
    host = beanstalk_parse_conf['host']
    port = beanstalk_parse_conf['port']
    client = PyBeanstalk(host, port)
    tube_name = beanstalk_parse_conf['tube']

    for name in data_list:
        payload = json.dumps({'company': name, 'province': 'hunan'})
        client.put(tube_name, payload)
def main():
    """Print one hard-coded company name and enqueue it as a JSON job."""
    host = beanstalk_consumer_conf['host']
    port = beanstalk_consumer_conf['port']
    client = PyBeanstalk(host, port)
    tube_name = beanstalk_consumer_conf['tube']

    data_str = '湖南汉璟真空玻璃科技有限公司'
    payload = dict(company=data_str, province='hunan')

    # Single-argument parenthesised form prints the same text whether
    # ``print`` is a statement or a function.
    print(data_str)
    client.put(tube_name, json.dumps(payload))
def main():
    """Enqueue one JSON job for every entry of ``company_info``.

    The company name is printed before its job is put on the tube; the
    province is read from the ``'province'`` entry of the mapped value.
    """
    host = beanstalk_consumer_conf['host']
    port = beanstalk_consumer_conf['port']
    client = PyBeanstalk(host, port)
    tube_name = beanstalk_consumer_conf['tube']

    for company, info in company_info.iteritems():
        payload = dict(company=company, province=info['province'])
        # Single-argument parenthesised form prints the same text whether
        # ``print`` is a statement or a function.
        print(company)
        client.put(tube_name, json.dumps(payload))
def crawl_task():
    """Put one crawl job per entry of ``data_list`` onto the scheduler tube.

    The beanstalk endpoint and tube name are fixed inside this function.
    Each job is a JSON object with the keys ``company_name`` and
    ``province``; the province is always ``'hunan'``.
    """
    beanstalk_crawl_conf = dict(
        host='cs0.sz-internal.haizhi.com',
        port=11400,
        tube='gs_hunan_scheduler',
    )
    client = PyBeanstalk(beanstalk_crawl_conf['host'], beanstalk_crawl_conf['port'])
    tube_name = beanstalk_crawl_conf['tube']

    for name in data_list:
        message = json.dumps({'company_name': name, 'province': 'hunan'})
        client.put(tube_name, message)
def main():
    """Print and enqueue a JSON job for each of five hard-coded companies.

    Every job carries the keys ``company_name`` and ``province``; the
    province is always ``'guizhou'``.
    """
    host = beanstalk_consumer_conf['host']
    port = beanstalk_consumer_conf['port']
    client = PyBeanstalk(host, port)
    tube_name = beanstalk_consumer_conf['tube']

    company_list = [
        '贵州大龙帝国网吧',
        '罗甸县网络帝国网咖',
        '玉屏国网线下百货店',
        '帝国网络会所',
        '玉屏县帝国网吧',
    ]

    for name in company_list:
        data_str = json.dumps({'company_name': name, 'province': 'guizhou'})
        # Single-argument parenthesised form prints the same text whether
        # ``print`` is a statement or a function.
        print(data_str)
        client.put(tube_name, data_str)
def __init__(self, config_file='config/online_gsxt_parse.conf'):
    """Set up the consumer: configuration, logger, beanstalk and workers.

    Args:
        config_file: Path of the configuration file passed to
            ``load_config``. Must be a non-empty value.

    Raises:
        StandardError: If ``config_file`` is ``None`` or the empty string.
    """
    self.worker_list = {}
    self.config_list = {}
    self.pool = None
    self.beanstalk_consumer_conf = beanstalk_consumer_conf

    # Refuse to start when no configuration file was given. The message
    # names the offending argument (it used to say 'province error...').
    if config_file is None or config_file == '':
        raise StandardError('config_file error...')

    # Load the configuration.
    self.load_config(config_file)

    # Logger.
    self.log = global_log

    # Open the beanstalk connection and remember the tube to consume.
    self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                 self.beanstalk_consumer_conf['port'])
    self.tube = self.beanstalk_consumer_conf['tube']

    # Build the workers.
    self.init_worker(self.config_list)
class StartTaskCrawler(object):
    """Beanstalk consumer that dispatches company queries to province workers.

    One worker is created per configuration section; each consumed job
    names a province and a company, and the job is handed to the worker
    registered under that province.
    """

    def __init__(self, config_file='config/online_gsxt_parse.conf'):
        """Set up configuration, logger, beanstalk connection and workers.

        Args:
            config_file: Path of the configuration file passed to
                ``load_config``. Must be a non-empty value.

        Raises:
            StandardError: If ``config_file`` is ``None`` or the empty
                string.
        """
        self.worker_list = {}
        self.config_list = {}
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf

        # Refuse to start when no configuration file was given. The message
        # names the offending argument (it used to say 'province error...').
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        # Load the configuration.
        self.load_config(config_file)

        # Logger.
        self.log = global_log

        # Open the beanstalk connection and remember the tube to consume.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                     self.beanstalk_consumer_conf['port'])
        self.tube = self.beanstalk_consumer_conf['tube']

        # Build the workers.
        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create one worker per entry of ``config_list``.

        Args:
            config_list: Mapping of key to configuration value; each pair
                is passed to ``create_crawl_object(value, key)`` and the
                result is stored in ``self.worker_list`` under the key.
        """
        self.log.info('初始化worker')
        for key, value in config_list.iteritems():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))
        self.log.info('初始化全部worker完成...')

    def load_config(self, config_file):
        """Read every section of ``config_file`` into ``self.config_list``."""
        conf_parse = ConfigParser(config_file)
        self.config_list = conf_parse.get_all_session()

    def task_run(self):
        """Consume jobs until the module-level ``is_running`` flag is false.

        Each job is deleted from the tube as soon as its body has been
        read, then validated; invalid jobs are logged and skipped. Valid
        jobs are passed to ``query_online_task`` of the matching worker.
        """
        self.log.info('服务已开启, 等待消费数据')

        count = 0
        while True:
            if not is_running:
                break

            job = self.beanstalk.reserve(self.tube, 3)
            if job is None:
                continue

            count += 1
            body = job.body
            job.delete()

            self.log.info('当前消费数据索引: {count}'.format(count=count))
            json_data = util.json_loads(body)
            if json_data is None:
                self.log.error('数据不是json格式: data = {data}'.format(data=body))
                continue

            # Valid JSON that is not an object (a list, a number, ...) has
            # no .get(); reject it here instead of letting the resulting
            # AttributeError stop the whole consumer.
            if not isinstance(json_data, dict):
                self.log.error('数据格式错误: data = {data}'.format(data=json_data))
                continue

            company = json_data.get('company', None)
            province = json_data.get('province', None)
            if company is None or province is None:
                self.log.error('数据格式错误: data = {data}'.format(data=json_data))
                continue

            if company == '':
                self.log.error('company = 空字符串')
                continue

            if province not in self.worker_list:
                self.log.error('不支持当前省份: province = {province}'.format(province=province))
                continue

            self.log.info('当前消费数据为: company = {company}'.format(company=company))
            self.worker_list[province].query_online_task(company)

        self.log.info('收到退出信号, 安全退出...')

    def start_worker(self):
        """Run ``task_run`` once, logging any exception and the elapsed time."""
        start_time = time.time()
        try:
            self.task_run()
        except Exception as e:
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)

        end_time = time.time()
        self.log.info('起始时间: {st}'.format(st=start_time))
        self.log.info('结束时间: {et}'.format(et=end_time))
        self.log.info('消耗时间: {t}s'.format(t=end_time - start_time))
'host': '172.16.215.16', 'port': 40042, 'db': 'app_data', 'username': '******', 'password': '******', } source_db = MongDb(db_conf['host'], db_conf['port'], db_conf['db'], db_conf['username'], db_conf['password'], log=log) beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400} beanstalk = PyBeanstalk(beanstalk_consumer_conf['host'], beanstalk_consumer_conf['port']) province_zh_to_py = { u'上海': 'shanghai', u'云南': 'yunnan', u'内蒙古': 'neimenggu', u'北京': 'beijing', u'吉林': 'jilin', u'四川': 'sichuan', u'天津': 'tianjin', u'宁夏': 'ningxia', u'安徽': 'anhui', u'山东': 'shandong', u'山西': 'shanxicu', u'广东': 'guangdong', u'广西': 'guangxi',
class MqQueueThread(threading.Thread):
    """Daemon thread that forwards queued messages to a beanstalk tube.

    Producers call ``push_msg``; ``run`` takes messages off the internal
    queue one at a time and keeps retrying each one until it has been put
    on the tube. ``close`` enqueues a stop marker that ends ``run``.
    """

    # Tube sizes at which sending is throttled ...
    PAUSE_COUNT_LV1 = 1000
    PAUSE_COUNT_LV2 = 10000
    PAUSE_COUNT_LV3 = 50000
    PAUSE_COUNT_LV4 = 100000
    PAUSE_COUNT_LV5 = 1000000

    # ... and the matching sleep durations passed to time.sleep().
    PAUSE_TIME_LV1 = 1
    PAUSE_TIME_LV2 = 3
    PAUSE_TIME_LV3 = 10
    PAUSE_TIME_LV4 = 20
    PAUSE_TIME_LV5 = 300

    def __init__(self, server_conf=None, log=None, is_open=True):
        """Create the sender thread and, when enabled, its beanstalk client.

        Every instance attribute is assigned before the ``is_open`` check
        so that a disabled instance is still fully formed: previously the
        early return left ``queue`` undefined and ``close()`` raised
        ``AttributeError`` on a disabled instance.

        Args:
            server_conf: Mapping with ``'host'``, ``'port'`` and ``'tube'``
                entries. Required when ``is_open`` is true.
            log: Logger object stored on ``self.log``.
            is_open: Whether the message queue should be used at all.

        Raises:
            StandardError: If ``is_open`` is true and ``server_conf`` is
                None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.log = log

        # Whether the message queue is enabled.
        self.is_open = is_open

        # Whether the connection to the message queue is currently healthy.
        self.is_connect = True

        # Whether sending should be paused, and for how long.
        self.is_pause = False
        self.pause_time = self.PAUSE_TIME_LV1

        # Hand-over queue between producers and this thread.
        self.queue = Queue()

        # Message queue configuration and client; filled in below when
        # the queue is enabled.
        self.server_conf = server_conf
        self.beanstalk = None
        self.output_tube = None

        if not self.is_open:
            return

        if server_conf is None:
            raise StandardError('没有消息队列配置信息...')

        self.beanstalk = PyBeanstalk(self.server_conf['host'], self.server_conf['port'])
        self.output_tube = self.server_conf['tube']

    def __del__(self):
        self.log.info('消息队列线程退出...')

    def is_need_pause(self):
        """Sleep for a duration chosen from the current size of the tube.

        Below ``PAUSE_COUNT_LV1`` nothing is slept and ``is_pause`` is
        cleared. Otherwise ``is_pause`` is set and the thread sleeps for
        the ``PAUSE_TIME_*`` value of the highest threshold reached. If the
        tube size cannot be read it is treated as ``PAUSE_COUNT_LV1``.
        """
        try:
            count = self.beanstalk.get_tube_count(self.output_tube)
        except Exception as e:
            self.log.error('获取当前队列数目失败..开启消息队列休眠...')
            self.log.exception(e)
            count = self.PAUSE_COUNT_LV1

        if count < self.PAUSE_COUNT_LV1:
            self.is_pause = False
            self.pause_time = self.PAUSE_TIME_LV1
            return

        self.is_pause = True
        if count >= self.PAUSE_COUNT_LV5:
            self.pause_time = self.PAUSE_TIME_LV5
        elif count >= self.PAUSE_COUNT_LV4:
            self.pause_time = self.PAUSE_TIME_LV4
        elif count >= self.PAUSE_COUNT_LV3:
            self.pause_time = self.PAUSE_TIME_LV3
        elif count >= self.PAUSE_COUNT_LV2:
            self.pause_time = self.PAUSE_TIME_LV2
        else:
            self.pause_time = self.PAUSE_TIME_LV1

        # Start sleeping.
        time.sleep(self.pause_time)

    def close(self):
        """Enqueue the stop marker that makes ``run`` leave its loop."""
        self.queue.put_nowait('@@##$$')
        self.log.info('发送线程退出指令...')

    def push_msg(self, msg):
        """Queue ``str(msg)`` for sending; a no-op when the queue is disabled."""
        if self.is_open:
            self.queue.put_nowait(str(msg))

    def run(self):
        """Forward queued messages to the tube until the stop marker arrives."""
        self.log.info('开始运行消息队列...')
        while True:
            # Nothing to do when the message queue is disabled.
            if not self.is_open:
                self.log.info('没有打开消息队列, 退出!')
                break

            try:
                msg = self.queue.get()
                if msg == '@@##$$':
                    break

                # Keep retrying the same message until it has been sent.
                while True:
                    try:
                        self.beanstalk.put(self.output_tube, msg)

                        # Throttling before sending is currently disabled:
                        # self.is_need_pause()

                        # Mark the connection as healthy again.
                        self.is_connect = True
                        break
                    except SocketError as e:
                        # Mark the connection as broken so producers can
                        # slow down.
                        self.is_connect = False
                        time.sleep(10)
                        try:
                            self.beanstalk.reconnect()
                            self.log.warn("reconnect beanstalk...")
                        except Exception as reconnect_error:
                            # A failed reconnect must not leave the retry
                            # loop, otherwise the message in hand would be
                            # dropped; log it and try again.
                            self.log.exception(reconnect_error)
                        self.log.exception(e)
                    except Exception as e:
                        self.is_connect = False
                        self.log.error('捕获异常休眠...')
                        self.log.exception(e)
                        time.sleep(10)
            except Exception as e:
                self.log.info('当前队列大小: size = {size}'.format(size=self.queue.qsize()))
                self.log.exception(e)
                time.sleep(5)

        self.log.info('消息队列线程正常退出.')
class StartTaskCrawler(object):
    """Beanstalk consumer that crawls companies of a single province.

    Jobs are read from the configured tube, validated, turned into task
    dicts and handed to the province worker through a pool. ``task_run``
    returns once ``MAX_RUN_TIME`` seconds have passed or the module-level
    ``is_running`` flag becomes false.
    """

    # Maximum run time of task_run, in seconds (12 hours).
    MAX_RUN_TIME = 12 * 60 * 60

    def __init__(self, config_file='config/cmb_gsxt.conf', province=None):
        """Set up configuration, logger, connections and the worker.

        Args:
            config_file: Path of the configuration file passed to
                ``load_config``. Must be a non-empty value.
            province: Key of the province/site to crawl. Must be a
                non-empty value.

        Raises:
            StandardError: If ``province`` or ``config_file`` is ``None``
                or the empty string, or if ``load_config`` rejects the
                configuration.
        """
        self.worker_list = {}
        self.config_list = {}
        self.thread_num = 8
        self.province = province
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf
        self.crawl_flag = 'crawl_online'
        self.source_table = 'online_all_list'
        self.tube = ''

        # Refuse to start when no target site was given.
        if province is None or province == '':
            raise StandardError('province error...')

        # Refuse to start when no configuration file was given. The message
        # names the offending argument (it used to say 'province error...').
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        # Load the configuration; this may replace the defaults set above.
        self.load_config(config_file)

        # Logger.
        self.log = global_log

        # Open the beanstalk connection with the (possibly overridden)
        # consumer configuration.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                     self.beanstalk_consumer_conf['port'])

        # MongoDB handle.
        self.source_db = source_db

        # Build the workers.
        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create one worker per entry of ``config_list``.

        Args:
            config_list: Mapping of key to configuration value; each pair
                is passed to ``create_crawl_object(value, key)`` and the
                result is stored in ``self.worker_list`` under the key.
        """
        for key, value in config_list.iteritems():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))

    def load_config(self, config_file):
        """Load the section of ``self.province`` and apply its settings.

        Args:
            config_file: Path of the configuration file.

        Raises:
            StandardError: If the province has no section, or the section
                lacks ``source_table`` or a non-empty ``consumer_tube``.
        """
        conf_parse = ConfigParser(config_file)

        # Only the section of this province is read.
        config_dict = conf_parse.get_session(self.province)
        if config_dict is None:
            raise StandardError('站点信息错误...{province}'.format(province=self.province))

        # Optional override of the pool size.
        if config_dict.get('thread_num', None) is not None:
            self.thread_num = int(config_dict['thread_num'])

        # The seed table is mandatory.
        if config_dict.get('source_table', None) is not None:
            self.source_table = config_dict['source_table']
        else:
            raise StandardError('没有指定原始种子表: source_table')

        # Optional override of the beanstalk consumer configuration.
        # SECURITY NOTE(review): eval() executes whatever expression the
        # configuration file contains. This is only acceptable while the
        # file is fully trusted; ast.literal_eval would be the safe choice
        # if the value is always a plain literal -- confirm before changing.
        config = eval(config_dict.get('beanstalk_consumer_conf', 'None'))
        if config is not None:
            self.beanstalk_consumer_conf = config

        # Crawl flag.
        crawl_flag = config_dict.get('crawl_flag', 'crawl_online')
        if crawl_flag is not None:
            self.crawl_flag = crawl_flag

        # The tube to consume from is mandatory.
        consumer_tube = config_dict.get('consumer_tube', '')
        if consumer_tube is not None and consumer_tube != '':
            self.tube = consumer_tube
        else:
            raise StandardError('没有tube!!!')

        # Register the section under the province key.
        self.config_list[self.province] = config_dict

    def _build_task_data(self, body):
        """Validate one job body and turn it into a task dict.

        Args:
            body: Raw job body as read from beanstalk.

        Returns:
            The task dict for ``query_online_task``, or ``None`` when the
            job is invalid (the reason is logged).
        """
        json_data = util.json_loads(body)
        # Valid JSON that is not an object (a list, a number, ...) has no
        # .get(); treat it like unparsable input instead of crashing.
        if json_data is None or not isinstance(json_data, dict):
            self.log.error('数据格式错误: msg = {msg}'.format(msg=body))
            time.sleep(5)
            return None

        province = json_data.get('province')
        if province is None or province == '':
            self.log.error('没有province: {msg}'.format(msg=body))
            return None

        company_name = json_data.get('company_name')
        unified_social_credit_code = json_data.get('unified_social_credit_code')
        start_schedule_time = json_data.get('start_schedule_time', '')

        if company_name is None and unified_social_credit_code is None:
            self.log.error('没有company 与 unified_social_credit_code: {msg}'.format(msg=body))
            return None

        if company_name is not None and company_name == '':
            self.log.error('company = 空字符串, data = {data}'.format(
                data=body))
            return None

        if unified_social_credit_code is not None and unified_social_credit_code == '':
            self.log.error('unified_social_credit_code = 空字符串, data = {data}'.format(
                data=body))
            return None

        if province != self.province:
            self.log.warn('province 不正确: province = {province} data = {body}'.format(
                province=self.province, body=body))
            return None

        # The company name takes precedence over the credit code.
        if company_name is not None:
            self.log.info('当前消费数据为: province = {province} company = {company}'.format(
                province=province, company=company_name))
            return {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'start_schedule_time': start_schedule_time,
            }

        self.log.info('当前消费数据为: province = {province} unified_social_credit_code = {code}'.format(
            province=province, code=unified_social_credit_code))
        # NOTE(review): the _id is generated from the raw code while the
        # stored field is stripped and upper-cased -- confirm this is
        # intended.
        return {
            '_id': util.generator_id({}, unified_social_credit_code, province),
            'unified_social_credit_code': unified_social_credit_code.strip().upper(),
            'province': province,
            'in_time': util.get_now_time(),
            'start_schedule_time': start_schedule_time,
        }

    def task_run(self):
        """Consume jobs and dispatch them to the province worker.

        A gevent pool is used unless the module-level ``is_debug`` flag is
        set, in which case a ``ThreadPool`` is used. Pending results are
        drained every 1000 submissions and once more before returning.
        """
        result_list = []

        # Create the pool.
        if not is_debug:
            self.pool = gevent.pool.Pool(self.thread_num)
        else:
            self.pool = ThreadPool(processes=self.thread_num)

        self.log.info('当前开启协程数目: thread_num = {num}'.format(num=self.thread_num))
        self.log.info('province: {province}服务已开启, 等待消费数据'.format(province=self.province))

        count = 0
        start_run_time = time.time()
        while True:
            if not is_running:
                break

            job = self.beanstalk.reserve(self.tube, 3)
            if job is not None:
                count += 1
                body = job.body
                job.delete()

                self.log.info('当前消费数据索引: {count}'.format(count=count))
                data = self._build_task_data(body)
                if data is not None:
                    pool_result = self.pool.apply_async(
                        self.worker_list[self.province].query_online_task,
                        args=(data,))
                    result_list.append(pool_result)

                    # Drain in batches so the list of pending results
                    # stays bounded.
                    if len(result_list) >= 1000:
                        for result in result_list:
                            result.get()
                        del result_list[:]

            # Stop once the maximum run time is reached so the service can
            # be restarted. The check runs on every iteration, including
            # those that rejected a job.
            run_time = time.time()
            if int(run_time) - int(start_run_time) >= self.MAX_RUN_TIME:
                break

        if is_debug:
            self.pool.close()
            self.pool.join()

        for result in result_list:
            result.get()
        del result_list[:]
        del result_list

        self.log.info('收到退出信号, 安全退出...')

    def start_worker(self):
        """Run ``task_run`` once and log the elapsed time.

        Raises:
            SystemExit: With status 1 when ``task_run`` raises.
        """
        # Function-scope import: sys.exit is always available, whereas the
        # bare exit() helper is only injected by the site module.
        import sys

        start_time = time.time()
        try:
            self.task_run()
        except Exception as e:
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)
            sys.exit(1)

        end_time = time.time()
        self.log.info('扫描起始时间: {st}'.format(st=start_time))
        self.log.info('扫描结束时间: {et}'.format(et=end_time))
        self.log.info('扫描消耗时间: {t}s'.format(t=end_time - start_time))