class StartTaskCrawler(object):
    """Consume crawl requests from a beanstalk tube and route them by province.

    One worker object is built for every entry returned by the config parser
    (through ``create_crawl_object``).  Each message taken from the tube is
    handed to the worker whose key equals the message's ``province`` field.
    """

    def __init__(self, config_file='config/online_gsxt_parse.conf'):
        """Load the configuration, open the beanstalk connection, build workers.

        :param config_file: path of the configuration file to load.
        :raises StandardError: if ``config_file`` is ``None`` or empty.
        """
        self.worker_list = {}
        self.config_list = {}
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf

        # Refuse to start without a configuration file.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        self.load_config(config_file)

        self.log = global_log

        # Queue connection and the tube this consumer reads from.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                     self.beanstalk_consumer_conf['port'])
        self.tube = self.beanstalk_consumer_conf['tube']

        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create one worker per entry of ``config_list``.

        :param config_list: mapping of key -> config; every pair is passed to
            ``create_crawl_object(value, key)`` and the result is stored in
            ``self.worker_list`` under the same key.
        """
        self.log.info('初始化worker')
        for key, value in config_list.items():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))
        self.log.info('初始化全部worker完成...')

    def load_config(self, config_file):
        """Read every section of ``config_file`` into ``self.config_list``."""
        conf_parse = ConfigParser(config_file)
        self.config_list = conf_parse.get_all_session()

    def task_run(self):
        """Consume messages until the module-level ``is_running`` flag is cleared.

        Each job is deleted from the queue as soon as its body has been read,
        i.e. before it is processed, so a message that fails later is not
        redelivered.
        """
        self.log.info('服务已开启, 等待消费数据')

        count = 0
        while is_running:
            # The second argument is presumably a timeout in seconds, so the
            # running flag is re-checked regularly -- TODO confirm.
            job = self.beanstalk.reserve(self.tube, 3)
            if job is None:
                continue

            count += 1
            body = job.body
            job.delete()
            self.log.info('当前消费数据索引: {count}'.format(count=count))
            self._dispatch(body)

        self.log.info('收到退出信号, 安全退出...')

    def _dispatch(self, body):
        """Validate one raw message and hand its company to the province worker."""
        json_data = util.json_loads(body)
        if json_data is None:
            self.log.error('数据不是json格式: data = {data}'.format(data=body))
            return

        # Valid JSON that is not an object (list, string, number) has no
        # ``get``; reject it here instead of crashing the consumer loop.
        if not isinstance(json_data, dict):
            self.log.error('数据格式错误: data = {data}'.format(data=json_data))
            return

        company = json_data.get('company', None)
        province = json_data.get('province', None)
        if company is None or province is None:
            self.log.error('数据格式错误: data = {data}'.format(data=json_data))
            return

        if company == '':
            self.log.error('company = 空字符串')
            return

        if province not in self.worker_list:
            self.log.error('不支持当前省份: province = {province}'.format(province=province))
            return

        self.log.info('当前消费数据为: company = {company}'.format(company=company))
        self.worker_list[province].query_online_task(company)

    def start_worker(self):
        """Run ``task_run`` once, log any exception, then log timing figures."""
        start_time = time.time()
        try:
            self.task_run()
        except Exception as e:
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)
        end_time = time.time()

        self.log.info('起始时间: {st}'.format(st=start_time))
        self.log.info('结束时间: {et}'.format(et=end_time))
        self.log.info('消耗时间: {t}s'.format(t=end_time - start_time))
class StartTaskCrawler(object):
    """Consume crawl requests for a single province and run them on a pool.

    Messages are read from a beanstalk tube, validated, turned into task
    dictionaries and submitted to the province worker's
    ``query_online_task`` through a gevent pool (or a ``ThreadPool`` when the
    module-level ``is_debug`` flag is set).
    """

    # Maximum time, in seconds, that task_run keeps consuming before returning.
    MAX_RUN_TIME = 12 * 60 * 60

    def __init__(self, config_file='config/cmb_gsxt.conf', province=None):
        """Load the province configuration, connect to beanstalk, build workers.

        :param config_file: path of the configuration file to load.
        :param province: name of the config section / province to serve.
        :raises StandardError: if ``province`` or ``config_file`` is ``None``
            or empty, or if the configuration is incomplete (see
            ``load_config``).
        """
        self.worker_list = {}
        self.config_list = {}
        self.thread_num = 8
        self.province = province
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf
        self.crawl_flag = 'crawl_online'
        self.source_table = 'online_all_list'
        self.tube = ''

        # A target province is mandatory.
        if province is None or province == '':
            raise StandardError('province error...')

        # So is a configuration file.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        self.load_config(config_file)

        self.log = global_log

        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'],
                                     self.beanstalk_consumer_conf['port'])

        # Module-level database handle (a MongoDB connection, per the
        # original comment).
        self.source_db = source_db

        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create one worker per entry of ``config_list``.

        :param config_list: mapping of key -> config; every pair is passed to
            ``create_crawl_object(value, key)``.
        """
        for key, value in config_list.items():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))

    def load_config(self, config_file):
        """Read the section for ``self.province`` and apply its settings.

        :param config_file: path of the configuration file to parse.
        :raises StandardError: if the province section is missing, or if it
            lacks ``source_table`` or a non-empty ``consumer_tube``.
        """
        conf_parse = ConfigParser(config_file)

        config_dict = conf_parse.get_session(self.province)
        if config_dict is None:
            raise StandardError('站点信息错误...{province}'.format(province=self.province))

        # Optional override of the pool size.
        thread_num = config_dict.get('thread_num', None)
        if thread_num is not None:
            self.thread_num = int(thread_num)

        # The seed table is mandatory.
        source_table = config_dict.get('source_table', None)
        if source_table is None:
            raise StandardError('没有指定原始种子表: source_table')
        self.source_table = source_table

        # NOTE(review): eval() executes whatever expression the config file
        # contains.  If this value is always a plain literal, consider
        # ast.literal_eval instead -- confirm against the real config files.
        config = eval(config_dict.get('beanstalk_consumer_conf', 'None'))
        if config is not None:
            self.beanstalk_consumer_conf = config

        crawl_flag = config_dict.get('crawl_flag', 'crawl_online')
        if crawl_flag is not None:
            self.crawl_flag = crawl_flag

        # The tube to consume from is mandatory.
        consumer_tube = config_dict.get('consumer_tube', '')
        if consumer_tube is None or consumer_tube == '':
            raise StandardError('没有tube!!!')
        self.tube = consumer_tube

        self.config_list[self.province] = config_dict

    def task_run(self):
        """Consume messages until told to stop or ``MAX_RUN_TIME`` has elapsed.

        The loop ends when the module-level ``is_running`` flag is cleared or
        once ``MAX_RUN_TIME`` seconds have passed; all outstanding pool
        results are awaited before returning.  Each job is deleted from the
        queue before it is processed, so a failing message is not redelivered.
        """
        pending = []

        if not is_debug:
            self.pool = gevent.pool.Pool(self.thread_num)
        else:
            self.pool = ThreadPool(processes=self.thread_num)

        self.log.info('当前开启协程数目: thread_num = {num}'.format(num=self.thread_num))
        self.log.info('province: {province}服务已开启, 等待消费数据'.format(province=self.province))

        count = 0
        start_run_time = time.time()
        while is_running:
            # The second argument is presumably a timeout in seconds, so the
            # stop conditions are re-checked regularly -- TODO confirm.
            job = self.beanstalk.reserve(self.tube, 3)
            if job is not None:
                count += 1
                body = job.body
                job.delete()
                self.log.info('当前消费数据索引: {count}'.format(count=count))

                data = self._build_task(body)
                if data is not None:
                    pending.append(self.pool.apply_async(
                        self.worker_list[self.province].query_online_task,
                        args=(data,)))
                    # Bound the backlog: wait for a full batch before
                    # accepting more work.
                    if len(pending) >= 1000:
                        self._wait_all(pending)

            # Checked on every iteration, including after rejected messages,
            # so a stream of bad input cannot postpone the limit.  Presumably
            # an external supervisor restarts the service afterwards.
            if int(time.time()) - int(start_run_time) >= self.MAX_RUN_TIME:
                break

        if is_debug:
            self.pool.close()
            self.pool.join()

        self._wait_all(pending)
        self.log.info('收到退出信号, 安全退出...')

    def _build_task(self, body):
        """Validate one raw message; return its task dict, or None to skip it."""
        json_data = util.json_loads(body)
        # None means the body could not be parsed; any other non-dict value is
        # valid JSON without ``get`` and is rejected the same way.
        if not isinstance(json_data, dict):
            self.log.error('数据格式错误: msg = {msg}'.format(msg=body))
            time.sleep(5)
            return None

        province = json_data.get('province')
        if province is None or province == '':
            self.log.error('没有province: {msg}'.format(msg=body))
            return None

        company_name = json_data.get('company_name')
        unified_social_credit_code = json_data.get('unified_social_credit_code')
        start_schedule_time = json_data.get('start_schedule_time', '')

        if company_name is None and unified_social_credit_code is None:
            self.log.error('没有company 与 unified_social_credit_code: {msg}'.format(msg=body))
            return None

        if company_name == '':
            self.log.error('company = 空字符串, data = {data}'.format(
                data=body))
            return None

        if unified_social_credit_code == '':
            self.log.error('unified_social_credit_code = 空字符串, data = {data}'.format(
                data=body))
            return None

        if province != self.province:
            self.log.warn('province 不正确: province = {province} data = {body}'.format(
                province=self.province, body=body))
            return None

        # The company name takes precedence over the credit code.
        if company_name is not None:
            self.log.info('当前消费数据为: province = {province} company = {company}'.format(
                province=province, company=company_name))
            return {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'start_schedule_time': start_schedule_time,
            }

        self.log.info('当前消费数据为: province = {province} unified_social_credit_code = {code}'.format(
            province=province, code=unified_social_credit_code))
        # NOTE(review): the _id is derived from the raw code while the stored
        # field is stripped and upper-cased -- confirm this is intended.
        return {
            '_id': util.generator_id({}, unified_social_credit_code, province),
            'unified_social_credit_code': unified_social_credit_code.strip().upper(),
            'province': province,
            'in_time': util.get_now_time(),
            'start_schedule_time': start_schedule_time,
        }

    @staticmethod
    def _wait_all(results):
        """Block on every async result in ``results``, then empty the list."""
        for result in results:
            result.get()
        del results[:]

    def start_worker(self):
        """Run ``task_run`` once and log timing; exit with status 1 on failure.

        :raises SystemExit: with code 1 when ``task_run`` raises an exception.
        """
        start_time = time.time()
        try:
            self.task_run()
        except Exception as e:
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)
            # Same effect as the site-provided exit(1), without depending on
            # the interactive helper being installed.
            raise SystemExit(1)
        end_time = time.time()

        self.log.info('扫描起始时间: {st}'.format(st=start_time))
        self.log.info('扫描结束时间: {et}'.format(et=end_time))
        self.log.info('扫描消耗时间: {t}s'.format(t=end_time - start_time))