Ejemplo n.º 1
0
    def __init__(self, config_file='config/cmb_gsxt.conf', province=None):
        """Build a crawler bound to one province and connect it to beanstalk.

        Args:
            config_file: Configuration file path handed to ``load_config``.
            province: Province this instance serves; stored on
                ``self.province``.

        Raises:
            StandardError: If ``province`` or ``config_file`` is None or ''.
        """
        # Refuse to start without a target province.
        if province is None or province == '':
            raise StandardError('province error...')

        # Refuse to start without a configuration file. This check used to
        # reuse the 'province error...' text, which blamed the wrong argument.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        # Defaults, assigned before load_config() is called.
        # NOTE(review): load_config is defined outside this block; presumably
        # it overrides some of these values - confirm against its body.
        self.worker_list = {}
        self.config_list = {}
        self.thread_num = 8
        self.province = province
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf
        self.crawl_flag = 'crawl_online'
        self.source_table = 'online_all_list'
        self.tube = ''

        # Load the configuration.
        self.load_config(config_file)

        # Logger shared by the whole process.
        self.log = global_log

        # Open the beanstalk connection using the settings as they stand
        # after load_config() has run.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'], self.beanstalk_consumer_conf['port'])

        # MongoDB handle.
        self.source_db = source_db

        # Create the workers described by the loaded configuration.
        self.init_worker(self.config_list)
Ejemplo n.º 2
0
    def __init__(self, server_conf=None, log=None, is_open=True):
        """Create the sender thread and, when enabled, connect to beanstalk.

        Args:
            server_conf: Mapping with the keys 'host', 'port' and 'tube'.
                Required when ``is_open`` is true.
            log: Logger object stored on ``self.log``.
            is_open: When false the message queue is disabled and no
                connection is made.

        Raises:
            StandardError: If ``is_open`` is true and ``server_conf`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True

        self.log = log

        # Whether the message queue is enabled at all.
        self.is_open = is_open

        # Every attribute is assigned before the early return below, so a
        # disabled instance is fully initialised and attribute reads on it do
        # not raise AttributeError. Previously the early return skipped all of
        # these and the `else: self.beanstalk = None` branch was unreachable.
        self.is_connect = True   # False while the connection is interrupted
        self.is_pause = False    # whether sending is currently throttled
        self.pause_time = self.PAUSE_TIME_LV1
        self.queue = Queue()     # in-process hand-off queue
        self.server_conf = server_conf
        self.beanstalk = None
        self.output_tube = None

        if not self.is_open:
            return

        if server_conf is None:
            raise StandardError('没有消息队列配置信息...')

        # Connect and remember the tube that messages are put on.
        self.beanstalk = PyBeanstalk(self.server_conf['host'], self.server_conf['port'])
        self.output_tube = self.server_conf['tube']
Ejemplo n.º 3
0
def parse_task():
    """Put one JSON parse message per entry of ``data_list`` on the parse tube.

    Each message is ``{"company": <name>, "province": "hunan"}``; the server
    address and tube name come from ``beanstalk_parse_conf``.
    """
    conf = beanstalk_parse_conf
    client = PyBeanstalk(conf['host'], conf['port'])
    tube_name = conf['tube']

    for name in data_list:
        message = {'company': name, 'province': 'hunan'}
        client.put(tube_name, json.dumps(message))
Ejemplo n.º 4
0
def main():
    """Put a single hard-coded Hunan company task on the consumer tube.

    The message is ``{"company": <name>, "province": "hunan"}``; the company
    name is also printed to stdout.
    """
    client = PyBeanstalk(beanstalk_consumer_conf['host'],
                         beanstalk_consumer_conf['port'])
    tube = beanstalk_consumer_conf['tube']

    company = '湖南汉璟真空玻璃科技有限公司'
    message = {
        'company': company,
        'province': 'hunan',
    }
    # Parenthesised so the line parses on Python 3 as well; with a single
    # argument it prints the same text on Python 2.
    print(company)
    client.put(tube, json.dumps(message))
Ejemplo n.º 5
0
def main():
    """Put one task per entry of ``company_info`` on the consumer tube.

    ``company_info`` is iterated as ``name -> info`` pairs and each ``info``
    is indexed with 'province'. Every company name is printed to stdout
    before its message is sent.
    """
    client = PyBeanstalk(beanstalk_consumer_conf['host'],
                         beanstalk_consumer_conf['port'])
    tube = beanstalk_consumer_conf['tube']

    # items() exists on both Python 2 and Python 3 dicts (iteritems() is
    # Python 2 only).
    # NOTE(review): assumes company_info is a plain dict - confirm.
    for company, info in company_info.items():
        message = {
            'company': company,
            'province': info['province'],
        }
        # Parenthesised so the line parses on Python 3 as well.
        print(company)
        client.put(tube, json.dumps(message))
Ejemplo n.º 6
0
def crawl_task():
    """Put one crawl message per entry of ``data_list`` on 'gs_hunan_scheduler'.

    Each message is ``{"company_name": <name>, "province": "hunan"}``. The
    beanstalk address and tube are hard-coded in this function.
    """
    host = 'cs0.sz-internal.haizhi.com'
    port = 11400
    tube_name = 'gs_hunan_scheduler'

    scheduler = PyBeanstalk(host, port)

    for name in data_list:
        message = {'company_name': name, 'province': 'hunan'}
        scheduler.put(tube_name, json.dumps(message))
Ejemplo n.º 7
0
def main():
    """Put a fixed list of Guizhou company tasks on the consumer tube.

    Each message is ``{"company_name": <name>, "province": "guizhou"}``; the
    serialised JSON is printed to stdout before it is sent.
    """
    client = PyBeanstalk(beanstalk_consumer_conf['host'],
                         beanstalk_consumer_conf['port'])
    tube = beanstalk_consumer_conf['tube']

    company_list = ['贵州大龙帝国网吧', '罗甸县网络帝国网咖', '玉屏国网线下百货店', '帝国网络会所', '玉屏县帝国网吧']

    for company_name in company_list:
        payload = json.dumps({
            'company_name': company_name,
            'province': 'guizhou',
        })
        # Parenthesised so the line parses on Python 3 as well; with a single
        # argument it prints the same text on Python 2.
        print(payload)
        client.put(tube, payload)
    def __init__(self, config_file='config/online_gsxt_parse.conf'):
        """Load the configuration, connect to beanstalk and create the workers.

        Args:
            config_file: Configuration file path handed to ``load_config``.

        Raises:
            StandardError: If ``config_file`` is None or ''.
        """
        # Refuse to start without a configuration file. This check used to
        # raise 'province error...', although no province is involved here.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        self.worker_list = {}
        self.config_list = {}
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf

        # Load the configuration.
        self.load_config(config_file)

        # Logger shared by the whole process.
        self.log = global_log

        # Open the beanstalk connection and remember the tube to consume.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'], self.beanstalk_consumer_conf['port'])
        self.tube = self.beanstalk_consumer_conf['tube']

        # Create the workers described by the loaded configuration.
        self.init_worker(self.config_list)
class StartTaskCrawler(object):
    """Consume company tasks from a beanstalk tube and dispatch them by province.

    ``load_config`` reads every section of the configuration file,
    ``init_worker`` builds one worker per section, and ``task_run`` reserves
    jobs from ``self.tube`` and calls ``query_online_task(company)`` on the
    worker registered for the job's province.
    """

    def __init__(self, config_file='config/online_gsxt_parse.conf'):
        """Load the configuration, connect to beanstalk and create the workers.

        Args:
            config_file: Configuration file path handed to ``load_config``.

        Raises:
            StandardError: If ``config_file`` is None or ''.
        """
        # Refuse to start without a configuration file. This check used to
        # raise 'province error...', although no province is involved here.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        self.worker_list = {}
        self.config_list = {}
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf

        # Load the configuration (fills self.config_list).
        self.load_config(config_file)

        # Logger shared by the whole process.
        self.log = global_log

        # Open the beanstalk connection and remember the tube to consume.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'], self.beanstalk_consumer_conf['port'])
        self.tube = self.beanstalk_consumer_conf['tube']

        # Create one worker per configuration section.
        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create a crawl worker for every ``key -> config`` entry.

        Args:
            config_list: Mapping of section name to that section's settings;
                each pair is passed to ``create_crawl_object(value, key)``.
        """
        self.log.info('初始化worker')
        # items() works on both Python 2 and Python 3 dicts.
        for key, value in config_list.items():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))
        self.log.info('初始化全部worker完成...')

    def load_config(self, config_file):
        """Read ``config_file`` and store all of its sections in ``self.config_list``."""
        conf_parse = ConfigParser(config_file)
        self.config_list = conf_parse.get_all_session()

    def task_run(self):
        """Consume jobs until the module-level ``is_running`` flag turns false.

        Each job is deleted from the tube as soon as its body has been read,
        i.e. before the message is validated or processed. Messages that are
        not JSON, lack 'company' or 'province', carry an empty company, or
        name a province without a worker are logged and skipped.
        """
        self.log.info('服务已开启, 等待消费数据')
        count = 0
        while is_running:
            # Wait up to 3 seconds for a job; None means nothing arrived.
            job = self.beanstalk.reserve(self.tube, 3)
            if job is None:
                continue

            count += 1
            body = job.body
            job.delete()
            self.log.info('当前消费数据索引: {count}'.format(count=count))

            json_data = util.json_loads(body)
            if json_data is None:
                self.log.error('数据不是json格式: data = {data}'.format(data=body))
                continue

            company = json_data.get('company', None)
            province = json_data.get('province', None)
            if company is None or province is None:
                self.log.error('数据格式错误: data = {data}'.format(data=json_data))
                continue

            if company == '':
                self.log.error('company = 空字符串')
                continue

            if province not in self.worker_list:
                self.log.error('不支持当前省份: province = {province}'.format(province=province))
                continue

            self.log.info('当前消费数据为: company = {company}'.format(company=company))
            self.worker_list[province].query_online_task(company)

        self.log.info('收到退出信号, 安全退出...')

    def start_worker(self):
        """Run ``task_run`` once, logging any exception and the elapsed time."""
        start_time = time.time()

        try:
            self.task_run()
        except Exception as e:
            # Top-level boundary: log the failure instead of propagating it.
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)

        end_time = time.time()
        self.log.info('起始时间: {st}'.format(st=start_time))
        self.log.info('结束时间: {et}'.format(et=end_time))
        self.log.info('消耗时间: {t}s'.format(t=end_time - start_time))
Ejemplo n.º 10
0
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******',
}

# Shared MongoDB handle built from the db_conf settings.
source_db = MongDb(
    db_conf['host'],
    db_conf['port'],
    db_conf['db'],
    db_conf['username'],
    db_conf['password'],
    log=log,
)

# Address of the beanstalk server, and a module-level client connected to it.
beanstalk_consumer_conf = dict(host='cs0.sz-internal.haizhi.com', port=11400)
beanstalk = PyBeanstalk(
    beanstalk_consumer_conf['host'],
    beanstalk_consumer_conf['port'],
)

province_zh_to_py = {
    u'上海': 'shanghai',
    u'云南': 'yunnan',
    u'内蒙古': 'neimenggu',
    u'北京': 'beijing',
    u'吉林': 'jilin',
    u'四川': 'sichuan',
    u'天津': 'tianjin',
    u'宁夏': 'ningxia',
    u'安徽': 'anhui',
    u'山东': 'shandong',
    u'山西': 'shanxicu',
    u'广东': 'guangdong',
    u'广西': 'guangxi',
Ejemplo n.º 11
0
class MqQueueThread(threading.Thread):
    """Daemon thread that forwards queued messages to a beanstalk tube.

    Producers call ``push_msg``; ``run`` takes messages off an in-process
    queue and puts them on ``output_tube``, retrying each message until the
    put succeeds. ``close`` enqueues a stop sentinel that makes ``run`` return.
    """

    # Tube backlog thresholds checked by is_need_pause().
    PAUSE_COUNT_LV1 = 1000
    PAUSE_COUNT_LV2 = 10000
    PAUSE_COUNT_LV3 = 50000
    PAUSE_COUNT_LV4 = 100000
    PAUSE_COUNT_LV5 = 1000000

    # Sleep durations passed to time.sleep() for the matching threshold.
    PAUSE_TIME_LV1 = 1
    PAUSE_TIME_LV2 = 3
    PAUSE_TIME_LV3 = 10
    PAUSE_TIME_LV4 = 20
    PAUSE_TIME_LV5 = 300

    # Sentinel that close() enqueues to make run() exit.
    _STOP_SIGNAL = '@@##$$'

    def __init__(self, server_conf=None, log=None, is_open=True):
        """Create the sender thread and, when enabled, connect to beanstalk.

        Args:
            server_conf: Mapping with the keys 'host', 'port' and 'tube'.
                Required when ``is_open`` is true.
            log: Logger object used by every method of this class.
            is_open: When false the message queue is disabled: ``push_msg``
                drops messages and ``run`` returns immediately.

        Raises:
            StandardError: If ``is_open`` is true and ``server_conf`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True

        self.log = log

        # Whether the message queue is enabled at all.
        self.is_open = is_open

        # Every attribute is assigned before the early return below, so a
        # disabled instance is fully initialised (close() used to raise
        # AttributeError on it, and the old `else: self.beanstalk = None`
        # branch could never run).
        self.is_connect = True   # False while the connection is interrupted
        self.is_pause = False    # whether sending is currently throttled
        self.pause_time = self.PAUSE_TIME_LV1
        self.queue = Queue()     # in-process hand-off queue
        self.server_conf = server_conf
        self.beanstalk = None
        self.output_tube = None

        if not self.is_open:
            return

        if server_conf is None:
            raise StandardError('没有消息队列配置信息...')

        # Connect and remember the tube that messages are put on.
        self.beanstalk = PyBeanstalk(self.server_conf['host'], self.server_conf['port'])
        self.output_tube = self.server_conf['tube']

    def __del__(self):
        # The log may be None (its default) or missing if __init__ failed
        # early; a finalizer must not raise because of that.
        log = getattr(self, 'log', None)
        if log is not None:
            log.info('消息队列线程退出...')

    def is_need_pause(self):
        """Update ``is_pause``/``pause_time`` from the tube backlog and sleep if needed.

        When the backlog cannot be read, it is treated as ``PAUSE_COUNT_LV1``,
        which results in the shortest pause. Below ``PAUSE_COUNT_LV1`` the
        method returns without sleeping.
        """
        try:
            count = self.beanstalk.get_tube_count(self.output_tube)
        except Exception as e:
            self.log.error('获取当前队列数目失败..开启消息队列休眠...')
            self.log.exception(e)
            count = self.PAUSE_COUNT_LV1

        if count < self.PAUSE_COUNT_LV1:
            self.is_pause = False
            self.pause_time = self.PAUSE_TIME_LV1
            return

        self.is_pause = True
        if count >= self.PAUSE_COUNT_LV5:
            self.pause_time = self.PAUSE_TIME_LV5
        elif count >= self.PAUSE_COUNT_LV4:
            self.pause_time = self.PAUSE_TIME_LV4
        elif count >= self.PAUSE_COUNT_LV3:
            self.pause_time = self.PAUSE_TIME_LV3
        elif count >= self.PAUSE_COUNT_LV2:
            self.pause_time = self.PAUSE_TIME_LV2
        else:
            self.pause_time = self.PAUSE_TIME_LV1

        # Throttle the caller.
        time.sleep(self.pause_time)

    def close(self):
        """Ask ``run`` to exit by enqueueing the stop sentinel."""
        self.queue.put_nowait(self._STOP_SIGNAL)
        self.log.info('发送线程退出指令...')

    def push_msg(self, msg):
        """Queue ``str(msg)`` for sending; a no-op when the queue is disabled."""
        if self.is_open:
            self.queue.put_nowait(str(msg))

    def run(self):
        """Forward queued messages until the stop sentinel is received."""
        self.log.info('开始运行消息队列...')
        while True:
            # Nothing to do when the message queue is disabled.
            if not self.is_open:
                self.log.info('没有打开消息队列, 退出!')
                break

            try:
                msg = self.queue.get()
                if msg == self._STOP_SIGNAL:
                    break
                self._send_until_success(msg)
            except Exception as e:
                self.log.info('当前队列大小: size = {size}'.format(size=self.queue.qsize()))
                self.log.exception(e)
                time.sleep(5)

        self.log.info('消息队列线程正常退出.')

    def _send_until_success(self, msg):
        """Put ``msg`` on the output tube, retrying every 10 seconds until it succeeds."""
        while True:
            try:
                self.beanstalk.put(self.output_tube, msg)
                # The put succeeded, so the connection is healthy.
                self.is_connect = True
                return
            except SocketError as e:
                # Mark the connection as interrupted so producers can slow down.
                self.is_connect = False
                self.log.exception(e)
                time.sleep(10)
                try:
                    self.beanstalk.reconnect()
                    self.log.warn("reconnect beanstalk...")
                except Exception as reconnect_error:
                    # A failed reconnect used to escape to run()'s outer
                    # handler, which dropped the message being sent. Stay in
                    # the retry loop instead so the message is kept.
                    self.log.error('reconnect beanstalk failed, will retry...')
                    self.log.exception(reconnect_error)
            except Exception as e:
                self.is_connect = False
                self.log.error('捕获异常休眠...')
                self.log.exception(e)
                time.sleep(10)
Ejemplo n.º 12
0
class StartTaskCrawler(object):
    """Consume crawl tasks for one province from beanstalk and run them on a pool.

    ``load_config`` reads the province's section of the configuration file,
    ``init_worker`` builds the worker for it, and ``task_run`` reserves jobs
    from ``self.tube``, validates them and submits them to the worker's
    ``query_online_task`` through a gevent pool (or a thread pool when the
    module-level ``is_debug`` flag is set).
    """

    # Longest time, in seconds, that task_run keeps consuming before returning.
    MAX_RUN_TIME = 12 * 60 * 60

    def __init__(self, config_file='config/cmb_gsxt.conf', province=None):
        """Build a crawler bound to one province and connect it to beanstalk.

        Args:
            config_file: Configuration file path handed to ``load_config``.
            province: Name of the configuration section / province to serve.

        Raises:
            StandardError: If ``province`` or ``config_file`` is None or '',
                or if ``load_config`` rejects the configuration.
        """
        # Refuse to start without a target province.
        if province is None or province == '':
            raise StandardError('province error...')

        # Refuse to start without a configuration file. This check used to
        # reuse the 'province error...' text, which blamed the wrong argument.
        if config_file is None or config_file == '':
            raise StandardError('config_file error...')

        # Defaults; load_config() below overrides several of them.
        self.worker_list = {}
        self.config_list = {}
        self.thread_num = 8
        self.province = province
        self.pool = None
        self.beanstalk_consumer_conf = beanstalk_consumer_conf
        self.crawl_flag = 'crawl_online'
        self.source_table = 'online_all_list'
        self.tube = ''

        # Load the configuration.
        self.load_config(config_file)

        # Logger shared by the whole process.
        self.log = global_log

        # Open the beanstalk connection with the (possibly overridden) settings.
        self.beanstalk = PyBeanstalk(self.beanstalk_consumer_conf['host'], self.beanstalk_consumer_conf['port'])

        # MongoDB handle.
        self.source_db = source_db

        # Create the worker for the configured province.
        self.init_worker(self.config_list)

    def init_worker(self, config_list):
        """Create a crawl worker for every ``key -> config`` entry.

        Args:
            config_list: Mapping of province name to its settings; each pair
                is passed to ``create_crawl_object(value, key)``.
        """
        # items() works on both Python 2 and Python 3 dicts.
        for key, value in config_list.items():
            self.worker_list[key] = create_crawl_object(value, key)
            self.log.info('初始化 {key} 完成..'.format(key=key))

    def load_config(self, config_file):
        """Read this province's section of ``config_file`` into the instance.

        Sets ``thread_num``, ``source_table``, ``beanstalk_consumer_conf``,
        ``crawl_flag`` and ``tube`` from the section and registers the section
        in ``self.config_list`` under ``self.province``.

        Raises:
            StandardError: If the section is missing, or it has no
                'source_table' or no non-empty 'consumer_tube'.
        """
        conf_parse = ConfigParser(config_file)

        # Only the section of the configured province is loaded.
        config_dict = conf_parse.get_session(self.province)
        if config_dict is None:
            raise StandardError('站点信息错误...{province}'.format(province=self.province))

        # Optional override of the pool size.
        thread_num = config_dict.get('thread_num', None)
        if thread_num is not None:
            self.thread_num = int(thread_num)

        # The seed table is mandatory.
        source_table = config_dict.get('source_table', None)
        if source_table is None:
            raise StandardError('没有指定原始种子表: source_table')
        self.source_table = source_table

        # Optional override of the beanstalk connection settings.
        # SECURITY NOTE(review): eval() executes whatever expression the
        # configuration file contains. This is only acceptable while the file
        # is fully trusted; consider ast.literal_eval if the value is always a
        # plain dict literal.
        config = eval(config_dict.get('beanstalk_consumer_conf', 'None'))
        if config is not None:
            self.beanstalk_consumer_conf = config

        # Crawl flag.
        crawl_flag = config_dict.get('crawl_flag', 'crawl_online')
        if crawl_flag is not None:
            self.crawl_flag = crawl_flag

        # The tube to consume from is mandatory.
        consumer_tube = config_dict.get('consumer_tube', '')
        if consumer_tube is None or consumer_tube == '':
            raise StandardError('没有tube!!!')
        self.tube = consumer_tube

        # Register the section so init_worker() builds a worker for it.
        self.config_list[self.province] = config_dict

    def _build_task(self, body):
        """Validate one raw message body and build the worker payload.

        Returns:
            The dict to pass to ``query_online_task``, or None when the
            message was rejected (the reason is logged).
        """
        json_data = util.json_loads(body)
        if json_data is None:
            self.log.error('数据格式错误: msg = {msg}'.format(msg=body))
            time.sleep(5)
            return None

        province = json_data.get('province')
        if province is None or province == '':
            self.log.error('没有province: {msg}'.format(msg=body))
            return None

        company_name = json_data.get('company_name')
        unified_social_credit_code = json_data.get('unified_social_credit_code')
        start_schedule_time = json_data.get('start_schedule_time', '')
        if company_name is None and unified_social_credit_code is None:
            self.log.error('没有company 与 unified_social_credit_code: {msg}'.format(msg=body))
            return None

        if company_name is not None and company_name == '':
            self.log.error('company = 空字符串, data = {data}'.format(
                data=body))
            return None

        if unified_social_credit_code is not None and unified_social_credit_code == '':
            self.log.error('unified_social_credit_code = 空字符串, data = {data}'.format(
                data=body))
            return None

        # This instance only serves its own province.
        if province != self.province:
            self.log.warn('province 不正确: province = {province} data = {body}'.format(
                province=self.province, body=body))
            return None

        # The company name takes precedence over the credit code.
        if company_name is not None:
            self.log.info('当前消费数据为: province = {province} company = {company}'.format(
                province=province, company=company_name))
            return {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'start_schedule_time': start_schedule_time,
            }

        self.log.info('当前消费数据为: province = {province} unified_social_credit_code = {code}'.format(
            province=province, code=unified_social_credit_code))
        return {
            # NOTE(review): the id is generated from the raw code while the
            # stored code is stripped and upper-cased - confirm this is intended.
            '_id': util.generator_id({}, unified_social_credit_code, province),
            'unified_social_credit_code': unified_social_credit_code.strip().upper(),
            'province': province,
            'in_time': util.get_now_time(),
            'start_schedule_time': start_schedule_time,
        }

    def task_run(self):
        """Consume and dispatch jobs until stopped or ``MAX_RUN_TIME`` elapses.

        The loop ends when the module-level ``is_running`` flag turns false or
        the run has lasted ``MAX_RUN_TIME`` seconds. Each job is deleted from
        the tube as soon as its body has been read, before it is validated.
        Pending results are collected every 1000 submissions and once more
        after the loop.
        """
        result_list = []

        # gevent pool normally, a thread pool in debug mode.
        if not is_debug:
            self.pool = gevent.pool.Pool(self.thread_num)
        else:
            self.pool = ThreadPool(processes=self.thread_num)

        self.log.info('当前开启协程数目: thread_num = {num}'.format(num=self.thread_num))
        self.log.info('province: {province}服务已开启, 等待消费数据'.format(province=self.province))

        count = 0
        start_run_time = time.time()
        while is_running:
            # Wait up to 3 seconds for a job; None means nothing arrived.
            job = self.beanstalk.reserve(self.tube, 3)
            if job is not None:
                count += 1
                body = job.body
                job.delete()
                self.log.info('当前消费数据索引: {count}'.format(count=count))

                data = self._build_task(body)
                if data is not None:
                    pool_result = self.pool.apply_async(self.worker_list[self.province].query_online_task,
                                                        args=(data,))
                    result_list.append(pool_result)

                    # Wait for the batch so the result list stays bounded.
                    if len(result_list) >= 1000:
                        for result in result_list:
                            result.get()
                        del result_list[:]

            # Stop after the maximum run time so the service can be restarted.
            # Rejected messages also reach this check; they used to skip it.
            if int(time.time()) - int(start_run_time) >= self.MAX_RUN_TIME:
                break

        # Only the debug-mode pool is closed before joining.
        if is_debug:
            self.pool.close()
        self.pool.join()

        for result in result_list:
            result.get()
        del result_list[:]

        self.log.info('收到退出信号, 安全退出...')

    def start_worker(self):
        """Run ``task_run`` once; exit the process with status 1 if it raises."""
        import sys

        start_time = time.time()

        try:
            self.task_run()
        except Exception as e:
            self.log.error('周期任务异常!!!!')
            self.log.exception(e)
            # sys.exit is always available; the bare exit() builtin is only
            # injected by the site module.
            sys.exit(1)

        end_time = time.time()
        self.log.info('扫描起始时间: {st}'.format(st=start_time))
        self.log.info('扫描结束时间: {et}'.format(et=end_time))
        self.log.info('扫描消耗时间: {t}s'.format(t=end_time - start_time))