def main():
    """Seed ``offline_all_list`` from the ``company_expection_list`` file.

    Every non-blank line of the file is taken as one company name. The
    company is looked up in ``enterprise_data_gov``; when the stored
    province is a key of ``province_zh_to_py`` the mapped value is used,
    otherwise an error is logged and the province stays ``'gsxt'``.
    Records are written to ``offline_all_list`` in batches of 1000.
    """
    log.info('开始读取数据...')
    source_table = 'offline_all_list'
    app_data_table = 'enterprise_data_gov'
    batch_size = 1000
    result_list = []

    with open("company_expection_list") as p_file:
        for line in p_file:
            # strip() already removes "\r" and "\n" together with any
            # other surrounding whitespace.
            company_name = line.strip()
            if not company_name:
                # A blank line would otherwise become a seed whose
                # company name is the empty string.
                continue

            province = 'gsxt'

            item = app_data_db.find_one(app_data_table, {'company': company_name})
            if item is not None and 'province' in item and item['province'] in province_zh_to_py:
                province = province_zh_to_py[item['province']]
            else:
                log.error("省份查找失败: {}".format(company_name))

            result_list.append({
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
            })

            if len(result_list) >= batch_size:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]

    # Flush the remainder; skip the call when nothing is left.
    if result_list:
        source_db.insert_batch_data(source_table, result_list)

    log.info('数据发送完毕, 退出程序')
def main():
    """Seed ``offline_all_list`` with the companies listed in ``guangdong.txt``.

    Every non-blank line of the file is taken as one company name and
    stored with province ``"guangdong"`` and ``crawl_online`` set to 0.
    Records are written in batches of 1000, and the number of companies
    read is logged at the end.
    """
    log.info('开始读取数据...')
    source_table = 'offline_all_list'
    province = "guangdong"
    batch_size = 1000
    count = 0
    result_list = []

    with open("guangdong.txt") as p_file:
        for line in p_file:
            # strip() already removes "\r" and "\n".
            company_name = line.strip()
            if not company_name:
                # Blank lines are neither counted nor stored as seeds.
                continue

            count += 1
            result_list.append({
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'crawl_online': 0,
            })

            if len(result_list) >= batch_size:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]

    # Flush the remainder; skip the call when nothing is left.
    if result_list:
        source_db.insert_batch_data(source_table, result_list)

    log.info("总共发送数据: {}".format(count))
    log.info('数据发送完毕, 退出程序')
def main(search_name, province, unified_social_credit_code, param):
    """Insert one search record and push it to the province scheduler tube.

    The record is stored in ``online_all_search`` with an ``_id`` derived
    from the unified social credit code and the province. After a two
    second pause, a JSON message holding the code and the province is
    printed and put on the ``gs_<province>_scheduler`` beanstalk tube.

    :param search_name: value stored as both ``search_name`` and
        ``company_name``.
    :param province: province value stored on the record; also used to
        build the tube name.
    :param unified_social_credit_code: code stored on the record and sent
        in the queue message.
    :param param: stored unchanged under the ``param`` key.
    """
    item = {
        "province": province,
        "crawl_online": 0,
        "error_times": 0,
        "search_name": search_name,
        "rank": 1,
        "priority": 1,
        # The two timestamps below are fixed literals, not the current time.
        "in_time": "2017-04-25 01:42:30",
        "param": param,
        "crawl_online_time": "2017-02-28 04:35:18",
        "company_name": search_name,
        "unified_social_credit_code": unified_social_credit_code,
    }
    item["_id"] = util.generator_id(
        {'priority': 1}, unified_social_credit_code, province)

    source_db.insert_batch_data("online_all_search", [item])
    sleep(2)

    message = {
        'unified_social_credit_code': unified_social_credit_code,
        'province': province,
    }
    data_str = json.dumps(message)
    print(data_str)
    beanstalk.put('gs_{province}_scheduler'.format(province=province),
                  data_str)
# Example #4
# 0
def main():
    """Re-seed registration companies whose name contains a space.

    Walks every document of ``offline_crawl_data_registration_company``.
    Documents whose ``company`` value has no space are skipped. For the
    others, the seed stored under the original name is deleted from
    ``offline_all_list``, a new seed is built from the whitespace-stripped
    name, and the source document is flagged with ``deal_flag = 1``.
    Seeds are written in batches of 100.

    A failure on one document is logged and does not stop the run. A
    failure outside the per-document handling is logged as well.
    """
    source_table = 'offline_all_list'
    batch_size = 100
    count = 0
    all_count = 0
    result_list = []
    try:
        log.info('开始读取数据...')
        registration = db_query['offline_crawl_data_registration_company']
        for element in registration.find({}):
            try:
                all_count += 1
                company_name = element['company']
                if ' ' not in company_name:
                    continue

                # Drop the seed stored under the original name;
                # delete_one does nothing when no such seed exists.
                db_query_company_data['offline_all_list'].delete_one(
                    {'company_name': company_name})

                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue

                count += 1
                company_name = str(company_name).strip()
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                registration.update(
                    {'_id': element['_id']}, {"$set": {'deal_flag': 1}})
                log.info("已处理数据量: {}".format(all_count))

                result_list.append(data)
                if len(result_list) >= batch_size:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                # One bad document must not abort the whole run.
                log.error(e.message)

        if result_list:
            source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception as e:
        # NOTE(review): the original outer ``try`` had no handler at all
        # (a syntax error); logging the failure is assumed here - confirm.
        log.error(e.message)
# Example #5
# 0
def main():
    """Seed ``offline_all_list`` from the ``guangdong_baseinfo_0912`` collection.

    The ``_id`` of each source document is used as the company name. A
    document is turned into a seed only when the name contains '公司' and
    is found neither in ``enterprise_data_gov`` (field ``company``) nor in
    ``offline_all_list`` (field ``company_name``). Accepted documents are
    flagged with ``deal_flag = 1`` and their seeds are written in batches
    of 100 with province ``'guangdong'``.

    A failure on one document is logged and does not stop the run. A
    failure outside the per-document handling is logged as well.
    """
    source_table = 'offline_all_list'
    province = 'guangdong'
    batch_size = 100
    count = 0
    result_list = []
    try:
        log.info('开始读取数据...')
        base_info = db_query['guangdong_baseinfo_0912']
        for element in base_info.find({}):
            try:
                company_name = element['_id']
                if '公司' not in company_name:
                    continue

                # Skip companies that already have published data.
                if db_query_app_data['enterprise_data_gov'].find_one(
                        {'company': company_name}):
                    continue

                # Skip companies that are already seeded.
                if source_db.find_one(source_table,
                                      {'company_name': company_name}):
                    continue

                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                base_info.update(
                    {'_id': element['_id']}, {"$set": {'deal_flag': 1}})

                result_list.append(data)
                log.info("当前总计发送数据: {}".format(count))
                if len(result_list) >= batch_size:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                # One bad document must not abort the whole run.
                log.error(e.message)

        if result_list:
            source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception as e:
        # NOTE(review): the original outer ``try`` had no handler at all
        # (a syntax error); logging the failure is assumed here - confirm.
        log.error(e.message)
def main():
    """Seed ``offline_all_list`` from every document of ``enterprise_data_gov``.

    A document is skipped when its company is already present in
    ``offline_all_list``, when it fails the company-name filter below, or
    when its province is not a key of ``province_zh_to_py``. The remaining
    documents are turned into seeds and written in batches of 100.

    A failure on one document is logged and does not stop the run. A
    failure outside the per-document handling is logged as well.
    """
    source_table = 'offline_all_list'
    batch_size = 100
    count = 0
    result_list = []
    try:
        log.info('开始读取数据...')
        for element in db_query_app_data['enterprise_data_gov'].find({}):
            try:
                company_name = element['company']

                # Skip companies that are already seeded.
                if source_db.find_one(source_table,
                                      {'company_name': company_name}):
                    continue

                # NOTE(review): the second test checks whether the literal
                # text 'enterprise_type' occurs inside the enterprise_type
                # value, which looks unintended. It is kept unchanged
                # because the intended rule cannot be determined here.
                if '公司' not in company_name and 'enterprise_type' not in element[
                        'enterprise_type']:
                    continue

                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue

                count += 1
                result_list.append({
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                })
                log.info("当前总计发送数据: {}".format(count))
                if len(result_list) >= batch_size:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception as e:
                # One bad document must not abort the whole run.
                log.error(e.message)

        if result_list:
            source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception as e:
        # NOTE(review): the original outer ``try`` had no handler at all
        # (a syntax error); logging the failure is assumed here - confirm.
        log.error(e.message)
def main():
    """Copy ``cs2_online_all_search`` into ``online_all_search`` with recomputed ids.

    For every source record:

    * a record without ``_id`` is logged and skipped;
    * a record missing ``search_name``, ``priority`` or ``province`` is
      removed from the source table and skipped;
    * otherwise its ``_id`` is set to the value computed from priority,
      search name and province, and the record is queued for insertion.

    Queued records are written to the target table in batches of 1000.
    """
    source_table = 'cs2_online_all_search'
    target_table = 'online_all_search'
    batch_size = 1000
    count = 0
    result_list = []

    for item in source_db.traverse_batch(source_table):
        count += 1
        priority = item.get('priority')
        search_name = item.get('search_name')
        province = item.get('province')
        _id = item.get('_id')
        log.info('当前运行位置: count = {count}'.format(count=count))
        if _id is None:
            log.error('没有_id信息: item = {item}'.format(item=item))
            continue

        # Remove records that lack any of the key fields. The original
        # condition tested search_name twice and never tested province.
        if search_name is None or priority is None or province is None:
            source_db.db[source_table].remove(_id)
            log.info('清理字段不全链接信息: _id = {_id}'.format(_id=_id))
            continue

        # Store the record under the recomputed id.
        item['_id'] = util.generator_id({'priority': priority}, search_name,
                                        province)

        result_list.append(item)
        if len(result_list) >= batch_size:
            source_db.insert_batch_data(target_table, result_list)
            del result_list[:]

    if result_list:
        source_db.insert_batch_data(target_table, result_list)

    log.info('完成清理, 退出程序!')
# Example #8
# 0
def main():
    """Repeatedly turn unprocessed registration companies into crawl seeds.

    Each pass reads the documents of
    ``offline_crawl_data_registration_company`` whose ``deal_flag`` is 0.
    Documents whose province is not a key of ``province_zh_to_py`` are
    logged and skipped. For the others a seed is queued for
    ``offline_all_list`` and the document is flagged ``deal_flag = 1``.
    Seeds are written in batches of 100. The loop never ends: it sleeps
    100 seconds after a successful pass and 10 seconds after a failed one.
    """
    while True:
        try:
            count = 0
            log.info('开始读取数据...')
            source_table = 'offline_all_list'
            pending = db_query['offline_crawl_data_registration_company'].find(
                {'deal_flag': 0})
            result_list = []
            for element in pending:
                company_name = element['company']
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue

                count += 1
                seed = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                # Flag the source document as handled.
                db_query['offline_crawl_data_registration_company'].update(
                    {'_id': element['_id']}, {"$set": {'deal_flag': 1}})

                result_list.append(seed)
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]

            # Flush whatever is left of the last batch.
            source_db.insert_batch_data(source_table, result_list)
            log.info("总共发送数据: {}".format(count))
            log.info('数据发送完毕, 退出程序')
            time.sleep(100)
        except Exception as e:
            log.info(e.message)
            time.sleep(10)
    def report_crawl_fail(self, item):
        """Reset the crawl flag of a failed item so it can be picked up again.

        If ``report_status`` has the ``REPORT_SEARCH`` bit set and a search
        record matches the search name and province, that record's crawl
        flag is reset to 0 and the method returns. Otherwise, if the
        ``REPORT_SEED`` bit is set, the matching seed record is reset in
        the same way, or a new seed is inserted when none exists.

        :param item: mapping read through ``get``; the keys ``_id`` and
            ``search_name`` are used. When ``search_name`` is missing the
            ``_id`` is used in its place.
        """
        _id = item.get('_id')
        search_name = item.get('search_name')
        if search_name is None:
            search_name = _id
            self.log.info('search_name is None: _id = {_id}'.format(_id=_id))

        # Report back to the search list when that is enabled.
        if (self.report_status & self.REPORT_SEARCH) > 0:
            search_query = {'search_name': search_name, 'province': self.province}
            found = self.company_data_db.find_one(self.online_all_search, search_query)
            if found is not None:
                found[self.crawl_flag] = 0
                self.company_data_db.save(self.online_all_search, found)
                self.log.info('save online_all_search success {com}'.format(com=search_name))
                return

        # Report back to the seed list when that is enabled.
        if (self.report_status & self.REPORT_SEED) > 0:
            seed_query = {'company_name': _id, 'province': self.province}
            found = self.company_data_db.find_one(self.offline_all_list, seed_query)
            if found is not None:
                found[self.crawl_flag] = 0
                self.company_data_db.save(self.offline_all_list, found)
                self.log.info('save offline_all_list success {com}'.format(com=_id))
                return

            # No seed exists yet: create one with the crawl flag cleared.
            new_seed = {
                '_id': util.generator_id({}, _id, self.province),
                'company_name': _id,
                'province': self.province,
                'in_time': util.get_now_time(),
                self.crawl_flag: 0,
            }
            self.company_data_db.insert_batch_data(self.offline_all_list, [new_seed])
            self.log.info('insert new company = {company}'.format(company=_id))
    def save_search_list(self, company, code, param_list):
        """Store the parsed search results for one seed in the search table.

        Each entry of ``param_list`` that has a ``search_name`` becomes one
        record. An entry gets priority 0 when its search name equals the
        seed company name (also compared with the parentheses converted
        between ASCII and full-width forms) or when its unified social
        credit code equals ``code``; otherwise it gets priority 1.
        Priority 0 records also have their crawl flag reset to 0.

        The ``unified_social_credit_code`` key is removed from each stored
        entry of ``param_list`` (the caller's dicts are modified) and kept
        on the record itself instead.

        :param company: seed company name, or None.
        :param code: seed unified social credit code, or None.
        :param param_list: list of dicts produced by the list-page parser.
        :return: ``(param_list, match_param)`` where ``match_param`` is a
            copy of the last priority 0 entry, or None when there is none
            or when no search table is configured.
        """
        match_param = None
        if self.search_table is None:
            return param_list, match_param

        # Variants of the seed name with the parentheses converted to
        # full-width and to ASCII form. In the original both lines replaced
        # a character with itself and were no-ops.
        if company is not None:
            replace_name_1 = company.replace('(', '（').replace(')', '）')
            replace_name_2 = company.replace('（', '(').replace('）', ')')
        else:
            replace_name_1 = ''
            replace_name_2 = ''

        data_list = []
        for param in param_list:

            # An entry is stored only when it has a list name.
            search_name = param.get('search_name')
            if search_name is None:
                continue

            # The code is not kept inside the stored param. The key may be
            # present with a None value, so normalise only real values.
            unified_social_credit_code = param.pop(
                'unified_social_credit_code', None)
            if unified_social_credit_code is not None:
                unified_social_credit_code = unified_social_credit_code.strip(
                ).upper()

            # An exact name match or a code match gets the top priority 0.
            if search_name in (company, replace_name_1, replace_name_2) \
                    or (code is not None and code == unified_social_credit_code):
                priority = 0
            else:
                priority = 1

            data = {
                # Priority, list name and province form the primary key.
                '_id': util.generator_id({'priority': priority}, search_name,
                                         self.province),
                'search_name': search_name,
                'province': self.province,
                'in_time': util.get_now_time(),
                'param': param,
                # rank is the 1-based position among the stored entries
                'rank': len(data_list) + 1,
                'priority': priority,
                self.ERROR_TIMES: 0,
            }

            # Add the registration code.
            if unified_social_credit_code is not None:
                data['unified_social_credit_code'] = unified_social_credit_code

            # Record which seed produced this entry.
            if company is not None:
                data['company_name'] = company
            if code is not None:
                data['seed_code'] = code

            # An exact match resets the crawl status.
            if priority == 0:
                data[self.crawl_flag] = 0
                match_param = param.copy()

            data_list.append(data)

        # Nothing to store when no entry carried a search name.
        if data_list:
            self.source_db.insert_batch_data(self.search_table, data_list)

        return param_list, match_param
def main():
    """Seed ``offline_all_list`` from the ``zhuxiao_diaoxiao_company`` table.

    The ``_id`` of each source document is used as the company name. The
    province is resolved by ``get_province`` from the document's
    province, registered code and unified social credit code. Documents
    for which it returns None are logged and skipped. Seeds are written
    to the target database in batches of 10000.

    The source cursor is opened with ``no_cursor_timeout=True`` and is
    therefore closed in a ``finally`` block, so it is released even when
    the run fails part-way.
    """
    log.info('开始读取数据...')
    source_table = 'zhuxiao_diaoxiao_company'
    target_table = 'offline_all_list'
    batch_size = 10000
    source_table_curse = source_db.db[source_table].find(
        {},
        ['_id', 'province', 'registered_code', 'unified_social_credit_code'],
        no_cursor_timeout=True).batch_size(batch_size)

    insert_list = []
    count = 0
    real_insert_cnt = 0
    try:
        for item in source_table_curse:
            count += 1
            company_name = item.get('_id')
            if company_name is None:
                continue

            province = get_province(item.get('province'),
                                    item.get('registered_code'),
                                    item.get('unified_social_credit_code'))
            if province is None:
                log.error('计算省份信息失败: company = {company}'.format(company=company_name))
                continue

            insert_list.append({
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
            })
            real_insert_cnt += 1
            if len(insert_list) >= batch_size:
                target_db.insert_batch_data(target_table, insert_list, insert=True)
                del insert_list[:]
                log.info('insert 10000')

            log.info('当前进度: count = {count} company = {company}'.format(
                count=count, company=company_name))

        if insert_list:
            target_db.insert_batch_data(target_table, insert_list, insert=True)
            log.info('insert last data')
    finally:
        # Always release the cursor; it was opened with no_cursor_timeout.
        source_table_curse.close()

    log.info('总共插入数据为: {cnt}'.format(cnt=real_insert_cnt))
    log.info('数据发送完毕, 退出程序')
    def task_run(self):
        """Consume jobs from the beanstalk tube and dispatch them to the worker.

        A pool sized by ``self.thread_num`` is created: a gevent pool when
        ``is_debug`` is false, a ``ThreadPool`` otherwise. The loop then
        reserves jobs from ``self.tube``, validates the JSON body and
        submits a task dict to
        ``self.worker_list[self.province].query_online_task`` through
        ``apply_async``.

        The loop ends when the module-level ``is_running`` flag is false
        or when ``self.MAX_RUN_TIME`` has elapsed. After the loop the pool
        is joined and all outstanding results are collected.

        Behaviour worth knowing:

        * A job is deleted from the queue as soon as its body has been
          read, i.e. before it is validated or processed.
        * Every ``continue`` on a rejected message also skips the
          run-time check at the bottom of the loop for that iteration.
        """

        result_list = []

        # Create the coroutine pool (a thread pool in debug mode)
        if not is_debug:
            self.pool = gevent.pool.Pool(self.thread_num)
        else:
            self.pool = ThreadPool(processes=self.thread_num)

        self.log.info('当前开启协程数目: thread_num = {num}'.format(num=self.thread_num))
        self.log.info('province: {province}服务已开启, 等待消费数据'.format(province=self.province))
        # Original comment: "create the thread pool"; the lines below only
        # initialise the job counter and the start time.
        count = 0
        start_run_time = time.time()
        while True:

            if not is_running:
                break

            # NOTE(review): 3 is presumably the reserve timeout - confirm
            # against the beanstalk wrapper.
            job = self.beanstalk.reserve(self.tube, 3)
            if job is not None:
                count += 1
                body = job.body
                # The job is removed before processing; a failure later on
                # does not put it back on the queue.
                job.delete()
                self.log.info('当前消费数据索引: {count}'.format(count=count))
                json_data = util.json_loads(body)
                if json_data is None:
                    self.log.error('数据格式错误: msg = {msg}'.format(msg=body))
                    time.sleep(5)
                    continue

                province = json_data.get('province')
                if province is None or province == '':
                    self.log.error('没有province: {msg}'.format(msg=body))
                    continue

                company_name = json_data.get('company_name')
                unified_social_credit_code = json_data.get('unified_social_credit_code')
                start_schedule_time = json_data.get('start_schedule_time', '')
                # At least one of company name / credit code is required.
                if company_name is None and unified_social_credit_code is None:
                    self.log.error('没有company 与 unified_social_credit_code: {msg}'.format(msg=body))
                    continue

                if company_name is not None and company_name == '':
                    self.log.error('company = 空字符串, data = {data}'.format(
                        data=body))
                    continue

                if unified_social_credit_code is not None and unified_social_credit_code == '':
                    self.log.error('unified_social_credit_code = 空字符串, data = {data}'.format(
                        data=body))
                    continue

                # Only messages for this instance's province are handled.
                # The warning logs self.province, not the message's value.
                if province != self.province:
                    self.log.warn('province 不正确: province = {province} data = {body}'.format(
                        province=self.province, body=body))
                    continue

                if company_name is not None:
                    self.log.info('当前消费数据为: province = {province} company = {company}'.format(
                        province=province, company=company_name))
                elif unified_social_credit_code is not None:
                    self.log.info('当前消费数据为: province = {province} unified_social_credit_code = {code}'.format(
                        province=province, code=unified_social_credit_code))

                # Prefer the company name when it is present
                if company_name is not None:
                    data = {
                        '_id': util.generator_id({}, company_name, province),
                        'company_name': company_name,
                        'province': province,
                        'in_time': util.get_now_time(),
                        'start_schedule_time': start_schedule_time,
                    }
                else:
                    # The _id is built from the code as received, while the
                    # stored code is stripped and upper-cased.
                    data = {
                        '_id': util.generator_id({}, unified_social_credit_code, province),
                        'unified_social_credit_code': unified_social_credit_code.strip().upper(),
                        'province': province,
                        'in_time': util.get_now_time(),
                        'start_schedule_time': start_schedule_time,
                    }

                pool_result = self.pool.apply_async(self.worker_list[self.province].query_online_task,
                                                    args=(data,))

                # Every 1000 submitted tasks, wait for all of them and
                # reset the list of pending results.
                result_list.append(pool_result)
                if len(result_list) >= 1000:
                    for result in result_list:
                        result.get()
                    del result_list[:]

            # Leave the loop once the maximum run time is reached
            # (original comment: so that the service gets restarted)
            run_time = time.time()
            if int(run_time) - int(start_run_time) >= self.MAX_RUN_TIME:
                break

        # close() is called only for the debug-mode ThreadPool before join()
        if is_debug:
            self.pool.close()
        self.pool.join()

        # Collect the results of the tasks that are still pending.
        for result in result_list:
            result.get()
        del result_list[:]
        del result_list

        self.log.info('收到退出信号, 安全退出...')