def main():
    log.info('开始读取数据...')
    source_table = 'offline_all_list'
    app_data_table = 'enterprise_data_gov'
    with open("company_expection_list") as p_file:
        result_list = list()
        for line in p_file:
            company_name = line.strip()
            province = 'gsxt'
            item = app_data_db.find_one(app_data_table, {'company': company_name})
            if item is not None and 'province' in item and item['province'] in province_zh_to_py:
                province = province_zh_to_py[item['province']]
            else:
                log.error("省份查找失败: {}".format(company_name))
            data = {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
            }
            result_list.append(data)
            if len(result_list) >= 1000:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]
        source_db.insert_batch_data(source_table, result_list)
    log.info('数据发送完毕, 退出程序')
def main():
    count = 0
    log.info('开始读取数据...')
    source_table = 'offline_all_list'
    with open("guangdong.txt") as p_file:
        result_list = list()
        province = "guangdong"
        for line in p_file:
            company_name = line.strip()
            count += 1
            data = {
                '_id': util.generator_id({}, company_name, province),
                'company_name': company_name,
                'province': province,
                'in_time': util.get_now_time(),
                'crawl_online': 0,
            }
            result_list.append(data)
            if len(result_list) >= 1000:
                source_db.insert_batch_data(source_table, result_list)
                del result_list[:]
        source_db.insert_batch_data(source_table, result_list)
    log.info("总共发送数据: {}".format(count))
    log.info('数据发送完毕, 退出程序')
def main(search_name, province, unified_social_credit_code, param):
    item = {
        "_id": "9c9d8f8b848514f240f54a40b0a0c6f02622b3d87d54d353e525ca58d9dbe312",
        "province": province,
        "crawl_online": 0,
        "error_times": 0,
        "search_name": search_name,
        "rank": 1,
        "priority": 1,
        "in_time": "2017-04-25 01:42:30",
        "param": param,
        "crawl_online_time": "2017-02-28 04:35:18",
        "company_name": search_name,
        "unified_social_credit_code": unified_social_credit_code
    }
    item["_id"] = util.generator_id({'priority': 1}, unified_social_credit_code, item["province"])
    source_db.insert_batch_data("online_all_search", [item])
    sleep(2)
    tube = 'gs_{province}_scheduler'.format(province=item['province'])
    data = {
        'unified_social_credit_code': item['unified_social_credit_code'],
        'province': item['province'],
    }
    data_str = json.dumps(data)
    print data_str
    beanstalk.put(tube, data_str)
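# Hedged usage sketch (not part of the original script; every argument value
# below is a placeholder):
if __name__ == '__main__':
    main(search_name='某某科技有限公司',
         province='guangdong',
         unified_social_credit_code='91440000XXXXXXXXXX',
         param={})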
def main():
    try:
        count = 0
        all_count = 0
        log.info('开始读取数据...')
        source_table = 'offline_all_list'
        cursor = db_query['offline_crawl_data_registration_company'].find({})
        result_list = list()
        for element in cursor:
            try:
                all_count += 1
                company_name = element['company']
                if ' ' in company_name:
                    query_result = db_query_company_data['offline_all_list'].find_one(
                        {'company_name': company_name})
                    if query_result:
                        db_query_company_data['offline_all_list'].delete_one(
                            {'company_name': company_name})
                    else:
                        continue
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                company_name = str(company_name).strip()
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['offline_crawl_data_registration_company'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                log.info("已处理数据量: {}".format(all_count))
                result_list.append(data)
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception, e:
                print e.message
        source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception, e:
        # Assumed outer handler (mirrors the looping variant of this script)
        log.error(e.message)
def main():
    try:
        count = 0
        log.info('开始读取数据...')
        source_table = 'offline_all_list'
        cursor = db_query['guangdong_baseinfo_0912'].find({})
        result_list = list()
        for element in cursor:
            try:
                company_name = element['_id']
                if '公司' not in company_name:
                    continue
                query = db_query_app_data['enterprise_data_gov'].find_one(
                    {'company': company_name})
                if query:
                    continue
                query = source_db.find_one('offline_all_list',
                                           {'company_name': company_name})
                if query:
                    continue
                province = 'guangdong'
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['guangdong_baseinfo_0912'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                result_list.append(data)
                log.info("当前总计发送数据: {}".format(count))
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception, e:
                print e.message
        source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception, e:
        # Assumed outer handler (mirrors the looping variant of this script)
        log.error(e.message)
def main():
    try:
        count = 0
        log.info('开始读取数据...')
        source_table = 'offline_all_list'
        cursor = db_query_app_data['enterprise_data_gov'].find({})
        result_list = list()
        for element in cursor:
            try:
                company_name = element['company']
                query = source_db.find_one('offline_all_list',
                                           {'company_name': company_name})
                if query:
                    continue
                # Skip records that are neither named like a company nor carry
                # an enterprise_type field
                if '公司' not in company_name and 'enterprise_type' not in element:
                    continue
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                result_list.append(data)
                log.info("当前总计发送数据: {}".format(count))
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            except Exception, e:
                print e.message
        source_db.insert_batch_data(source_table, result_list)
        log.info("总共发送数据: {}".format(count))
        log.info('数据发送完毕, 退出程序')
        time.sleep(100)
    except Exception, e:
        # Assumed outer handler (mirrors the looping variant of this script)
        log.error(e.message)
def main():
    source_table = 'cs2_online_all_search'
    target_table = 'online_all_search'
    count = 0
    result_list = []
    append_total = 0
    for item in source_db.traverse_batch(source_table):
        count += 1
        priority = item.get('priority', None)
        search_name = item.get('search_name', None)
        province = item.get('province', None)
        _id = item.get('_id', None)
        log.info('当前运行位置: count = {count}'.format(count=count))
        if _id is None:
            log.error('没有_id信息: item = {item}'.format(item=item))
            continue
        # Remove entries that are missing any of these key fields
        if search_name is None or priority is None or province is None:
            source_db.db[source_table].remove(_id)
            log.info('清理字段不全链接信息: _id = {_id}'.format(_id=_id))
            continue
        cal_id = util.generator_id({'priority': priority}, search_name, province)
        if _id != cal_id:
            item['_id'] = cal_id
            result_list.append(item)
            append_total += 1
            if append_total >= 1000:
                source_db.insert_batch_data(target_table, result_list)
                append_total = 0
                del result_list[:]
    if len(result_list) > 0:
        source_db.insert_batch_data(target_table, result_list)
    log.info('完成清理, 退出程序!')
def main():
    while True:
        try:
            count = 0
            log.info('开始读取数据...')
            source_table = 'offline_all_list'
            cursor = db_query['offline_crawl_data_registration_company'].find(
                {'deal_flag': 0})
            result_list = list()
            for element in cursor:
                company_name = element['company']
                province = province_zh_to_py.get(str(element['province']), '')
                if province == '':
                    log.info("province error: {}".format(element['_id']))
                    continue
                count += 1
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'crawl_online': 0,
                }
                mod_value = {'deal_flag': 1}
                db_query['offline_crawl_data_registration_company'].update(
                    {'_id': element['_id']}, {"$set": mod_value})
                result_list.append(data)
                if len(result_list) >= 100:
                    source_db.insert_batch_data(source_table, result_list)
                    del result_list[:]
            source_db.insert_batch_data(source_table, result_list)
            log.info("总共发送数据: {}".format(count))
            log.info('数据发送完毕, 退出程序')
            time.sleep(100)
        except Exception, e:
            log.info(e.message)
            time.sleep(10)
def report_crawl_fail(self, item):
    _id = item.get('_id')
    search_name = item.get('search_name')
    if search_name is None:
        search_name = _id
        self.log.info('search_name is None: _id = {_id}'.format(_id=_id))

    # Report the failure back to the search list if that flag is enabled
    if (self.report_status & self.REPORT_SEARCH) > 0:
        result_item = self.company_data_db.find_one(
            self.online_all_search,
            {'search_name': search_name, 'province': self.province})
        if result_item is not None:
            result_item[self.crawl_flag] = 0
            self.company_data_db.save(self.online_all_search, result_item)
            self.log.info('save online_all_search success {com}'.format(com=search_name))
            return

    # Report the failure back to the seed list if that flag is enabled
    if (self.report_status & self.REPORT_SEED) > 0:
        result_item = self.company_data_db.find_one(
            self.offline_all_list,
            {'company_name': _id, 'province': self.province})
        if result_item is not None:
            result_item[self.crawl_flag] = 0
            self.company_data_db.save(self.offline_all_list, result_item)
            self.log.info('save offline_all_list success {com}'.format(com=_id))
            return

    data = {
        '_id': util.generator_id({}, _id, self.province),
        'company_name': _id,
        'province': self.province,
        'in_time': util.get_now_time(),
        self.crawl_flag: 0,
    }
    self.company_data_db.insert_batch_data(self.offline_all_list, [data])
    self.log.info('insert new company = {company}'.format(company=_id))
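# Hedged sketch (assumed values, not taken from the source): the bitwise tests
# above imply that REPORT_SEARCH and REPORT_SEED are power-of-two flags that
# can be OR-ed together into report_status.
REPORT_SEARCH = 0x01   # assumed: feed failures back to online_all_search
REPORT_SEED = 0x02     # assumed: feed failures back to offline_all_list
# e.g. report_status = REPORT_SEARCH | REPORT_SEED would enable both paths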
def save_search_list(self, company, code, param_list):
    match_param = None
    if self.search_table is None:
        return param_list, match_param
    rank = 1
    data_list = []
    for param in param_list:
        # Only store entries that carry a search-list name
        search_name = param.get('search_name')
        if search_name is None:
            continue
        # Pull out the parsed unified social credit code
        unified_social_credit_code = param.get('unified_social_credit_code')
        # Do not keep the unified social credit code inside the param itself
        if 'unified_social_credit_code' in param:
            unified_social_credit_code = unified_social_credit_code.strip().upper()
            param.pop('unified_social_credit_code')
        if company is not None:
            replace_name_1 = company.replace('(', '(').replace(')', ')')
            replace_name_2 = company.replace('(', '(').replace(')', ')')
        else:
            replace_name_1 = ''
            replace_name_2 = ''
        # Decide the priority: an exact match between the seed name (or its
        # code) and the search-list name gets the highest priority, 0
        if search_name == company \
                or search_name == replace_name_1 \
                or search_name == replace_name_2 \
                or (code == unified_social_credit_code and code is not None):
            priority = 0
        else:
            priority = 1
        data = {
            # The search-list name plus the province acts as the unique key
            '_id': util.generator_id({'priority': priority}, search_name, self.province),
            'search_name': search_name,
            'province': self.province,
            'in_time': util.get_now_time(),
            'param': param,
            'rank': rank,
            'priority': priority,
            self.ERROR_TIMES: 0,
        }
        # Attach the registration code
        if unified_social_credit_code is not None:
            data['unified_social_credit_code'] = unified_social_credit_code
        # Attach the seed information
        if company is not None:
            data['company_name'] = company
        if code is not None:
            data['seed_code'] = code
        # On an exact match, reset the crawl status
        if priority == 0:
            data[self.crawl_flag] = 0
            match_param = param.copy()
        data_list.append(data)
        rank += 1
    # Debug mode used to skip the actual insert:
    # if not is_debug:
    self.source_db.insert_batch_data(self.search_table, data_list)
    return param_list, match_param
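# Hedged sketch (an assumption, not the project's util.generator_id): the
# 64-character hexadecimal _id values seen above are consistent with a SHA-256
# digest over the extra fields plus the name/code and the province.
import hashlib
import json


def generator_id_sketch(extra, name, province):
    # Deterministic serialisation of the extra fields (e.g. {'priority': 1}),
    # followed by the name or code and the province pinyin.
    raw = json.dumps(extra, sort_keys=True) + (name or '') + (province or '')
    if isinstance(raw, unicode):
        raw = raw.encode('utf-8')
    return hashlib.sha256(raw).hexdigest()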
def main():
    log.info('开始读取数据...')
    source_table = 'zhuxiao_diaoxiao_company'
    target_table = 'offline_all_list'
    source_table_cursor = source_db.db[source_table].find(
        {},
        ['_id', 'province', 'registered_code', 'unified_social_credit_code'],
        no_cursor_timeout=True).batch_size(10000)
    cnt = 0
    insert_list = []
    count = 0
    real_insert_cnt = 0
    for item in source_table_cursor:
        count += 1
        company_name = item.get('_id')
        if company_name is None:
            continue
        province = item.get('province')
        registered_code = item.get('registered_code')
        unified_social_credit_code = item.get('unified_social_credit_code')
        province = get_province(province, registered_code, unified_social_credit_code)
        if province is None:
            log.error('计算省份信息失败: company = {company}'.format(company=company_name))
            continue
        data = {
            '_id': util.generator_id({}, company_name, province),
            'company_name': company_name,
            'province': province,
            'in_time': util.get_now_time(),
        }
        insert_list.append(data)
        cnt += 1
        real_insert_cnt += 1
        if cnt >= 10000:
            target_db.insert_batch_data(target_table, insert_list, insert=True)
            cnt = 0
            del insert_list[:]
            log.info('insert 10000')
            log.info('当前进度: count = {count} company = {company}'.format(
                count=count, company=company_name))
    if len(insert_list) > 0:
        target_db.insert_batch_data(target_table, insert_list, insert=True)
        log.info('insert last data')
    source_table_cursor.close()
    log.info('总共插入数据为: {cnt}'.format(cnt=real_insert_cnt))
    log.info('数据发送完毕, 退出程序')
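# Hedged sketch of the get_province helper referenced above (the real helper is
# defined elsewhere in the project; the fallback order and the
# province_code_to_py lookup table are assumptions):
def get_province_sketch(province, registered_code, unified_social_credit_code):
    # 1) An explicit Chinese province name wins.
    if province and province in province_zh_to_py:
        return province_zh_to_py[province]
    digits = None
    # 2) The 15-digit registration number starts with a 6-digit administrative
    #    division code; its first two digits identify the province.
    if registered_code and len(registered_code) == 15:
        digits = registered_code[:2]
    # 3) The 18-digit unified social credit code carries the same division code
    #    in characters 3-8.
    elif unified_social_credit_code and len(unified_social_credit_code) == 18:
        digits = unified_social_credit_code[2:4]
    # province_code_to_py would map e.g. '44' -> 'guangdong' (hypothetical table)
    return province_code_to_py.get(digits) if digits else None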
def task_run(self):
    result_list = []
    # Create the worker pool: a gevent coroutine pool normally, a thread pool
    # when running in debug mode
    if not is_debug:
        self.pool = gevent.pool.Pool(self.thread_num)
    else:
        self.pool = ThreadPool(processes=self.thread_num)
    self.log.info('当前开启协程数目: thread_num = {num}'.format(num=self.thread_num))
    self.log.info('province: {province}服务已开启, 等待消费数据'.format(province=self.province))
    count = 0
    start_run_time = time.time()
    while True:
        if not is_running:
            break
        job = self.beanstalk.reserve(self.tube, 3)
        if job is not None:
            count += 1
            body = job.body
            job.delete()
            self.log.info('当前消费数据索引: {count}'.format(count=count))
            json_data = util.json_loads(body)
            if json_data is None:
                self.log.error('数据格式错误: msg = {msg}'.format(msg=body))
                time.sleep(5)
                continue
            province = json_data.get('province')
            if province is None or province == '':
                self.log.error('没有province: {msg}'.format(msg=body))
                continue
            company_name = json_data.get('company_name')
            unified_social_credit_code = json_data.get('unified_social_credit_code')
            start_schedule_time = json_data.get('start_schedule_time', '')
            if company_name is None and unified_social_credit_code is None:
                self.log.error('没有company 与 unified_social_credit_code: {msg}'.format(msg=body))
                continue
            if company_name is not None and company_name == '':
                self.log.error('company = 空字符串, data = {data}'.format(data=body))
                continue
            if unified_social_credit_code is not None and unified_social_credit_code == '':
                self.log.error('unified_social_credit_code = 空字符串, data = {data}'.format(data=body))
                continue
            if province != self.province:
                self.log.warn('province 不正确: province = {province} data = {body}'.format(
                    province=self.province, body=body))
                continue
            if company_name is not None:
                self.log.info('当前消费数据为: province = {province} company = {company}'.format(
                    province=province, company=company_name))
            elif unified_social_credit_code is not None:
                self.log.info('当前消费数据为: province = {province} unified_social_credit_code = {code}'.format(
                    province=province, code=unified_social_credit_code))
            # Prefer the company name when both identifiers are present
            if company_name is not None:
                data = {
                    '_id': util.generator_id({}, company_name, province),
                    'company_name': company_name,
                    'province': province,
                    'in_time': util.get_now_time(),
                    'start_schedule_time': start_schedule_time,
                }
            else:
                data = {
                    '_id': util.generator_id({}, unified_social_credit_code, province),
                    'unified_social_credit_code': unified_social_credit_code.strip().upper(),
                    'province': province,
                    'in_time': util.get_now_time(),
                    'start_schedule_time': start_schedule_time,
                }
            pool_result = self.pool.apply_async(
                self.worker_list[self.province].query_online_task, args=(data,))
            result_list.append(pool_result)
            if len(result_list) >= 1000:
                for result in result_list:
                    result.get()
                del result_list[:]
        # If the maximum run time is reached, break so the service can restart
        run_time = time.time()
        if int(run_time) - int(start_run_time) >= self.MAX_RUN_TIME:
            break
    if is_debug:
        self.pool.close()
        self.pool.join()
    for result in result_list:
        result.get()
    del result_list[:]
    del result_list
    self.log.info('收到退出信号, 安全退出...')
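# Hedged example of the message shape this consumer expects on its tube (field
# names come from the checks above; the values are placeholders):
#   {"province": "guangdong", "company_name": "某某网络科技有限公司"}
# or
#   {"province": "guangdong", "unified_social_credit_code": "91440000XXXXXXXXXX"}
# The single-company injection script earlier in this section puts a compatible
# JSON payload onto 'gs_{province}_scheduler' via beanstalk.put(tube, data_str).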