def run(self):
    """Report how many duplicate records each entity has in ORGANIZE_FINASSIST.

    For every entity code in ``self.code_list`` the matching documents are
    fetched through ``self.get_data_from_mongo`` and fingerprinted with the
    MD5 of their ``ADDR_`` field, falling back to ``CONTENT_`` when ``ADDR_``
    is absent.  A document whose fingerprint was already seen for the same
    entity counts as a duplicate.  The per-entity total is kept in
    ``self.count`` and written to the log.

    :return: None
    """
    # Create the MongoDB client and resolve the collection to inspect.
    m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    try:
        # Walk the ENTITY_CODE_ list.
        for entity_code in self.code_list:
            self.count = 0
            # A set gives O(1) membership tests; the fingerprints of ADDR_
            # and CONTENT_ deliberately share one pool, as before.
            seen_hashes = set()

            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code)
            if not mongo_data_list:
                self.logger.warning("{} 无数据".format(entity_code))
                continue

            self.logger.warning("{} 查取成功".format(entity_code))
            self.logger.warning(
                "当前共有{}条".format(mongo_data_list.count()))

            for data in mongo_data_list:
                if "ADDR_" in data:
                    text = data["ADDR_"]
                elif "CONTENT_" in data:
                    text = data["CONTENT_"]
                else:
                    # Nothing to fingerprint on this document.
                    continue
                digest = hashlib.md5(text.encode("utf-8")).hexdigest()
                if digest in seen_hashes:
                    self.count += 1
                else:
                    seen_hashes.add(digest)

            # NOTE(review): the collapsed original does not show whether this
            # summary was also emitted for entities without data; it is
            # emitted only for entities that returned documents.
            self.logger.warning("重复数据{}条".format(self.count))
    finally:
        # Close the connection even when a query fails half-way.
        m_client.client_close()
    self.logger.handlers.clear()
def run(self):
    """Clean JSFUND_CCBDATA documents and upsert them into the Phoenix "FUND" table.

    For every entity code in ``self.code_list`` the module of the same name
    is imported and its ``data_shuffle(data, list_SUBS_STATUS, list_TYPE)``
    is applied to each MongoDB document.  The result (a dict or a list of
    dicts) is upserted row by row through
    ``PhoenixHbase.upsert_to_phoenix_by_one``.  Whenever the running success
    total is a multiple of 50, every id collected so far for the entity is
    flagged with ``{"d": 1}`` in MongoDB.

    Counters updated on ``self``: ``find_count``, ``success_count``,
    ``remove_count``, ``bad_count``.  A summary is logged at the end.

    The one-off drop/create-table statements that used to sit here as
    commented-out code are not part of this routine.

    :return: None
    """
    count = 0

    # Create the Phoenix client (mind the table name) and connect.
    p_client = PhoenixHbase(table_name="FUND")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used for querying.
    m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    # Code tables handed to every data_shuffle call.
    list_SUBS_STATUS = self.dict_from_mysql("FUND_SUBS_STATUS")
    list_TYPE = self.dict_from_mysql("FUND_TYPE")

    def forget(data_id, copy_data):
        """Drop a failed record from the bookkeeping lists.

        Idempotent on purpose: one document can yield several rows, and a
        second failing row must not raise ValueError from ``list.remove``.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        if copy_data in self.copy_mongo_data_list:
            self.copy_mongo_data_list.remove(copy_data)

    def flag_processed():
        """Mark every id collected so far with {"d": 1} in MongoDB."""
        update_count = m_client.update_to_mongodb(
            collection=collection, data_id=self.remove_id_list,
            data_dict={"d": 1})
        self.remove_count += update_count
        self.logger.info("MongoDB 更新成功")

    try:
        # Walk the ENTITY_CODE_ list.
        for entity_code in self.code_list:
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []

            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase.
            once_count = 0
            # Accumulate: the final summary reports the total for the run,
            # not just the last entity.
            self.find_count += mongo_data_list.count()

            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    re_data = module_name.data_shuffle(
                        data, list_SUBS_STATUS, list_TYPE)
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    forget(data_id, copy_data)
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue
                        try:
                            count += 1
                            print(count)
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=list_data)
                            once_count += success_count
                            self.success_count += success_count
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    success_count))
                            if self.success_count % 50 == 0:
                                flag_processed()
                        except Exception as e:
                            forget(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                elif isinstance(re_data, dict):
                    # re_data is known to be non-empty at this point.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        if self.success_count % 100 == 0:
                            self.logger.info(
                                "HBase 插入成功, 成功条数 {} 条".format(
                                    self.success_count))
                        # Add {d: 1}.
                        if self.success_count % 50 == 0:
                            flag_processed()
                    except Exception as e:
                        forget(data_id, copy_data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

            # NOTE(review): ids collected after the last multiple-of-50
            # flush are never flagged with {"d": 1}; confirm whether a final
            # flush per entity is wanted.
            if once_count > 0:
                self.logger.info(
                    "HBase 插入成功, 成功条数 {}".format(once_count))
    finally:
        # Close the connections even when an entity fails half-way.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
def run(self):
    """Clean financial-product documents and upsert them into Phoenix.

    Two hard-coded entities are handled: ``CHINANETFINANCIAL`` (read from the
    ``FINPRODUCT_FINASSIST`` collection) and ``JSFIN_CCBDATA`` (read from the
    collection of the same name through ``ScriptCCB``).  Each document is
    cleaned by the entity module and the result (a dict or a list of dicts)
    is upserted into the Phoenix table ``FINPRODUCT_FINASSIST``.

    Counters updated on ``self``: ``find_count``, ``success_count``,
    ``bad_count``.  A summary is logged at the end.

    The one-off drop/create-table/add-column statements and the disabled
    "delete from MongoDB / archive to spider_data_old" steps that used to sit
    here as commented-out code are not part of this routine.

    :return: None
    """
    # Create the Phoenix client and connect.
    p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used for querying.
    m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    def fetch(entity, module_name, find_id):
        """Point the client at the entity's collection and query it.

        The collection is re-resolved on every call so that a retry never
        runs against the collection of the previous entity.
        """
        if entity == "JSFIN_CCBDATA":
            m_client.mongo_collection = "JSFIN_CCBDATA"
            target = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            return module_name.ScriptCCB.get_data_from_mongo(
                self=self, m_client=m_client, collection=target,
                data_id=None)
        m_client.mongo_collection = "FINPRODUCT_FINASSIST"
        target = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        return self.get_data_from_mongo(
            m_client=m_client, collection=target, entity_code=entity,
            data_id=find_id)

    def forget(data_id, copy_data):
        """Drop a failed record from the bookkeeping lists.

        Works on the stored deep copy (``data`` itself may have been mutated
        by the cleaning step) and is idempotent, so a second failing row of
        the same document cannot raise ValueError from ``list.remove``.
        """
        if data_id in self.remove_id_list:
            self.remove_id_list.remove(data_id)
        if copy_data in self.copy_mongo_data_list:
            self.copy_mongo_data_list.remove(copy_data)

    try:
        for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
            module_name = __import__(entity)
            self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            find_id = None

            try:
                mongo_data_list = fetch(entity, module_name, find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                sleep(1)
                mongo_data_list = fetch(entity, module_name, find_id)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase.
            once_count = 0
            self.find_count += mongo_data_list.count()

            for data in mongo_data_list:
                data_id = data["_id"]
                copy_data = {}
                self.remove_id_list.append(data_id)
                try:
                    del data["_id"]
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)
                    if entity == "CHINANETFINANCIAL":
                        re_data = module_name.data_shuffle(
                            data=data,
                            sales_status=self.sales_status,
                            produc_category=self.produc_category,
                            revenue_type=self.revenue_type,
                            operaton_pattern=self.operaton_pattern,
                            purchase_amount=self.purchase_amount,
                            duration_type=self.duration_type)
                    elif entity == "JSFIN_CCBDATA":
                        re_data = module_name.ScriptCCB.data_shuffle(
                            self=self, data=data)
                    else:
                        re_data = module_name.data_shuffle(data)
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    forget(data_id, copy_data)
                    self.logger.warning(
                        "清洗错误,错误 _id 为{}, {}".format(data_id, e))
                    continue

                print(data_id)

                # Insert into Phoenix/HBase.
                if isinstance(re_data, dict):
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                    except Exception as e:
                        forget(data_id, copy_data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue
                elif isinstance(re_data, list):
                    for r_data in re_data:
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=r_data)
                            once_count += success_count
                            self.success_count += success_count
                        except Exception as e:
                            forget(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

            if once_count > 0:
                self.logger.info(
                    "HBase 插入成功, 成功条数 {}".format(once_count))

            # NOTE(review): the original stops after the first entity that
            # returned data; this looks like debugging residue - confirm
            # before removing.
            break
    finally:
        # Close the MongoDB connection even when an entity fails half-way.
        m_client.client_close()
        # NOTE(review): closing the Phoenix connection is disabled in the
        # original (p_client.close_client_phoenix); confirm whether that is
        # intentional.

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.handlers.clear()
class Statistics(object):
    """Count crawler records per entity in MongoDB and append them to a CSV.

    For every entity script found under ``scripts/<entity_type>`` the class
    counts the documents in the ``spider_url_temp``, ``spider_url_fixed`` and
    ``spider_data`` databases and appends one row per entity to
    ``statistics/<today>/count_for_<entity_type>.csv``.
    """

    # File names inside a scripts/<type> directory that are not entity
    # scripts.  The original code removed the literal "__init_____.py";
    # "__init__.py" is skipped as well because a package marker is not an
    # entity.
    _NON_ENTITY_FILES = ("__init_____.py", "__init__.py")

    def __init__(self, entity_type=None):
        """Initialise dates, paths, the MongoDB client and the CSV row template.

        :param entity_type: name of the sub-directory of ``scripts`` (and of
            the ``spider_data`` collection) to count; when falsy, ``run``
            iterates over every sub-directory of ``scripts``.
        """
        now = arrow.now()
        self.local_time = now.format("YYYY-MM-DD")
        self.hesternal_time = now.shift(days=-1).format("YYYY-MM-DD")
        self.entity_type = entity_type
        self.__base_path = os.path.abspath(os.path.dirname(__file__))
        self.__dir_path = self.__base_path + "/scripts/{}".format(
            self.entity_type)
        self.file_path = self.__base_path + "/statistics/{}".format(
            self.local_time)
        self.__type_list = list()
        self.__file_list = list()
        if self.entity_type:
            self.get_entity_code()
        self.mongo_client = MongoClient()
        # One CSV row; the keys double as the header line.
        self.name_dict = {
            "实体编码": "",
            "待爬数据": 0,
            "需爬取总量": 0,
            "现有数据": 0,
            "昨日爬取数据": 0
        }

    def get_entity_code(self):
        """Collect the script file names of the current entity type.

        Only the top level of the directory is read.  Stripping
        "CommonBidding_" and ".py" from a name yields its ENTITY_CODE_.
        A missing marker file no longer raises ValueError.

        :return: None
        """
        for _root, _dirs, files in os.walk(self.__dir_path):
            self.__file_list = [
                name for name in files
                if name not in self._NON_ENTITY_FILES
            ]
            break  # top level only

    def save_to_csv(self, file_path):
        """Append the current ``self.name_dict`` values as one CSV row.

        The header (the dict keys) is written first when the file does not
        exist yet; missing parent directories are created.

        :param file_path: path of the CSV file to append to
        :return: None
        """
        write_header = not os.path.exists(file_path)
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(file_path, "a", newline="", errors="ignore") as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(list(self.name_dict.keys()))
            writer.writerow(list(self.name_dict.values()))

    @staticmethod
    def _cursor_count(cursor):
        """Return ``cursor.count()``, or 0 when the cursor is falsy."""
        return cursor.count() if cursor else 0

    def _count_with_retry(self, fetch, *args, **kwargs):
        """Count the result of ``fetch(*args, **kwargs)``.

        On ``ServerSelectionTimeoutError`` the call is repeated once after a
        five second pause; a second failure propagates to the caller.
        """
        try:
            return self._cursor_count(fetch(*args, **kwargs))
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(5)
            return self._cursor_count(fetch(*args, **kwargs))

    def count_from_database(self):
        """Count each entity in the three crawler databases and save a CSV row.

        Databases read: "spider_url_temp" and "spider_url_fixed" (one
        collection per entity code) and "spider_data" (one collection per
        entity type).  The MongoDB client is closed afterwards.

        :return: None
        """
        client = self.mongo_client
        for file_name in self.__file_list:
            entity_code = file_name.replace("CommonBidding_", "")
            entity_code = entity_code.replace(".py", "")
            print(entity_code)
            self.name_dict["实体编码"] = entity_code

            # spider_url_temp: everything stored for this entity.
            temp_collection = client.client["spider_url_temp"][entity_code]
            self.name_dict["待爬数据"] = self._count_with_retry(
                client.all_from_mongodb, collection=temp_collection)

            # spider_url_fixed: everything stored for this entity.
            fixed_collection = client.client["spider_url_fixed"][entity_code]
            self.name_dict["需爬取总量"] = self._count_with_retry(
                client.all_from_mongodb, collection=fixed_collection)

            # spider_data: one collection per entity type.
            data_collection = client.client["spider_data"][self.entity_type]
            client.mongo_entity_code = entity_code
            date_range = [
                {"DATETIME_": {"$gte": self.hesternal_time}},
                {"DATETIME_": {"$lte": self.local_time}},
            ]
            if len(self.__file_list) == 1:
                # A single script: the whole collection is counted for it.
                fetch_all = client.all_from_mongodb
                clauses = date_range
            else:
                # Several scripts share the collection: search_from_mongodb
                # is used and the date query filters on ENTITY_CODE_.
                fetch_all = client.search_from_mongodb
                clauses = [{"ENTITY_CODE_": entity_code}] + date_range

            # All data of this entity.
            self.name_dict["现有数据"] = self._count_with_retry(
                fetch_all, collection=data_collection)
            # Data dated between yesterday and today.
            self.name_dict["昨日爬取数据"] = self._count_with_retry(
                data_collection.find, {"$and": clauses})

            file_path = self.file_path + "/count_for_{}.csv".format(
                self.entity_type)
            self.save_to_csv(file_path)

        # NOTE(review): run() calls this method once per entity type, so the
        # client is used again after this close - confirm that the wrapper
        # reconnects.
        self.mongo_client.client_close()

    def run(self):
        """Count the configured entity type, or every type under ``scripts``.

        :return: None
        """
        if self.entity_type:
            self.count_from_database()
            return

        for _root, dirs, _files in os.walk(self.__base_path + "/scripts"):
            # A missing "__pycache__" no longer raises ValueError.
            self.__type_list = [d for d in dirs if d != "__pycache__"]
            break  # top level only

        for _type in self.__type_list:
            print(_type)
            self.entity_type = _type
            self.__dir_path = self.__base_path + "/scripts/{}".format(_type)
            self.get_entity_code()
            self.count_from_database()
def run(self):
    """Move cleaned CommonBidding documents from MongoDB into Phoenix.

    For every script name in ``self.file_list`` the module is imported, the
    entity's documents are read from the ``CommonBidding`` collection,
    cleaned with ``module.data_shuffle`` followed by ``self.shuffle_data``
    and upserted into the Phoenix table ``CommonBidding``.  When at least one
    row was inserted for the entity, the processed documents are deleted
    from the source collection and their untouched copies are inserted into
    the ``spider_data_old`` database.

    Counters updated on ``self``: ``find_count``, ``remove_count``,
    ``old_count``.  A summary is logged at the end.

    The one-off drop/create-table/add-column statements that used to sit
    here as commented-out code are not part of this routine.

    :return: None
    """
    # Create the Phoenix client and connect.
    p_client = PhoenixHbase(table_name="CommonBidding")
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used for querying.
    m_client = MongoClient(mongo_collection="CommonBidding")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    # Create the MongoDB client for the spider_data_old archive.
    old_client = MongoClient(mongo_collection="CommonBidding")
    old_client.mongo_db = "spider_data_old"
    db_old, collection_list_old = old_client.client_to_mongodb()
    collection_old = db_old["CommonBidding"]

    try:
        # Walk the ENTITY_CODE_ list.
        for f in self.file_list:
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info(
                "开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            once_count = 0

            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code_mongo)

            # Clean the data and insert it into HBase.
            if mongo_data_list:
                try:
                    self.find_count += mongo_data_list.count()
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(1)
                    self.find_count += mongo_data_list.count()

                for data in mongo_data_list:
                    data_id = data["_id"]
                    self.remove_id_list.append(data_id)
                    del data["_id"]
                    # Deep copy of the source document, kept for the archive.
                    copy_data = deepcopy(data)
                    self.copy_mongo_data_list.append(copy_data)

                    # Clean the document.
                    try:
                        re_data = module_name.data_shuffle(data)
                        final_data = self.shuffle_data(re_data)
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning(
                            "清洗错误,错误 _id 为{}, {}".format(data_id, e))
                        continue

                    if not final_data:
                        continue

                    # Insert into Phoenix/HBase.
                    try:
                        p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=final_data)
                        once_count += 1
                    except Exception as e:
                        self.remove_id_list.remove(data_id)
                        self.copy_mongo_data_list.remove(copy_data)
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

                if once_count > 0:
                    self.logger.info(
                        "HBase 插入成功, 成功条数 {}".format(once_count))

            if once_count <= 0:
                self.logger.info("HBase 插入成功条数0条, 不执行删除")
                continue

            # Delete the processed documents from the source collection.
            delete_count = self.delete_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code_mongo,
                remove_id_list=self.remove_id_list)
            self.remove_count += delete_count

            # Insert the untouched copies into spider_data_old.
            try:
                old_client.mongo_db = "spider_data_old"
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except pymongo.errors.ServerSelectionTimeoutError as e:
                time.sleep(1)
                self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except Exception as e:
                self.logger.info(e)
    finally:
        # Close every connection, including the archive client that the
        # original never closed, even when an entity fails half-way.
        m_client.client_close()
        old_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.handlers.clear()
def run(self):
    """Clean NEWS_FINASSIST documents and upsert them into Phoenix.

    For every entity code in ``self.code_list`` the module of the same name
    is imported and its ``data_shuffle([document])`` is applied to each
    MongoDB document.  Rows of a list result are enriched through
    ``self.get_brief_from_ai`` (skipped for ``CNINFONEWS``), upserted into
    the Phoenix table ``NEWS_FINASSIST`` and the source document is then
    flagged with ``{"d": 1}`` in MongoDB.  A dict result is upserted as is.

    Counters updated on ``self``: ``find_count``, ``success_count``,
    ``remove_count``, ``bad_count``.  A summary is logged at the end.

    The one-off drop/create-table statements that used to sit here as
    commented-out code are not part of this routine.

    :return: None
    """
    # Create the Phoenix client and connect.
    p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used for querying.
    m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    try:
        # Walk the ENTITY_CODE_ list.
        for entity_code in self.code_list:
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # NOTE(review): hard-coded id passed as find_id for CAIJINGNEWS;
            # presumably a resume marker - confirm it is still wanted.
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""

            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code, find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client, collection=collection,
                    entity_code=entity_code, find_id=find_id)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase.
            once_count = 0
            # Accumulate: the final summary reports the total for the run,
            # not just the last entity.
            self.find_count += mongo_data_list.count()

            # The cursor is advanced by hand so that a timeout while
            # fetching can be retried; the range bounds the number of
            # attempts (fetches plus retries) per entity.
            for _ in range(1000000):
                try:
                    data = next(mongo_data_list)
                except pymongo.errors.ServerSelectionTimeoutError:
                    continue
                except StopIteration:
                    break

                data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "running on data_id: {}".format(data_id))

                try:
                    del data["_id"]
                    re_data = module_name.data_shuffle([data])
                    if not re_data:
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.logger.warning(
                        "清洗错误,错误 _id 为{}, {}".format(data_id, e))
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue

                        try:
                            if entity_code != "CNINFONEWS":
                                ai_data = self.get_brief_from_ai(
                                    data=list_data)
                            else:
                                ai_data = list_data
                        except Exception as e:
                            # The message needs a placeholder for the error,
                            # and the fallback is the current row, not the
                            # whole result list.
                            self.logger.info(
                                "AI 调取失败, 错误信息 {}".format(e))
                            ai_data = list_data

                        # Insert into Phoenix/HBase.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=ai_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 10 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数{}条".format(
                                        once_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                        # Add {d: 1} to the source document.
                        try:
                            update_count = m_client.update_to_mongodb(
                                collection=collection, data_id=data_id,
                                data_dict={"d": 1})
                            self.remove_count += update_count
                            if self.remove_count % 10 == 0:
                                self.logger.info(
                                    "MongoDB 更新成功, 成功条数 {} 条".format(
                                        "10"))
                        except Exception as e:
                            self.logger.warning(
                                "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue
                elif isinstance(re_data, dict):
                    # re_data is known to be non-empty at this point.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

            if once_count > 0:
                self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                    entity_code, once_count))
            mongo_data_list.close()
    finally:
        # Close the connections even when an entity fails half-way.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
def run(self):
    """Rebuild the Phoenix ``WEIBOBASICINFO`` table from MongoDB.

    Steps, in order:

    1. Connect to Phoenix and to the ``WEIBOBASICINFO`` Mongo collection.
    2. Drop and re-create the Phoenix table (this happens on every run).
    3. Read every document returned by ``m_client.all_from_mongodb``,
       clean it with ``self.data_shuffle`` and upsert it one row at a time.
    4. Close both connections and log the run counters.

    Side effects on ``self``: ``find_count``, ``success_count`` and
    ``bad_count`` are updated; ``remove_id_list`` and
    ``copy_mongo_data_list`` end up holding the ids / copies of the
    documents that were not rejected by an exception.

    NOTE: the follow-up steps that deleted migrated documents from MongoDB
    and archived the copies into ``spider_data_old`` are currently
    disabled; the two tracking lists are still maintained for them.

    Raises:
        SystemExit: via ``quit()`` when MongoDB returns no data. The
            connections are closed first.
    """
    # Create the Phoenix client and connect.
    p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used to read the source collection.
    m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    try:
        # Load the area code tables; only the province list is used here.
        province_list, _city_list, _area_list, _dir_area_list = (
            GenericScript(entity_code=None,
                          entity_type=None).area_from_mysql())

        # Drop the table, then create it again from scratch.
        p_client.drop_table_phoenix(connection=connection)
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        self.logger.info("开始进行 WEIBOBASICINFO")

        # Fetch everything; retry once after a short pause on a timeout.
        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        if not mongo_data_list:
            # Nothing to migrate: stop the script (the finally clause
            # below still closes the connections).
            quit()

        # Clean each document and upsert it into HBase.
        self.find_count = mongo_data_list.count()
        for data in mongo_data_list:
            data_id = data["_id"]
            self.remove_id_list.append(data_id)
            copy_data = None
            copy_tracked = False
            try:
                del data["_id"]
                copy_data = deepcopy(data)
                self.copy_mongo_data_list.append(copy_data)
                copy_tracked = True
                re_data = self.data_shuffle(data=data,
                                            province_list=province_list)
            except Exception as e:
                # Undo the tracking for this document. The copy is only
                # removed if it was actually appended, otherwise
                # list.remove would raise inside this handler.
                self.remove_id_list.remove(data_id)
                if copy_tracked:
                    self.copy_mongo_data_list.remove(copy_data)
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                # Skip the upsert: there is no cleaned record to write.
                continue

            if not re_data:
                # The cleaner rejected the document; count it as bad data.
                self.bad_count += 1
                continue

            # Upsert the cleaned record into Phoenix/HBase.
            try:
                success_count = p_client.upsert_to_phoenix_by_one(
                    connection=connection, data=re_data)
                self.success_count += success_count
            except Exception as e:
                self.remove_id_list.remove(data_id)
                self.copy_mongo_data_list.remove(copy_data)
                self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                    data_id, e))

        if self.success_count > 0:
            self.logger.info("HBase 插入成功, 成功条数 {}".format(
                self.success_count))
    finally:
        # Close the connections on every exit path, including quit() and
        # unexpected exceptions.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
def run(self):
    """Migrate ``ORGANIZE_FINASSIST`` documents from MongoDB into Phoenix.

    For every entity code in ``self.code_list``:

    1. Import the module named after the entity code and use its
       ``data_shuffle(data, province_list, city_list, area_list)`` to
       clean each Mongo document. The result may be a dict (one record)
       or a list of records.
    2. Pass each record through ``self.shuffle_for_area`` (the log
       messages describe this step as fetching latitude/longitude).
    3. Upsert each non-empty result into Phoenix one row at a time.

    Side effects on ``self``: ``remove_id_list``,
    ``copy_mongo_data_list`` and ``branch_code_list`` are reset per
    entity code; ``find_count`` is set to the size of the latest cursor;
    ``success_count`` and ``bad_count`` are accumulated.

    Both connections are closed on every exit path, then the run
    counters are logged.

    NOTE: table (re)creation, marking documents with ``{"d": 1}``,
    deleting migrated documents and archiving into ``spider_data_old``
    are currently disabled in this script.
    """
    # Create the Phoenix client and connect.
    p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
    p_client.verify_list = self.verify_list
    connection = p_client.connect_to_phoenix()

    # Create the MongoDB client used to read the source collection.
    m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    try:
        # Load the area code tables handed to every cleaner module.
        province_list, city_list, area_list, _dir_area_list = (
            GenericScript(
                entity_code=None,
                entity_type="ORGANIZE_FINASSIST").area_from_mysql())

        # Walk the ENTITY_CODE_ list.
        for entity_code in self.code_list:
            # Each entity code has a same-named module with data_shuffle().
            cleaner = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []

            # NOTE(review): hard-coded _id for ECITICORGANIZE, presumably a
            # resume point for get_data_from_mongo - confirm before reuse.
            find_id = ("5c3f48479bb3df1d97d762e1"
                       if entity_code == "ECITICORGANIZE" else None)

            # Query Mongo; retry once after a short pause on a timeout.
            query = dict(m_client=m_client, collection=collection,
                         entity_code=entity_code, data_id=find_id)
            try:
                mongo_data_list = self.get_data_from_mongo(**query)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(**query)

            if not mongo_data_list:
                continue

            # Clean the data and upsert it into HBase.
            once_count = 0
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                try:
                    del data["_id"]
                    re_data = cleaner.data_shuffle(
                        data, province_list, city_list, area_list)
                except Exception as e:
                    self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                if not re_data:
                    # The cleaner rejected the document.
                    self.bad_count += 1
                    continue
                self.logger.debug("processing _id %s", data_id)

                # Normalise both result shapes to a list of records so a
                # single code path handles them. The two error templates
                # differ only in spacing and are kept byte-for-byte so the
                # emitted log text does not change.
                if isinstance(re_data, list):
                    records = re_data
                    geo_error = "_id:{} 获取经纬度失败, {}"
                elif isinstance(re_data, dict):
                    records = [re_data]
                    geo_error = "_id: {}获取经纬度失败, {}"
                else:
                    records = []
                    geo_error = ""

                for record in records:
                    try:
                        area_data = self.shuffle_for_area(record)
                    except Exception as e:
                        self.logger.exception(
                            geo_error.format(data_id, e))
                        continue
                    if not area_data:
                        continue

                    # Upsert the record into Phoenix/HBase.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=area_data)
                        once_count += success_count
                        self.success_count += success_count
                    except Exception as e:
                        self.logger.exception(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))

                # Periodic progress report.
                if self.success_count % 100 == 0:
                    self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                        self.success_count))

            mongo_data_list.close()

            if once_count > 0:
                self.logger.info(
                    "HBase 插入成功, 成功条数 {}".format(once_count))
    finally:
        # Close the connections even when an entity module fails to
        # import or a query/cursor call raises.
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)

    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()