class Statistics(object):
    """Collect per-entity crawl statistics from MongoDB and append them to a
    daily CSV file under ``statistics/<YYYY-MM-DD>/``.
    """

    def __init__(self, entity_type=None):
        """Initialise date strings, paths and the Mongo client.

        :param entity_type: entity type (name of a sub-directory of
            ``scripts/``); when falsy, ``run()`` iterates every type found
            under ``scripts/`` instead.
        """
        t = arrow.now()
        self.local_time = t.format("YYYY-MM-DD")                      # today
        self.hesternal_time = t.shift(days=-1).format("YYYY-MM-DD")   # yesterday
        self.entity_type = entity_type
        self.__base_path = os.path.abspath(os.path.dirname(__file__))
        self.__dir_path = self.__base_path + "/scripts/{}".format(
            self.entity_type)
        self.file_path = self.__base_path + "/statistics/{}".format(
            self.local_time)
        self.__type_list = list()
        self.__file_list = list()
        if self.entity_type:
            self.get_entity_code()
        self.mongo_client = MongoClient()
        # One CSV row per entity; the keys double as the CSV header
        # (entity code / pending URLs / total URLs / existing data /
        # data crawled yesterday).  Keys are runtime values — do not rename.
        self.name_dict = {
            "实体编码": "",
            "待爬数据": 0,
            "需爬取总量": 0,
            "现有数据": 0,
            "昨日爬取数据": 0
        }

    def get_entity_code(self):
        """Populate ``self.__file_list`` with the script file names found
        directly under the entity-type directory (each file name minus the
        "CommonBidding_" prefix and ".py" suffix is an entity code).
        """
        for root, dirs, files in os.walk(self.__dir_path):
            self.__file_list = files
            # NOTE(review): this literal has five underscores after "init" —
            # confirm it matches the package-marker file actually present in
            # the directory ("__init__.py" would be the usual name).
            self.__file_list.remove("__init_____.py")
            break  # only the top level of the directory is wanted

    def save_to_csv(self, file_path):
        """Append the current ``name_dict`` row to *file_path*, writing the
        header row first when the file does not exist yet.

        Creates the daily statistics directory on demand.

        :param file_path: full path of the CSV file.
        """
        header = list(self.name_dict.keys())
        row = list(self.name_dict.values())
        if os.path.exists(file_path):
            self.__append_rows(file_path, [row])
        else:
            try:
                self.__append_rows(file_path, [header, row])
            except FileNotFoundError:
                # The daily statistics directory does not exist yet.
                os.makedirs(self.file_path)
                self.__append_rows(file_path, [header, row])

    @staticmethod
    def __append_rows(file_path, rows):
        """Append *rows* (a list of lists) to the CSV at *file_path*."""
        with open(file_path, "a", newline="", errors="ignore") as f:
            csv.writer(f).writerows(rows)

    def __count_with_retry(self, fetch):
        """Return ``cursor.count()`` for the cursor produced by the zero-arg
        callable *fetch*, or 0 for a falsy cursor.  Retries exactly once,
        after a 5 s pause, on a Mongo server-selection timeout.
        """
        try:
            cursor = fetch()
            return cursor.count() if cursor else 0
        except pymongo.errors.ServerSelectionTimeoutError:
            time.sleep(5)
            cursor = fetch()
            return cursor.count() if cursor else 0

    def count_from_database(self):
        """For each entity code, count the documents in "spider_url_temp",
        "spider_url_fixed" and "spider_data" (filtered to that entity) and
        append one row per entity to the daily CSV.
        """
        # Shared "yesterday <= DATETIME_ <= today" clauses.
        yesterday_range = [
            {"DATETIME_": {"$gte": self.hesternal_time}},
            {"DATETIME_": {"$lte": self.local_time}},
        ]
        for entity_code in self.__file_list:
            entity_code = entity_code.replace("CommonBidding_", "")
            entity_code = entity_code.replace(".py", "")
            print(entity_code)
            self.name_dict["实体编码"] = entity_code

            # spider_url_temp: URLs still waiting to be crawled.
            collection = self.mongo_client.client["spider_url_temp"][entity_code]
            self.name_dict["待爬数据"] = self.__count_with_retry(
                lambda: self.mongo_client.all_from_mongodb(
                    collection=collection))

            # spider_url_fixed: total number of URLs to crawl.
            collection = self.mongo_client.client["spider_url_fixed"][entity_code]
            self.name_dict["需爬取总量"] = self.__count_with_retry(
                lambda: self.mongo_client.all_from_mongodb(
                    collection=collection))

            # spider_data: one collection per entity *type*, shared by all
            # entity codes of that type.
            collection = self.mongo_client.client["spider_data"][self.entity_type]
            self.mongo_client.mongo_entity_code = entity_code
            if len(self.__file_list) == 1:
                # Single script: the whole collection belongs to this entity.
                self.name_dict["现有数据"] = self.__count_with_retry(
                    lambda: self.mongo_client.all_from_mongodb(
                        collection=collection))
                self.name_dict["昨日爬取数据"] = self.__count_with_retry(
                    lambda: collection.find({"$and": yesterday_range}))
            else:
                # Several scripts share the collection: filter by entity code.
                self.name_dict["现有数据"] = self.__count_with_retry(
                    lambda: self.mongo_client.search_from_mongodb(
                        collection=collection))
                self.name_dict["昨日爬取数据"] = self.__count_with_retry(
                    lambda: collection.find({
                        "$and": [{"ENTITY_CODE_": entity_code}] + yesterday_range
                    }))

            file_path = self.file_path + "/count_for_{}.csv".format(
                self.entity_type)
            self.save_to_csv(file_path)
        self.mongo_client.client_close()

    def run(self):
        """Entry point: produce statistics for ``self.entity_type`` when it
        was given, otherwise for every entity type found under ``scripts/``.
        """
        if self.entity_type:
            self.count_from_database()
        else:
            for root, dirs, files in os.walk(self.__base_path + "/scripts"):
                self.__type_list = dirs
                self.__type_list.remove("__pycache__")
                break  # only the top level of scripts/
            for _type in self.__type_list:
                print(_type)
                self.entity_type = _type
                self.__dir_path = self.__base_path + "/scripts/{}".format(
                    _type)
                self.get_entity_code()
                self.count_from_database()
class MapBarTransfer(object):
    """Transfer MapBar branch records from MongoDB into HBase via Phoenix,
    cleaning phone/date fields and geocoding the address on the way.
    """

    def __init__(self,
                 table_name="CHA_BRANCH_MAPBAR",
                 collection_name="mapbar"):
        """Open the Phoenix and MongoDB connections.

        :param table_name: target Phoenix/HBase table name.
        :param collection_name: source MongoDB collection name.
        """
        # Phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()
        # MongoDB connection (host/port hard-coded, overriding the wrapper's
        # own client with a raw pymongo one).
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = "172.22.69.35"
        self.m_client.mongo_port = 20000
        # NOTE(review): serverSelectionTimeoutMS/connectTimeoutMS are in
        # milliseconds — 60 ms is extremely short; 60000 was presumably
        # intended.  Confirm before changing.
        self.m_client.client = pymongo.MongoClient(host="172.22.69.35",
                                                   port=20000,
                                                   serverSelectionTimeoutMS=60,
                                                   connectTimeoutMS=60,
                                                   connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)
        # Log
        self.logger = Logger().logger
        # Running insert counter used by check_lat()
        self.count = 0

    def main(self):
        """Stream documents from MongoDB, clean them, geocode the address
        and upsert each record into Phoenix/HBase.  Documents that fail any
        stage are logged and skipped.
        """
        # # 创建表
        # table_sql = (f'create table "{self.p_client.table_name}" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        # Fetch the source data (resuming from a fixed DEALTIME_/_id
        # checkpoint rather than scanning the whole collection).
        # mongo_data_list = self.m_client.all_from_mongodb(collection=self.collection)
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")
        count = 0
        while True:
            # Pull one document; retry once on a server-selection timeout.
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(3)
                data = mongo_data_list.__next__()
            # Clean: strip the "无," (none) placeholder from the phone field
            # and normalise "YYYY年M月D日" dates to zero-padded "YYYY-MM-DD".
            try:
                data["PHONE_"] = data["PHONE_"].replace("无,", "")
                u_time_list = re.findall(r"(\d{4}年\d{1,2}月\d{1,2})日",
                                         data["UPDATETIME_"])
                if u_time_list:
                    u_ = u_time_list[0].replace("年", "-")
                    u_ = u_.replace("月", "-")
                    u_l = u_.split("-")
                    if len(u_l[1]) == 1:
                        u_l[1] = "0" + u_l[1]
                    if len(u_l[2]) == 1:
                        u_l[2] = "0" + u_l[2]
                    data["UPDATETIME_"] = "-".join(u_l)
            except Exception as e:
                self.logger.exception(f"数据清洗出错, _id: {data['_id']}, error {e}")
                continue
            # Geocode: drop the part of ADDRESS_ before the first "|" and
            # look up lat/lng; records without an address are skipped.
            try:
                if data["ADDRESS_"]:
                    data["ADDRESS_"] = "".join(data["ADDRESS_"].split("|")[1:])
                    location_result = get_lat_lng(address=data["ADDRESS_"])
                    if location_result["status"] == 0:
                        data["LNG_"] = str(
                            location_result["result"]["location"]["lng"])
                        data["LAT_"] = str(
                            location_result["result"]["location"]["lat"])
                    else:
                        self.logger.warning(f"_id: {data['_id']} 获取经纬度失败")
                else:
                    continue
            except Exception as e:
                self.logger.exception(
                    f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                continue
            # Upsert to HBase (after range-checking the coordinates).
            try:
                re_data = self.__check_lat(data=data)
                # Insert a single row into HBase.
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=re_data)
                count += 1
                if count % 100 == 0:
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
            except Exception as e:
                self.logger.exception(
                    f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                continue
        # Close the MongoDB cursor.
        mongo_data_list.close()
        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def check_lat(self):
        """One-off migration helper: copy every row from the Phoenix table
        "FANSILE" into "CHA_BRANCH_MAPBAR" (the coordinate-validation steps
        are currently commented out).
        """
        # # 删除表
        # self.p_client.drop_table_phoenix(connection=self.connection, table_name="CHA_BRANCH_MAPBAR")
        #
        # table_sql = (f'create table "CHA_BRANCH_MAPBAR" ("ID_" varchar primary key,'
        #              '"C"."BTYPE_" varchar, "C"."TYPE_" varchar, "C"."NAME_" varchar, "C"."UPDATETIME_" varchar,'
        #              '"C"."ADDRESS_" varchar, "C"."POINAME_" varchar, "C"."PHONE_" varchar, "C"."BUSSTOP_" varchar,'
        #              '"C"."BUS_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."LAT_" varchar, "C"."LNG_" varchar,'
        #              '"C"."CHECK_LAT_" varchar, "C"."CHECK_LNG_" varchar'
        #              ') IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)
        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"
        while True:
            try:
                data = data_cursor.__next__()
                # del data["('C', 'CHECK_LNG_')"]
                # if not data["LAT_"]:
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864:
                #     pass
                # else:
                #     self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}")
                #     data["CHECK_LAT_"] = data["LAT_"]
                #     data["CHECK_LNG_"] = data["LNG_"]
                #     data["LAT_"] = ""
                #     data["LNG_"] = ""
                #     # self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443:
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                # else:
                #     self.logger.warning(f"错误 _id: {data['ID_']}, 经纬度: {data['LAT_']},{data['LNG_']}")
                #     data["CHECK_LAT_"] = data["LAT_"]
                #     data["CHECK_LNG_"] = data["LNG_"]
                #     data["LAT_"] = ""
                #     data["LNG_"] = ""
                #     self.p_client.upsert_to_phoenix_by_one(connection=self.connection, data=data)
                #     continue
                self.p_client.upsert_to_phoenix_by_one(
                    connection=self.connection, data=data)
                self.count += 1
                if self.count % 100 == 0:
                    # NOTE(review): Phoenix rows are keyed "ID_" (see the
                    # commented code above), so data['_id'] here would likely
                    # raise KeyError — confirm the cursor's dict keys.
                    self.logger.info(
                        f"HBase 插入成功, _id: {data['_id']}, 成功条数 {self.count} 条")
            except StopIteration:
                break

    def __check_lat(self, data):
        """Range-check LAT_/LNG_ against the Beijing bounding box; values
        outside it are moved to CHECK_LAT_/CHECK_LNG_ and blanked.

        :param data: record dict (mutated in place).
        :return: the same dict.
        """
        # Records with no LAT_ key pass through untouched (LNG_ unchecked).
        if "LAT_" not in data:
            return data
        # 上海 (Shanghai latitude band, kept for reference)
        # if 30.7083860773 < float(data["LAT_"]) < 31.8739003864:
        # 北京 (Beijing latitude band)
        if 39.4498800000 < float(data["LAT_"]) < 41.1684980000:
            pass
        else:
            self.logger.warning(
                f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
            data["CHECK_LAT_"] = data["LAT_"]
            data["CHECK_LNG_"] = data["LNG_"]
            data["LAT_"] = ""
            data["LNG_"] = ""
            return data
        # 上海 (Shanghai longitude band, kept for reference)
        # if 120.8778122800 < float(data["LNG_"]) < 122.1248433443:
        # 北京 (Beijing longitude band)
        if 115.4534230000 < float(data["LNG_"]) < 117.5461160000:
            return data
        else:
            self.logger.warning(
                f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
            data["CHECK_LAT_"] = data["LAT_"]
            data["CHECK_LNG_"] = data["LNG_"]
            data["LAT_"] = ""
            data["LNG_"] = ""
            return data