Exemple #1
0
class Statistics(object):
    """Count crawl volumes per entity in MongoDB and append them to a CSV.

    For every spider script found under ``scripts/<entity_type>`` the class
    counts documents in the ``spider_url_temp``, ``spider_url_fixed`` and
    ``spider_data`` databases and appends one row per entity to
    ``statistics/<today>/count_for_<entity_type>.csv``.
    """

    # Names that sit next to the spider scripts / type directories but do not
    # represent an entity or an entity type.
    _NON_ENTITY_FILE = "__init_____.py"
    _NON_TYPE_DIR = "__pycache__"

    def __init__(self, entity_type=None):
        """Prepare dates, paths, the script list and the MongoDB wrapper.

        :param entity_type: name of one sub-directory of ``scripts`` to
            process; when falsy, :meth:`run` processes every sub-directory.
        """
        now = arrow.now()
        # Both dates are kept as "YYYY-MM-DD" strings; they are later used as
        # the bounds of the "yesterday" query on DATETIME_.
        self.local_time = now.format("YYYY-MM-DD")
        self.hesternal_time = now.shift(days=-1).format("YYYY-MM-DD")
        self.entity_type = entity_type
        self.__base_path = os.path.abspath(os.path.dirname(__file__))
        self.__dir_path = self.__base_path + "/scripts/{}".format(
            self.entity_type)
        # Output directory: one folder per day.
        self.file_path = self.__base_path + "/statistics/{}".format(
            self.local_time)
        self.__type_list = list()
        self.__file_list = list()
        if self.entity_type:
            self.get_entity_code()
        # Wrapper object; this class uses its ``client`` attribute and its
        # ``all_from_mongodb`` / ``search_from_mongodb`` / ``client_close``
        # methods.
        self.mongo_client = MongoClient()
        # One CSV row. Keys double as the CSV header, in this order:
        # entity code, pending URLs, total URLs to crawl, stored documents,
        # documents crawled yesterday.
        self.name_dict = {
            "实体编码": "",
            "待爬数据": 0,
            "需爬取总量": 0,
            "现有数据": 0,
            "昨日爬取数据": 0
        }

    def get_entity_code(self):
        """Load the script file names found directly in the type directory.

        Stripping ``"CommonBidding_"`` and ``".py"`` from a file name yields
        the entity code (done in :meth:`count_from_database`).

        :return: None; the list is stored on the instance.
        """
        # Only the top level of the directory matters. A missing or empty
        # directory yields an empty list instead of keeping a stale one.
        top_level = next(os.walk(self.__dir_path), None)
        files = top_level[2] if top_level else []
        # Filtering (rather than list.remove) does not raise when the
        # package marker file is absent.
        self.__file_list = [
            name for name in files if name != self._NON_ENTITY_FILE
        ]

    def save_to_csv(self, file_path):
        """Append the current ``name_dict`` values to a CSV file.

        A header row (the ``name_dict`` keys) is written first when the file
        does not exist yet. Missing parent directories are created.

        :param file_path: path of the CSV file to append to.
        :return: None
        """
        write_header = not os.path.exists(file_path)
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            # Create the directory the file actually lives in, so the write
            # below cannot fail with FileNotFoundError.
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, "a", newline="", errors="ignore") as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(list(self.name_dict.keys()))
            writer.writerow(list(self.name_dict.values()))

    def _count_with_retry(self, fetch):
        """Run ``fetch`` and count its result, retrying once on a timeout.

        :param fetch: zero-argument callable returning a cursor-like object
            (anything with ``count()``) or a falsy value.
        :return: ``cursor.count()``, or 0 when ``fetch`` returned a falsy
            value.
        :raises pymongo.errors.ServerSelectionTimeoutError: if the single
            retry times out as well.
        """

        def attempt():
            cursor = fetch()
            return cursor.count() if cursor else 0

        try:
            return attempt()
        except pymongo.errors.ServerSelectionTimeoutError:
            # Give the server a moment, then try exactly once more.
            time.sleep(5)
            return attempt()

    def count_from_database(self):
        """Count documents for every entity of the current type and save them.

        For each script file the entity code is derived from the file name,
        then the counts are taken from ``spider_url_temp``,
        ``spider_url_fixed`` and ``spider_data`` and appended to the CSV of
        the current entity type. Yesterday's count is only collected for
        ``spider_data``.

        The MongoDB client is closed at the end.
        NOTE(review): :meth:`run` calls this method once per entity type, so
        later calls use the client after ``client_close()`` -- confirm the
        wrapper reconnects on demand.

        :return: None
        """
        for file_name in self.__file_list:
            entity_code = file_name.replace("CommonBidding_", "")
            entity_code = entity_code.replace(".py", "")
            print(entity_code)
            self.name_dict["实体编码"] = entity_code

            # spider_url_temp: every document of this entity.
            temp_collection = self.mongo_client.client["spider_url_temp"][
                entity_code]
            self.name_dict["待爬数据"] = self._count_with_retry(
                lambda: self.mongo_client.all_from_mongodb(
                    collection=temp_collection))

            # spider_url_fixed: every document of this entity.
            fixed_collection = self.mongo_client.client["spider_url_fixed"][
                entity_code]
            self.name_dict["需爬取总量"] = self._count_with_retry(
                lambda: self.mongo_client.all_from_mongodb(
                    collection=fixed_collection))

            # spider_data: one collection per entity type.
            data_collection = self.mongo_client.client["spider_data"][
                self.entity_type]
            self.mongo_client.mongo_entity_code = entity_code
            yesterday_clauses = [
                {"DATETIME_": {"$gte": self.hesternal_time}},
                {"DATETIME_": {"$lte": self.local_time}},
            ]
            if len(self.__file_list) == 1:
                # A single script for this type: count the whole collection.
                fetch_all = self.mongo_client.all_from_mongodb
            else:
                # Several scripts for this type: go through the wrapper's
                # search method (mongo_entity_code was set above) and
                # restrict the "yesterday" query to this entity.
                fetch_all = self.mongo_client.search_from_mongodb
                yesterday_clauses = (
                    [{"ENTITY_CODE_": entity_code}] + yesterday_clauses)

            # All stored documents of this entity.
            self.name_dict["现有数据"] = self._count_with_retry(
                lambda: fetch_all(collection=data_collection))
            # Documents of this entity dated between yesterday and today.
            self.name_dict["昨日爬取数据"] = self._count_with_retry(
                lambda: data_collection.find({"$and": yesterday_clauses}))

            file_path = self.file_path + "/count_for_{}.csv".format(
                self.entity_type)
            self.save_to_csv(file_path)
        self.mongo_client.client_close()

    def run(self):
        """Produce the statistics for one entity type or for all of them.

        With an ``entity_type`` given at construction only that type is
        processed; otherwise every direct sub-directory of ``scripts``
        (except ``__pycache__``) is treated as an entity type.

        :return: None
        """
        if self.entity_type:
            self.count_from_database()
            return

        # Only the direct children of "scripts" are entity types.
        top_level = next(os.walk(self.__base_path + "/scripts"), None)
        dirs = top_level[1] if top_level else []
        # Filtering does not raise when there is no __pycache__ directory.
        self.__type_list = [
            name for name in dirs if name != self._NON_TYPE_DIR
        ]
        for _type in self.__type_list:
            print(_type)
            self.entity_type = _type
            self.__dir_path = self.__base_path + "/scripts/{}".format(_type)
            self.get_entity_code()
            self.count_from_database()
Exemple #2
0
class MapBarTransfer(object):
    """Move MapBar branch records from MongoDB into HBase through Phoenix.

    :meth:`main` reads records from MongoDB, cleans them, looks up their
    coordinates and upserts them into the Phoenix table. :meth:`check_lat`
    copies every row of the ``FANSILE`` table into ``CHA_BRANCH_MAPBAR``.
    """

    # Accepted coordinate window (Beijing). The Shanghai window used before
    # was 30.7083860773-31.8739003864 / 120.8778122800-122.1248433443.
    _LAT_RANGE = (39.4498800000, 41.1684980000)
    _LNG_RANGE = (115.4534230000, 117.5461160000)

    def __init__(self,
                 table_name="CHA_BRANCH_MAPBAR",
                 collection_name="mapbar"):
        """Open the Phoenix and MongoDB connections and set up logging.

        :param table_name: Phoenix table the records are written to.
        :param collection_name: MongoDB collection the records are read from.
        """
        # phoenix connection
        self.p_client = PhoenixHbase(table_name=table_name)
        self.connection = self.p_client.connect_to_phoenix()
        # MongoDB connection
        self.m_client = MongoClient(mongo_collection=collection_name,
                                    entity_code="MAPBAR_DEATAIL_BJ")
        self.m_client.mongo_host = "172.22.69.35"
        self.m_client.mongo_port = 20000
        # NOTE(review): both pymongo timeouts are in milliseconds, so 60 is
        # only 60 ms -- confirm this is not meant to be 60 seconds.
        self.m_client.client = pymongo.MongoClient(host="172.22.69.35",
                                                   port=20000,
                                                   serverSelectionTimeoutMS=60,
                                                   connectTimeoutMS=60,
                                                   connect=False)
        self.db, self.collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=self.db, collection_list=self.collection_list)
        # Log
        self.logger = Logger().logger
        # Number of rows written by check_lat().
        self.count = 0

    @staticmethod
    def _normalize_update_time(raw):
        """Turn the first ``YYYY年M月D日`` date in ``raw`` into ``YYYY-MM-DD``.

        :param raw: text that may contain a Chinese-formatted date.
        :return: the zero-padded ISO-style date, or ``raw`` unchanged when it
            contains no such date.
        """
        match = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日", raw)
        if match is None:
            return raw
        year, month, day = match.groups()
        return "-".join((year, month.zfill(2), day.zfill(2)))

    def main(self):
        """Clean, geocode and upsert the selected MongoDB records.

        A record that fails cleaning, geocoding or the upsert is logged and
        skipped; a record without an address is skipped silently. The MongoDB
        cursor is closed even when an unexpected error ends the loop early.

        :return: None
        """
        # Resume point: only records handled after this DEALTIME_ / _id.
        mongo_data_list = self.m_client.search_from_mongodb(
            collection=self.collection,
            field_name="DEALTIME_",
            field_value={"$gt": "1555136656.0579224"},
            data_id="5cb65fac9bb3df61a09c6625")

        count = 0
        try:
            while True:
                # Fetch one record; retry once after a server timeout.
                try:
                    data = next(mongo_data_list)
                except StopIteration:
                    break
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(3)
                    try:
                        data = next(mongo_data_list)
                    except StopIteration:
                        # The cursor may be exhausted on the retry as well.
                        break

                # Clean the phone number and normalise the update date.
                try:
                    data["PHONE_"] = data["PHONE_"].replace("无,", "")
                    data["UPDATETIME_"] = self._normalize_update_time(
                        data["UPDATETIME_"])
                except Exception as e:
                    self.logger.exception(
                        f"数据清洗出错, _id: {data['_id']}, error {e}")
                    continue

                # Look up latitude / longitude for the address.
                try:
                    if data["ADDRESS_"]:
                        # Drop the first "|"-separated segment and join the
                        # remaining segments into one address string.
                        data["ADDRESS_"] = "".join(
                            data["ADDRESS_"].split("|")[1:])
                        location_result = get_lat_lng(address=data["ADDRESS_"])
                        if location_result["status"] == 0:
                            data["LNG_"] = str(
                                location_result["result"]["location"]["lng"])
                            data["LAT_"] = str(
                                location_result["result"]["location"]["lat"])
                        else:
                            self.logger.warning(
                                f"_id: {data['_id']} 获取经纬度失败")
                    else:
                        continue
                except Exception as e:
                    self.logger.exception(
                        f"_id: {data['_id']} 获取经纬度失败, error: {e}")
                    continue

                # upsert to HBase
                try:
                    re_data = self.__check_lat(data=data)
                    self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                    count += 1
                    if count % 100 == 0:
                        self.logger.info(
                            f"HBase 插入成功, _id: {data['_id']}, 成功条数 {count}")
                except Exception as e:
                    self.logger.exception(
                        f"HBase 插入失败, _id: {data['_id']}, error: {e}")
                    continue
        finally:
            # Always release the MongoDB cursor.
            mongo_data_list.close()

        self.logger.info(
            f"collection: {self.m_client.mongo_collection} 的数据清洗完毕, 成功条数共计: {count} 条"
        )

    def check_lat(self):
        """Copy every row of the ``FANSILE`` table into ``CHA_BRANCH_MAPBAR``.

        ``self.count`` is increased for each written row and progress is
        logged every 100 rows.

        NOTE(review): the target table name is hard-coded here and replaces
        whatever ``table_name`` the instance was created with.

        :return: None
        """
        # Read from FANSILE, then point the client back at the target table
        # before any row is written.
        self.p_client.table_name = "FANSILE"
        data_cursor = self.p_client.search_all_from_phoenix(
            connection=self.connection, dict_status=True)
        self.p_client.table_name = "CHA_BRANCH_MAPBAR"
        while True:
            try:
                data = next(data_cursor)
            except StopIteration:
                break

            self.p_client.upsert_to_phoenix_by_one(
                connection=self.connection, data=data)
            self.count += 1
            if self.count % 100 == 0:
                # Phoenix rows are keyed by "ID_" (see the table's primary
                # key); fall back to it so the progress log cannot raise
                # KeyError when "_id" is absent.
                row_id = data.get("_id", data.get("ID_"))
                self.logger.info(
                    f"HBase 插入成功, _id: {row_id}, 成功条数 {self.count} 条")

    def __check_lat(self, data):
        """Blank out coordinates that fall outside the accepted window.

        Out-of-window coordinates are moved to ``CHECK_LAT_`` /
        ``CHECK_LNG_`` and ``LAT_`` / ``LNG_`` are set to ``""``; a warning
        is logged.

        :param data: record dict; modified in place.
        :return: the same ``data`` object. Records without ``LAT_`` are
            returned untouched.
        :raises ValueError: if a coordinate is not a parseable number.
        """
        if "LAT_" not in data:
            return data

        lat_low, lat_high = self._LAT_RANGE
        lng_low, lng_high = self._LNG_RANGE
        # Longitude is only parsed when the latitude is already acceptable.
        inside = (lat_low < float(data["LAT_"]) < lat_high
                  and lng_low < float(data["LNG_"]) < lng_high)
        if inside:
            return data

        self.logger.warning(
            f"错误 _id: {data['_id']}, 经纬度: {data['LAT_']},{data['LNG_']}")
        data["CHECK_LAT_"] = data["LAT_"]
        data["CHECK_LNG_"] = data["LNG_"]
        data["LAT_"] = ""
        data["LNG_"] = ""
        return data