Exemple #1
0
    def run(self):
        """Log how many duplicate records each entity code has in MongoDB.

        For every code in ``self.code_list`` the matching documents are fetched
        through ``self.get_data_from_mongo`` and fingerprinted with the MD5 of
        their ``ADDR_`` field, falling back to ``CONTENT_`` when ``ADDR_`` is
        absent; documents with neither field are ignored. A document whose
        fingerprint was already seen for the same entity code counts as a
        duplicate. The per-entity total is kept in ``self.count`` and logged.

        The Mongo client is closed even when processing raises.
        """
        # Fields used for the fingerprint, in priority order.
        fingerprint_fields = ("ADDR_", "CONTENT_")

        # Create the MongoDB helper and resolve the collection to scan.
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        try:
            collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)

            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                self.count = 0
                # A set gives O(1) membership tests; the previous list made
                # the scan quadratic in the number of documents.
                seen_hashes = set()
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code)

                if mongo_data_list:
                    self.logger.warning("{} 查取成功".format(entity_code))
                    self.logger.warning(
                        "当前共有{}条".format(mongo_data_list.count()))
                    for data in mongo_data_list:
                        field = next(
                            (name for name in fingerprint_fields
                             if name in data),
                            None)
                        if field is None:
                            continue
                        digest = hashlib.md5(
                            data[field].encode("utf-8")).hexdigest()
                        if digest in seen_hashes:
                            self.count += 1
                        else:
                            seen_hashes.add(digest)
                else:
                    self.logger.warning("{} 无数据".format(entity_code))

                self.logger.warning("重复数据{}条".format(self.count))
        finally:
            # Close the connection.
            m_client.client_close()
        self.logger.handlers.clear()
Exemple #2
0
    def run(self):
        """Clean fund documents from MongoDB and upsert them into Phoenix "FUND".

        For every entity code in ``self.code_list`` the module of the same name
        is imported and its ``data_shuffle`` is applied to each document
        returned by ``self.get_data_from_mongo``. Every cleaned record (a dict,
        or each truthy item of a list) is upserted through ``PhoenixHbase``.
        Whenever ``self.success_count`` is a multiple of 50 the Mongo ids
        collected in ``self.remove_id_list`` are flagged with ``{"d": 1}``.

        Counters touched on ``self``: ``find_count`` (accumulated over all
        entity codes), ``success_count``, ``remove_count`` and ``bad_count``.
        They are logged at the end together with ``old_count``. Both
        connections are closed even when processing raises.
        """
        count = 0
        # Create the Phoenix helper -- mind the table name.
        p_client = PhoenixHbase(table_name="FUND")
        p_client.verify_list = self.verify_list
        # Connect to Phoenix.
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB helper used for querying.
        m_client = MongoClient(mongo_collection="JSFUND_CCBDATA")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Dictionary tables handed to every data_shuffle call.
        subs_status_list = self.dict_from_mysql("FUND_SUBS_STATUS")
        type_list = self.dict_from_mysql("FUND_TYPE")

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # # DDL of the fund table
        # sql = ('create table "FUND" ("ID_" varchar primary key,"C"."ENTITY_CODE_" varchar,"C"."AREA_CODE_" varchar,'
        #     '"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,"C"."UNIT_CODE_" varchar,"C"."PERIOD_CODE_" varchar,"C"."REMARK_" varchar,'
        #     '"C"."CREATE_TIME_" varchar,"C"."UPDATE_TIME_" varchar,"C"."STATUS_" varchar,"C"."CODE_" varchar,"C"."NAME_" varchar,'
        #     '"C"."FUND_OLD_VALUE_" varchar,"C"."TOTAL_OLD_VALUE_" varchar,"C"."FUND_NEW_VALUE_" varchar,"C"."TOTAL_NEW_VALUE_" varchar,'
        #     '"C"."INVEST_PERIOD_" varchar,"C"."DAILY_RATE_" varchar,"C"."YEAR_REWARD_" varchar,"C"."SUBS_STATUS_" varchar,'
        #     '"C"."ATONEM_STATUS_" varchar,"C"."TYPE_" varchar,"C"."NEWEST_VALUE_" varchar,"C"."TOTAL_VALUE_" varchar,'
        #     '"C"."POPULARITY_" varchar,"C"."RATING_" varchar,"C"."ENTITY_NAME_" varchar,"C"."OLD_VALUE_" varchar,'
        #     '"C"."UNIT_VALUE_" varchar,"C"."SCALE_" varchar,"C"."ESTABLISH_DATE_" varchar,"C"."RISK_LEVEL_" varchar,'
        #     '"C"."BASE_INFO_" varchar,"C"."YIELD_" varchar,"C"."INVEST_" varchar,"C"."MONTH_RATE_" varchar,'
        #     '"C"."QUARTER_RATE_" varchar,"C"."HALF_YEAR_RATE_" varchar,"C"."URL_" varchar,"C"."HISTORY_RATE_" varchar,'
        #     '"C"."FUND_STATUS_" varchar,"C"."COMPANY_" varchar,"C"."SUBS_STATUS_CODE_" varchar,"C"."TYPE_CODE_" varchar)IMMUTABLE_ROWS = true')
        #
        # # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        def discard_pending(data_id, copy_data):
            """Forget a failed document; a no-op when it was already dropped.

            A plain ``list.remove`` raised ValueError inside the error handlers
            when a second item of the same document failed, or when cleaning
            failed before the copy had been appended.
            """
            if data_id in self.remove_id_list:
                self.remove_id_list.remove(data_id)
            if (copy_data is not None
                    and copy_data in self.copy_mongo_data_list):
                self.copy_mongo_data_list.remove(copy_data)

        def flag_pending():
            """Mark every pending Mongo id with {"d": 1} and count the updates."""
            update_count = m_client.update_to_mongodb(
                collection=collection,
                data_id=self.remove_id_list,
                data_dict={"d": 1})
            self.remove_count += update_count
            self.logger.info("MongoDB 更新成功")

        try:
            # Walk the ENTITY_CODE_ list.
            for entity_code in self.code_list:
                module = __import__(entity_code)
                self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

                self.remove_id_list = []
                self.copy_mongo_data_list = []
                self.branch_code_list = []
                try:
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause.
                    sleep(1)
                    mongo_data_list = self.get_data_from_mongo(
                        m_client=m_client,
                        collection=collection,
                        entity_code=entity_code)

                if not mongo_data_list:
                    continue

                # Clean the data and insert it into HBase.
                once_count = 0
                # Accumulate across entity codes: the final log line reports
                # the total for the whole run, not only the last entity.
                self.find_count = (getattr(self, "find_count", 0)
                                   + mongo_data_list.count())
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = None
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        re_data = module.data_shuffle(
                            data, subs_status_list, type_list)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        discard_pending(data_id, copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            if not list_data:
                                continue
                            # Insert into Phoenix/HBase.
                            try:
                                count += 1
                                print(count)
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=list_data)
                                once_count += success_count
                                self.success_count += success_count
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        success_count))
                                # NOTE(review): ids still pending when an
                                # entity finishes between two multiples of 50
                                # are never flagged -- confirm this is intended.
                                if self.success_count % 50 == 0:
                                    flag_pending()
                            except Exception as e:
                                discard_pending(data_id, copy_data)
                                self.logger.warning(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                    elif isinstance(re_data, dict):
                        # Insert into Phoenix/HBase.
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=re_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 100 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数 {} 条".format(
                                        self.success_count))
                            # Add {d: 1} to the processed documents.
                            if self.success_count % 50 == 0:
                                flag_pending()
                        except Exception as e:
                            discard_pending(data_id, copy_data)
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                if once_count > 0:
                    self.logger.info(
                        "HBase 插入成功, 成功条数 {}".format(once_count))
        finally:
            # Close the connections.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)

        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Exemple #3
0
    def run(self):
        """Clean finance-product documents and upsert them into Phoenix.

        The entities ``CHINANETFINANCIAL`` and ``JSFIN_CCBDATA`` are visited in
        that order. For each one the module of the same name is imported, the
        Mongo collection is selected (``JSFIN_CCBDATA`` for the CCB entity,
        ``FINPRODUCT_FINASSIST`` otherwise), the documents are fetched and
        cleaned, and every resulting record (a dict, or each item of a list)
        is upserted into the ``FINPRODUCT_FINASSIST`` Phoenix table.

        The loop stops after the first entity that returned data (see the
        review note next to ``break``). Deleting processed documents and
        archiving them to ``spider_data_old`` are not performed here.

        Counters touched on ``self``: ``find_count``, ``success_count`` and
        ``bad_count``; ``remove_count`` and ``old_count`` are only logged.
        The Mongo client is closed even when processing raises.
        """
        # Create the Phoenix helper.
        p_client = PhoenixHbase(table_name="FINPRODUCT_FINASSIST")
        p_client.verify_list = self.verify_list
        # Connect to Phoenix.
        connection = p_client.connect_to_phoenix()
        # Create the MongoDB helper used for querying.
        m_client = MongoClient(mongo_collection="FINPRODUCT_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(db=db, collection_list=collection_list)

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # # Table DDL
        # sql = ('create table "FINPRODUCT_FINASSIST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,"C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, '
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."TIME_LIMIT_" varchar,'
        #        '"C"."YIELD_RATE_" varchar, "C"."BREAKEVEN_" varchar, "C"."START_FUNDS_" varchar,'
        #        '"C"."INVEST_PERIOD_" varchar, "C"."SALE_DISTRICT_" varchar, "C"."SALE_START_" varchar,'
        #        '"C"."SALE_END_" varchar, "C"."RISK_LEVEL_" varchar, "C"."REDEMING_MODE_" varchar,'
        #        '"C"."PRIVATE_BANK_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C"."DATETIME_" varchar,'
        #        '"C"."ENTITY_NAME_" varchar, "C"."CURRENCY_TYPE_" varchar, "C"."INCREASE_UNIT_" varchar,'
        #        '"C"."YIELD_START_DATE_" varchar, "C"."YIELD_END_DATE_" varchar, "C"."YIELD_TYPE_" varchar,'
        #        '"C"."TARGET_" varchar, "C"."PRODUCT_TYPE_" varchar, "C"."YIELD_STATMENT_" varchar,'
        #        '"C"."INVEST_RANGE_" varchar, "C"."PRE_STOP_" varchar, "C"."RASE_PLAN_" varchar,'
        #        '"C"."PURCHASE_" varchar, "T"."CONTENT_" varchar, "C"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Add a column
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        def fetch_documents(entity, module, find_id):
            """Select the entity's Mongo collection and return its documents.

            Used for both the first attempt and the retry so the collection is
            always re-resolved; the retry for JSFIN_CCBDATA used to reuse
            whatever ``collection`` happened to be bound at that point.
            """
            is_ccb = entity == "JSFIN_CCBDATA"
            m_client.mongo_collection = (
                "JSFIN_CCBDATA" if is_ccb else "FINPRODUCT_FINASSIST")
            entity_collection = m_client.get_check_collection(
                db=db, collection_list=collection_list)
            if is_ccb:
                return module.ScriptCCB.get_data_from_mongo(
                    self=self, m_client=m_client,
                    collection=entity_collection, data_id=None)
            return self.get_data_from_mongo(
                m_client=m_client, collection=entity_collection,
                entity_code=entity, data_id=find_id)

        def discard_pending(data_id, copy_data):
            """Forget a failed document; a no-op when it was already dropped.

            The stored object is the deep copy, not ``data`` itself, which the
            cleaning step may have modified; removing ``data`` could therefore
            raise ValueError inside the error handler.
            """
            if data_id in self.remove_id_list:
                self.remove_id_list.remove(data_id)
            if (copy_data is not None
                    and copy_data in self.copy_mongo_data_list):
                self.copy_mongo_data_list.remove(copy_data)

        try:
            for entity in ["CHINANETFINANCIAL", "JSFIN_CCBDATA"]:
                # for entity in self.entity_list:
                module = __import__(entity)
                self.logger.info("开始进行 ENTITY_CODE_: {}".format(entity))
                self.remove_id_list = []
                self.copy_mongo_data_list = []
                # find_id = "5c3f118f8d7fee068da6ef53"
                find_id = None
                try:
                    mongo_data_list = fetch_documents(entity, module, find_id)
                except pymongo.errors.ServerSelectionTimeoutError:
                    # One retry after a short pause.
                    sleep(1)
                    mongo_data_list = fetch_documents(entity, module, find_id)

                if not mongo_data_list:
                    continue

                # Clean the data and insert it into HBase.
                once_count = 0
                self.find_count += mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    copy_data = None
                    self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        copy_data = deepcopy(data)
                        self.copy_mongo_data_list.append(copy_data)
                        if entity == "CHINANETFINANCIAL":
                            re_data = module.data_shuffle(
                                data=data, sales_status=self.sales_status,
                                produc_category=self.produc_category,
                                revenue_type=self.revenue_type,
                                operaton_pattern=self.operaton_pattern,
                                purchase_amount=self.purchase_amount,
                                duration_type=self.duration_type)
                        elif entity == "JSFIN_CCBDATA":
                            re_data = module.ScriptCCB.data_shuffle(self=self, data=data)
                        else:
                            re_data = module.data_shuffle(data)

                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        discard_pending(data_id, copy_data)
                        self.logger.warning("清洗错误,错误 _id 为{}, {}".format(data_id, e))
                        continue

                    print(data_id)

                    # Insert into Phoenix/HBase.
                    if isinstance(re_data, dict):
                        records = [re_data]
                    elif isinstance(re_data, list):
                        records = re_data
                    else:
                        records = []
                    for record in records:
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=record)
                            once_count += success_count
                            self.success_count += success_count
                        except Exception as e:
                            discard_pending(data_id, copy_data)
                            self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(data_id, e))
                            continue

                if once_count > 0:
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
                # NOTE(review): this stops after the first entity that
                # returned data, so JSFIN_CCBDATA is only reached when
                # CHINANETFINANCIAL is empty -- confirm this is intended.
                break
        finally:
            # Close the connection.
            m_client.client_close()
        # NOTE(review): the Phoenix connection is left open here, as before.
        # p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Exemple #4
0
class Statistics(object):
    def __init__(self, entity_type=None):
        """
        Prepare dates, working paths, the Mongo client and the statistics row.

        :param entity_type: name of the directory under ``scripts`` to scan;
            when truthy, ``get_entity_code`` is called to list its files
        """
        now = arrow.now()
        self.local_time = now.format("YYYY-MM-DD")
        # Same calendar format, one day earlier.
        self.hesternal_time = now.shift(days=-1).format("YYYY-MM-DD")
        self.entity_type = entity_type

        # All paths are anchored at the directory holding this module.
        module_dir = os.path.dirname(__file__)
        self.__base_path = os.path.abspath(module_dir)
        self.__dir_path = self.__base_path + "/scripts/{}".format(
            self.entity_type)
        self.file_path = self.__base_path + "/statistics/{}".format(
            self.local_time)

        self.__type_list = []
        self.__file_list = []
        if self.entity_type:
            self.get_entity_code()

        self.mongo_client = MongoClient()

        # One output row; insertion order fixes the column order.
        self.name_dict = {"实体编码": ""}
        for column in ("待爬数据", "需爬取总量", "现有数据", "昨日爬取数据"):
            self.name_dict[column] = 0

    def get_entity_code(self):
        """
        获取目标目录下文件名(去除 "CommonBidding_" 后就是 ENTITY_CODE_ )
        :return:
        """
        for root, dirs, files in os.walk(self.__dir_path):
            # print(root)  # 当前目录路径
            # print(dirs)  # 当前路径下所有子目录
            # print(files)  # 当前路径下所有非目录子文件
            self.__file_list = files
            self.__file_list.remove("__init_____.py")
            break

    def save_to_csv(self, file_path):
        """
        save each count and save to csv
        :param file_path:
        :return:
        """
        if os.path.exists(file_path):
            with open(file_path, "a", newline="", errors="ignore") as f:
                writer = csv.writer(f)
                append_list = list()
                for key, value in self.name_dict.items():
                    append_list.append(value)
                writer.writerows([append_list])
        else:
            try:
                with open(file_path, "a", newline="", errors="ignore") as f:
                    writer = csv.writer(f)
                    check_list = list()
                    append_list = list()
                    for key, value in self.name_dict.items():
                        check_list.append(key)
                        append_list.append(value)
                    writer.writerows([check_list])
                    writer.writerows([append_list])
            except FileNotFoundError:
                os.makedirs(self.file_path)
                with open(file_path, "a", newline="", errors="ignore") as f:
                    writer = csv.writer(f)
                    check_list = list()
                    append_list = list()
                    for key, value in self.name_dict.items():
                        check_list.append(key)
                        append_list.append(value)
                    writer.writerows([check_list])
                    writer.writerows([append_list])

    def count_from_database(self):
        """
        count data for database "spider_url_temp", "spider_url_fixed", "spider_data" where entity_code == entity_type
        :return:
        """
        # test_index = self.__file_list.index("CommonBidding_650500HMSSY.py")
        # self.__file_list = self.__file_list[test_index:]
        for entity_code in self.__file_list:
            entity_code = entity_code.replace("CommonBidding_", "")
            entity_code = entity_code.replace(".py", "")
            print(entity_code)
            self.name_dict["实体编码"] = entity_code

            # spider_url_temp
            db = self.mongo_client.client["spider_url_temp"]
            collection = db[entity_code]
            # 统计该实体所有数据
            try:
                mongo_data_list_temp = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_temp:
                    self.name_dict["待爬数据"] = mongo_data_list_temp.count()
                else:
                    self.name_dict["待爬数据"] = 0
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(5)
                mongo_data_list_temp = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_temp:
                    self.name_dict["待爬数据"] = mongo_data_list_temp.count()
                else:
                    self.name_dict["待爬数据"] = 0

            # 统计该实体昨天数据
            # try:
            #     temp_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                              {"DATETIME_": {"$lte": self.local_time}}]})
            #     if temp_day_ago:
            #         self.name_dict["hesternal_spider_url_temp"] = temp_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_temp"] = 0
            # except pymongo.errors.ServerSelectionTimeoutError:
            #     time.sleep(5)
            #     temp_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                              {"DATETIME_": {"$lte": self.local_time}}]})
            #     if temp_day_ago:
            #         self.name_dict["hesternal_spider_url_temp"] = temp_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_temp"] = 0

            # spider_url_fixed
            db = self.mongo_client.client["spider_url_fixed"]
            collection = db[entity_code]

            # 统计该实体所有数据
            try:
                mongo_data_list_fixed = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_fixed:
                    self.name_dict["需爬取总量"] = mongo_data_list_fixed.count()
                else:
                    self.name_dict["需爬取总量"] = 0
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(5)
                mongo_data_list_fixed = self.mongo_client.all_from_mongodb(
                    collection=collection)
                if mongo_data_list_fixed:
                    self.name_dict["需爬取总量"] = mongo_data_list_fixed.count()
                else:
                    self.name_dict["需爬取总量"] = 0

            # # 统计该实体昨天数据
            # try:
            #     fixed_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                               {"DATETIME_": {"$lte": self.local_time}}]})
            #     if fixed_day_ago:
            #         self.name_dict["hesternal_spider_url_fixed"] = fixed_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_fixed"] = 0
            # except pymongo.errors.ServerSelectionTimeoutError:
            #     time.sleep(5)
            #     fixed_day_ago = collection.find({"$and": [{"DATETIME_": {"$gte": self.hesternal_time}},
            #                                               {"DATETIME_": {"$lte": self.local_time}}]})
            #     if fixed_day_ago:
            #         self.name_dict["hesternal_spider_url_fixed"] = fixed_day_ago.count()
            #     else:
            #         self.name_dict["hesternal_spider_url_fixed"] = 0

            # spider_data
            db = self.mongo_client.client["spider_data"]
            collection = db[self.entity_type]
            self.mongo_client.mongo_entity_code = entity_code
            if len(self.__file_list) == 1:
                # 统计该实体所有数据
                try:
                    mongo_data_list_data = self.mongo_client.all_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    mongo_data_list_data = self.mongo_client.all_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                # 统计该实体昨天数据
                try:
                    data_day_ago = collection.find({
                        "$and": [{
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    data_day_ago = collection.find({
                        "$and": [{
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
            else:
                # 统计该实体所有数据
                try:
                    mongo_data_list_data = self.mongo_client.search_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    mongo_data_list_data = self.mongo_client.search_from_mongodb(
                        collection=collection)
                    if mongo_data_list_data:
                        self.name_dict["现有数据"] = mongo_data_list_data.count()
                    else:
                        self.name_dict["现有数据"] = 0
                # 统计该实体昨天数据
                try:
                    data_day_ago = collection.find({
                        "$and": [{
                            "ENTITY_CODE_": entity_code
                        }, {
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0
                except pymongo.errors.ServerSelectionTimeoutError:
                    time.sleep(5)
                    data_day_ago = collection.find({
                        "$and": [{
                            "ENTITY_CODE_": entity_code
                        }, {
                            "DATETIME_": {
                                "$gte": self.hesternal_time
                            }
                        }, {
                            "DATETIME_": {
                                "$lte": self.local_time
                            }
                        }]
                    })
                    if data_day_ago:
                        self.name_dict["昨日爬取数据"] = data_day_ago.count()
                    else:
                        self.name_dict["昨日爬取数据"] = 0

            file_path = self.file_path + "/count_for_{}.csv".format(
                self.entity_type)
            self.save_to_csv(file_path)
        self.mongo_client.client_close()

    def run(self):
        """Count records for one entity type, or for every type under ``scripts``.

        When ``self.entity_type`` is already set, only that type is counted.
        Otherwise the immediate sub-directories of ``<base_path>/scripts`` are
        taken as the list of entity types; for each of them the entity codes
        are loaded with ``get_entity_code`` and ``count_from_database`` is run.
        """
        if self.entity_type:
            self.count_from_database()
            return

        # Only the first step of the walk is needed: its directory names are
        # the entity types.  If the walk yields nothing, the previous value of
        # the type list is left untouched (same as the original loop).
        top_level = next(os.walk(self.__base_path + "/scripts"), None)
        if top_level is not None:
            # Filter "__pycache__" out instead of calling list.remove(), which
            # raises ValueError when the directory has no such entry.
            self.__type_list = [
                name for name in top_level[1] if name != "__pycache__"
            ]

        for _type in self.__type_list:
            print(_type)
            self.entity_type = _type
            self.__dir_path = self.__base_path + "/scripts/{}".format(_type)
            self.get_entity_code()
            self.count_from_database()
Exemple #5
0
    def run(self):
        """Move CommonBidding documents from MongoDB into HBase via Phoenix.

        For every script file in ``self.file_list`` the matching entity's
        documents are fetched from MongoDB, cleaned with that script's
        ``data_shuffle`` followed by ``self.shuffle_data``, and upserted into
        the Phoenix table one at a time.  When at least one upsert succeeded,
        the kept documents are deleted from the source collection and their
        untouched copies are inserted into ``spider_data_old``.  Run totals
        are logged at the end.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="CommonBidding")
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to query the source collection
        m_client = MongoClient(mongo_collection="CommonBidding")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # MongoDB client for the spider_data_old database
        old_client = MongoClient(mongo_collection="CommonBidding")
        # Local testing
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017,
        #                                         serverSelectionTimeoutMS=60, connectTimeoutMS=60, connect=False)
        old_client.mongo_db = "spider_data_old"
        db_old, _ = old_client.client_to_mongodb()
        collection_old = db_old["CommonBidding"]

        # Drop the table
        # p_client.drop_table_phoenix(connection=connection)

        # Bidding table creation statement
        # sql = ('create table "CommonBidding" ("ID_" varchar primary key, "F"."CONTENT_" varchar,'
        #        '"F"."NOTICE_TIME_" varchar,"F"."TITLE_" varchar,"F"."PROJECT_NAME_" varchar,'
        #        '"F"."BID_CONTENT_" varchar, "F"."SIGN_START_TIME_" varchar, "F"."SIGN_END_TIME_" varchar,'
        #        '"F"."OPEN_BID_TIME_" varchar, "F"."OPEN_BID_PLACE_" varchar, "F"."BID_AGENCY_" varchar,'
        #        '"F"."APPLY_CONDITION_" varchar, "F"."SIGN_QUALIFICATION_" varchar, "F"."PROJECT_ID_" varchar,'
        #        '"F"."WIN_CANDIDATE_" varchar, "F"."CANDIDATE_RANK_" varchar, "F"."BID_" varchar,"F"."URL_" varchar,'
        #        '"F"."DEALTIME_" varchar, "F"."ENTITY_NAME_" varchar, "F"."ENTITY_CODE_" varchar,'
        #        '"F"."ENTITY_STATUS_" varchar, "F"."SIGN_MATERIAL_" varchar, "F"."BID_TYPE_" varchar,'
        #        '"F"."DATETIME_" varchar, "F"."BUDGET_PRICE_" varchar, "F"."PASS_REASON_" varchar,'
        #        '"F"."PRESALE_CONTENT_" varchar, "F"."PRESALE_WAY_" varchar,"F"."PRESALE_START_TIME_" varchar,'
        #        '"F"."PRESALE_END_TIME_" varchar,"F"."PRESALE_ADDR_" varchar,"F"."PRESALE_PREPARE_" varchar,'
        #        '"F"."IMAGE_" varchar) IMMUTABLE_ROWS = true')
        # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Add a column
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        # Walk the ENTITY_CODE_ list (one cleaning script per entity)
        for f in self.file_list:
            entity_code = f.replace(".py", "")
            module_name = __import__(entity_code)
            entity_code_mongo = entity_code.replace("CommonBidding_", "")
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code_mongo))
            self.remove_id_list = []
            self.copy_mongo_data_list = []
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                time.sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code_mongo)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase
            once_count = 0
            try:
                self.find_count += mongo_data_list.count()
            except pymongo.errors.ServerSelectionTimeoutError:
                time.sleep(1)
                self.find_count += mongo_data_list.count()

            for data in mongo_data_list:
                data_id = data["_id"]
                del data["_id"]
                # Copy the source document before cleaning touches it; the
                # copy is what gets inserted into spider_data_old.
                copy_data = deepcopy(data)

                try:
                    re_data = module_name.data_shuffle(data)
                    final_data = self.shuffle_data(re_data)
                except Exception as e:
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue

                if final_data:
                    try:
                        p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=final_data)
                        once_count += 1
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

                # Record the document only once it is known to be kept.  This
                # replaces "append first, list.remove() on failure", which
                # cost a linear scan for every failed document.
                self.remove_id_list.append(data_id)
                self.copy_mongo_data_list.append(copy_data)

            if once_count == 0:
                self.logger.info("HBase 插入成功条数0条, 不执行删除")
                continue
            self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))

            # Delete the migrated documents from the source collection.
            # NOTE(review): the delete runs before the archive insert below,
            # so a failed archive leaves these documents in neither place —
            # confirm this ordering is intended.
            delete_count = self.delete_data_from_mongo(
                m_client=m_client,
                collection=collection,
                entity_code=entity_code_mongo,
                remove_id_list=self.remove_id_list)
            self.remove_count += delete_count

            # Insert the untouched copies into spider_data_old
            try:
                old_client.mongo_db = "spider_data_old"
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except pymongo.errors.ServerSelectionTimeoutError as e:
                time.sleep(1)
                self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
                insert_count = old_client.all_to_mongodb(
                    collection=collection_old,
                    insert_list=self.copy_mongo_data_list)
                self.old_count += insert_count
            except Exception as e:
                self.logger.info(e)

        # Close connections.  The archive client was previously left open.
        m_client.client_close()
        old_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(p_client.count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.handlers.clear()
Exemple #6
0
    def run(self):
        """Clean NEWS_FINASSIST documents from MongoDB and upsert them into HBase.

        For every entity code in ``self.code_list`` the module of the same
        name is imported and its ``data_shuffle`` is applied to each document
        fetched from MongoDB.  List results are passed through
        ``get_brief_from_ai`` (except for ``CNINFONEWS``), upserted through
        Phoenix, and the source document is then updated with ``{"d": 1}``.
        Dict results are upserted directly.  Run totals are logged at the end.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="NEWS_FINASSIST")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to query the source collection
        m_client = MongoClient(mongo_collection="NEWS_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # # Drop the table
        # p_client.drop_table_phoenix(connection=connection)
        #
        # # Table creation statement
        # sql = ('create table "NEWS_FINASSIST" ("ID_" varchar primary key, "T"."CONTENT_" varchar, '
        #        '"C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar, "C"."TITLE_" varchar, "C"."BRIEF_" varchar, '
        #        '"C"."PUBLISH_TIME_" varchar, "C"."KEYWORDS_" varchar, "C"."URL_" varchar, "C"."DATA_SOURCE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar,'
        #        '"C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar,'
        #        '"C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar, "F"."STATUS_" varchar)'
        #        'IMMUTABLE_ROWS = true')

        # # Create the table
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Walk the ENTITY_CODE_ list
        for entity_code in self.code_list:
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            # NOTE(review): hard-coded document id handed to
            # get_data_from_mongo for this one entity; presumably a resume
            # point — confirm against get_data_from_mongo.
            if entity_code == "CAIJINGNEWS":
                find_id = "5c6bfa508d7fee512a4ca68f"
            else:
                find_id = ""
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                # One retry after a short pause; a second failure propagates.
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    find_id=find_id)

            if not mongo_data_list:
                continue

            # Clean the data and insert it into HBase
            once_count = 0
            # Accumulate: the value is reported as the total for the whole
            # run, so it must not be overwritten by each entity.
            self.find_count += mongo_data_list.count()

            # The cursor is advanced by hand so that a server-selection
            # timeout on one fetch can be retried; the range bounds the total
            # number of fetch attempts.
            for _ in range(1000000):
                try:
                    data = next(mongo_data_list)
                except pymongo.errors.ServerSelectionTimeoutError:
                    continue
                except StopIteration:
                    break

                data_id = data["_id"]
                if self.success_count % 100 == 0:
                    self.logger.info(
                        "running on data_id: {}".format(data_id))

                try:
                    del data["_id"]
                    re_data = module_name.data_shuffle([data])
                except Exception as e:
                    self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                if not re_data:
                    self.bad_count += 1
                    continue

                if isinstance(re_data, list):
                    for list_data in re_data:
                        if not list_data:
                            continue

                        try:
                            if entity_code != "CNINFONEWS":
                                ai_data = self.get_brief_from_ai(
                                    data=list_data)
                            else:
                                ai_data = list_data
                        except Exception as e:
                            # The message needs a placeholder; passing the
                            # exception as a bare extra argument makes the
                            # logging module fail to format the record.
                            self.logger.info(
                                "AI 调取失败, 错误信息 {}".format(e))
                            # Fall back to the current record, not to the
                            # whole result list.
                            ai_data = list_data

                        # Insert into HBase through Phoenix
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=ai_data)
                            once_count += success_count
                            self.success_count += success_count
                            if self.success_count % 10 == 0:
                                self.logger.info(
                                    "HBase 插入成功, 成功条数{}条".format(
                                        once_count))
                        except Exception as e:
                            self.logger.warning(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                        # Update the source document with {d: 1}
                        try:
                            update_count = m_client.update_to_mongodb(
                                collection=collection,
                                data_id=data_id,
                                data_dict={"d": 1})
                            self.remove_count += update_count
                            if self.remove_count % 10 == 0:
                                self.logger.info(
                                    "MongoDB 更新成功, 成功条数 {} 条".format(
                                        "10"))
                        except Exception as e:
                            self.logger.warning(
                                "MongoDB 更新 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                elif isinstance(re_data, dict):
                    # NOTE(review): unlike the list branch, a dict result is
                    # neither sent through get_brief_from_ai nor updated in
                    # MongoDB afterwards — confirm this is intended.
                    try:
                        success_count = p_client.upsert_to_phoenix_by_one(
                            connection=connection, data=re_data)
                        once_count += success_count
                        self.success_count += success_count
                        self.logger.info(
                            "HBase 插入成功, 成功条数 {} 条".format(
                                success_count))
                    except Exception as e:
                        self.logger.warning(
                            "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                data_id, e))
                        continue

            if once_count > 0:
                self.logger.info("ENTITY_CODE_: {} 插入成功条数 {}".format(
                    entity_code, once_count))
            mongo_data_list.close()

        # Close connections
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Exemple #7
0
    def run(self):
        """Rebuild the WEIBOBASICINFO Phoenix table from MongoDB.

        The Phoenix table is dropped and recreated, then every document of
        the MongoDB collection is cleaned with ``self.data_shuffle`` and
        upserted one at a time.  Run totals are logged at the end.

        Raises:
            SystemExit: when the MongoDB query returns nothing; both
                connections are closed first.
        """
        # Phoenix client and connection
        p_client = PhoenixHbase(table_name="WEIBOBASICINFO")
        p_client.verify_list = self.verify_list
        connection = p_client.connect_to_phoenix()
        # MongoDB client used to query the source collection
        m_client = MongoClient(mongo_collection="WEIBOBASICINFO")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # Region codes; only the province list is used by the cleaning step.
        province_list, _city_list, _area_list, _dir_area_list = (GenericScript(
            entity_code=None, entity_type=None).area_from_mysql())

        # Drop the table
        p_client.drop_table_phoenix(connection=connection)

        # Create the table
        sql = (
            'create table "WEIBOBASICINFO" ("ID_" varchar primary key, "C"."BANK_CODE_" varchar,'
            '"C"."BANK_NAME_" varchar, "C"."PERIOD_CODE_" varchar, "C"."CREATE_TIME_" varchar,'
            '"C"."UPDATE_TIME_" varchar, "C"."REMARK_" varchar, "C"."WEIBO_CODE_" varchar, "C"."MAIN_URL_" varchar,'
            '"C"."NAME_" varchar, "C"."FOCUS_" varchar, "C"."FANS_" varchar, "C"."COMPANY_URL_" varchar,'
            '"C"."COMPANY_" varchar, "C"."DETAILED_URL_" varchar, "C"."VIRIFIED_" varchar,"C"."AREA_CODE_" varchar,'
            '"C"."BIREF_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
            '"C"."DEALTIME_" varchar,"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar,'
            '"C"."STATUS_" varchar) IMMUTABLE_ROWS = true')
        p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # Add a column
        # p_client.add_column_phoenix(connection=connection, column="IMAGE_")

        self.logger.info("开始进行 WEIBOBASICINFO")

        try:
            mongo_data_list = m_client.all_from_mongodb(collection=collection)
        except pymongo.errors.ServerSelectionTimeoutError:
            # One retry after a short pause; a second failure propagates.
            time.sleep(1)
            mongo_data_list = m_client.all_from_mongodb(collection=collection)

        if not mongo_data_list:
            # Still terminates the program as before, but without relying on
            # quit() (a helper installed by the site module) and without
            # leaving the two connections open.
            m_client.client_close()
            p_client.close_client_phoenix(connection=connection)
            raise SystemExit

        # Clean the data and insert it into HBase
        self.find_count = mongo_data_list.count()
        for data in mongo_data_list:
            data_id = data["_id"]
            del data["_id"]
            copy_data = deepcopy(data)
            self.remove_id_list.append(data_id)
            self.copy_mongo_data_list.append(copy_data)

            try:
                re_data = self.data_shuffle(data=data,
                                            province_list=province_list)
            except Exception as e:
                # Undo the two appends above (they are the last elements).
                self.remove_id_list.pop()
                self.copy_mongo_data_list.pop()
                self.logger.warning("清洗错误,错误 _id 为{}, {}".format(
                    data_id, e))
                # Skip the document: falling through used to upsert an empty
                # string and, if that failed, remove the id a second time,
                # which raised ValueError.
                continue

            if not re_data:
                self.bad_count += 1
                continue

            # Insert into HBase through Phoenix
            try:
                success_count = p_client.upsert_to_phoenix_by_one(
                    connection=connection, data=re_data)
                self.success_count += success_count
            except Exception as e:
                self.remove_id_list.pop()
                self.copy_mongo_data_list.pop()
                self.logger.warning("HBase 插入 _id 为 {} 的数据失败, {}".format(
                    data_id, e))
                continue

        if self.success_count > 0:
            self.logger.info("HBase 插入成功, 成功条数 {}".format(
                self.success_count))

        # NOTE: deleting the migrated documents from MongoDB and copying them
        # into spider_data_old is not performed by this method; the lists
        # remove_id_list / copy_mongo_data_list are only collected.

        # Close connections
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
Exemple #8
0
    def run(self):
        # 创建 Phoenix 对象
        p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
        p_client.verify_list = self.verify_list
        # 连接 Phoenix
        connection = p_client.connect_to_phoenix()
        # 创建 MongoDB 查询数据库对象
        m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        db, collection_list = m_client.client_to_mongodb()
        collection = m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # # 创建 MongoDB spider_data_old 数据库对象
        # old_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
        # # 本地测试
        # old_client.client = pymongo.MongoClient(host="localhost", port=27017, serverSelectionTimeoutMS=60,
        #                                         connectTimeoutMS=60, connect=False)
        # old_client.mongo_db = "spider_data_old"
        # db_old, collection_list_old = old_client.client_to_mongodb()
        # collection_old = db_old["ORGANIZE_FINASSIST"]

        # 获取地区编码
        province_list, city_list, area_list, dir_area_list = (GenericScript(
            entity_code=None,
            entity_type="ORGANIZE_FINASSIST").area_from_mysql())

        # # 删除表
        # p_client.drop_table_phoenix(connection=connection)

        # # 创建表
        # # 网点表创建语句
        # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,'
        #        '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,'
        #        '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
        #        '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,'
        #        '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,'
        #        '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C". "DISTRICT_CODE_" varchar,'
        #        '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,'
        #        '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,'
        #        '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true')
        #
        # p_client.create_new_table_phoenix(connection=connection, sql=sql)

        # 遍历 ENTITY_CODE_ 列表
        # self.code_list = ["ABCORGANIZE"]
        for entity_code in self.code_list:
            status = False
            module_name = __import__(entity_code)
            self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))

            self.remove_id_list = []
            self.copy_mongo_data_list = []
            self.branch_code_list = []
            # find_id = ""
            if entity_code == "ECITICORGANIZE":
                find_id = "5c3f48479bb3df1d97d762e1"
            else:
                find_id = None
            try:
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)
            except pymongo.errors.ServerSelectionTimeoutError:
                sleep(1)
                mongo_data_list = self.get_data_from_mongo(
                    m_client=m_client,
                    collection=collection,
                    entity_code=entity_code,
                    data_id=find_id)

            # 清洗数据并插入 HBase
            if mongo_data_list:
                once_count = 0
                self.find_count = mongo_data_list.count()
                for data in mongo_data_list:
                    data_id = data["_id"]
                    # copy_data = {}
                    # self.remove_id_list.append(data_id)
                    try:
                        del data["_id"]
                        # copy_data = deepcopy(data)
                        # self.copy_mongo_data_list.append(copy_data)
                        re_data = module_name.data_shuffle(
                            data, province_list, city_list, area_list)
                        if not re_data:
                            self.bad_count += 1
                            continue
                    except Exception as e:
                        # except jpype._jexception.SQLExceptionPyRaisable as e:
                        # except org.apache.phoenix.exception.BatchUpdateExecution as e:
                        # self.remove_id_list.remove(data_id)
                        # self.copy_mongo_data_list.remove(copy_data)
                        self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                            data_id, e))
                        continue

                    print(data_id)

                    if isinstance(re_data, list):
                        for list_data in re_data:
                            area_data = ""
                            try:
                                # self.logger.info("_id {}".format(data_id))
                                area_data = self.shuffle_for_area(list_data)
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.exception(
                                    "_id:{} 获取经纬度失败, {}".format(data_id, e))
                                continue
                            # except ValueError:
                            #     pass
                            # phoenix_HBase 插入数据
                            if area_data:
                                try:
                                    # print(area_data)
                                    success_count = p_client.upsert_to_phoenix_by_one(
                                        connection=connection, data=area_data)
                                    once_count += success_count
                                    self.success_count += success_count
                                    # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                                except Exception as e:
                                    # self.remove_id_list.remove(data_id)
                                    # self.copy_mongo_data_list.remove(copy_data)
                                    self.logger.exception(
                                        "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                            data_id, e))
                                    continue
                    elif isinstance(re_data, dict):
                        area_data = ""
                        try:
                            area_data = self.shuffle_for_area(re_data)
                        except urllib3.exceptions.NewConnectionError as e:
                            # self.remove_id_list.remove(data_id)
                            # self.copy_mongo_data_list.remove(copy_data)
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                        except Exception as e:
                            # self.remove_id_list.remove(data_id)
                            # self.copy_mongo_data_list.remove(copy_data)
                            self.logger.exception("_id: {}获取经纬度失败, {}".format(
                                data_id, e))
                            continue
                        # phoenix_HBase 插入数据
                        if area_data:
                            try:
                                # print(area_data)
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=area_data)
                                once_count += success_count
                                self.success_count += success_count
                                # self.logger.info("HBase 插入成功, 成功条数 {} 条".format(success_count))
                            except Exception as e:
                                # self.remove_id_list.remove(data_id)
                                # self.copy_mongo_data_list.remove(copy_data)
                                self.logger.exception(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                    if self.success_count % 100 == 0:
                        self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                            self.success_count))
                    # 添加 {d:1}
                    # if self.success_count % 50 == 0:
                    #     update_count = m_client.update_to_mongodb(collection=collection,
                    #                                               data_id=self.remove_id_list,
                    #                                               data_dict={"d": 1})
                    #     self.remove_id_list = []
                    #     self.remove_count += update_count
                    #     self.logger.info("MongoDB 更新成功")

                mongo_data_list.close()

                # 添加 {d:1}
                # if self.remove_id_list:
                #     update_count = m_client.update_to_mongodb(collection=collection,
                #                                               data_id=self.remove_id_list,
                #                                               data_dict={"d": 1})
                #     self.remove_id_list = []
                #     self.remove_count += update_count
                #     self.logger.info("MongoDB 更新成功")
                if once_count > 0:
                    status = True
                    self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
            # 删除数据
            # if status:
            # delete_count = self.delete_data_from_mongo(m_client=m_client, collection=collection,
            #                                            entity_code=entity_code,
            #                                            remove_id_list=self.remove_id_list)
            # self.remove_count += delete_count
            # self.logger.info("MongoDB 删除成功")
            # else:
            #     self.logger.info("HBase 插入成功条数0条, 不执行删除")

            # # 将数据插入 spider_data_old 中
            # if status:
            #     try:
            #         old_client.mongo_db = "spider_data_old"
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except pymongo.errors.ServerSelectionTimeoutError as e:
            #         sleep(1)
            #         self.logger.info("MongoDB 连接失败, 正在重新连接 {}".format(e))
            #         insert_count = old_client.all_to_mongodb(collection=collection_old,
            #                                                  insert_list=self.copy_mongo_data_list)
            #         self.old_count += insert_count
            #         # self.logger.info("MongoDB 插入成功, 成功条数 {}".format(insert_count))
            #     except Exception as e:
            #         self.logger.info(e)

        # 关闭连接
        m_client.client_close()
        p_client.close_client_phoenix(connection=connection)
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()