# -*- coding: utf-8 -*-
""" GDSZ_SZS_TZJG_XMGS """
from database._mongodb import MongoClient
from tools.req_for_wordExcelZip import find_type


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: take the first two items that MongoClient.main()
    # returns, pass each through data_shuffle and print the result.
    client = MongoClient(entity_code="GDSZ_SZS_TZJG_XMGS",
                         mongo_collection="GOV_ZX_GDS")
    for record in client.main()[:2]:
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
""" GDSZ_HYS_FGW_XMXX"""
from database._mongodb import MongoClient
from tools.req_for_wordExcelZip import find_type


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="GDSZ_HYS_FGW_XMXX",
                         mongo_collection="GOV_ZX_GDS")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_ZYCJ_ZYW_HG",
                         mongo_collection="ZX_CJXW_ZYCJ")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_HY_SDW_ZZXW",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="JRCP_LCCP_ZGGSYH_APP_LCLB",
                         mongo_collection="JRCP_LCCP")
    for record in client.main():
        print(data_shuffle(record))
def run():
    """Print each ZTB_JSDFNCSYYH item from MongoClient.main() after data_shuffle."""
    client = MongoClient(entity_code="ZTB_JSDFNCSYYH",
                         mongo_collection="CommonBidding")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_HY_ZGNYE_ZJGD",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
# if not any([num in item for num in ['供应链', '产业链', '应收账款', '应收款']]): if not any([num in item for num in ['项目', '公告']]): return False else: # if not any([num in item for num in ['微信', '营销', 'APP', 'app', 'App']]): # return False # else: # return True return True if __name__ == '__main__': import pandas as pd main_mongo = MongoClient(entity_code="", mongo_collection="CommonBidding") db, collection_list = main_mongo.client_to_mongodb() collection = main_mongo.get_check_collection( db=db, collection_list=collection_list) # mon_list = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'] # mon_list = ['12', '01'] # mon_list = ['08', '09', '10', '11', '12'] try: data_list = collection.find( { '$and': [ { 'NOTICE_TIME_': { '$gte': '2019-01-01' }
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_HY_ZGHGW_NHHYDT",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="WD_JT_GJ_GJWZD_NB",
                         mongo_collection="WD_JT_GJ")
    for record in client.main():
        print(data_shuffle(record))
def run(self):
    """Clean every entity's MongoDB records and upsert them into Phoenix/HBase.

    For each code in ``self.code_list`` the module of the same name is
    imported, its ``data_shuffle`` is applied to every fetched record, the
    result is passed through ``self.shuffle_for_area`` and then written with
    ``upsert_to_phoenix_by_one``.  Run totals are logged at the end and the
    logger's handlers are cleared.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this method -- confirm it against the
    original file before relying on it.
    """
    # Create the Phoenix client object.
    p_client = PhoenixHbase(table_name="ORGANIZE_FINASSIST")
    p_client.verify_list = self.verify_list
    # Connect to Phoenix.
    connection = p_client.connect_to_phoenix()
    # Create the MongoDB client used for querying.
    m_client = MongoClient(mongo_collection="ORGANIZE_FINASSIST")
    db, collection_list = m_client.client_to_mongodb()
    collection = m_client.get_check_collection(
        db=db, collection_list=collection_list)

    # Disabled: a second MongoClient ("old_client") pointed at a local
    # "spider_data_old" database (localhost:27017) used to archive the
    # processed records; see the disabled archive step further down.

    # Load the area code tables.  dir_area_list is unpacked but not used by
    # any live statement in this method.
    province_list, city_list, area_list, dir_area_list = (GenericScript(
        entity_code=None, entity_type="ORGANIZE_FINASSIST").area_from_mysql())

    # Disabled: drop and re-create the Phoenix table.
    # p_client.drop_table_phoenix(connection=connection)
    # sql = ('create table "ORGANIZE_FINASSIST" ("ID_" varchar primary key, "C"."BANK_NAME_" varchar,'
    #        '"C"."BANK_CODE_" varchar, "C"."NAME_" varchar,'
    #        '"C"."CODE_" varchar, "C"."ENTITY_NAME_" varchar, "C"."ENTITY_CODE_" varchar,'
    #        '"C"."AREA_CODE_" varchar, "C"."UNIT_CODE_" varchar, "C"."ADDR_" varchar,'
    #        '"C"."PROVINCE_NAME_" varchar, "C"."PROVINCE_CODE_" varchar, "C"."CITY_" varchar,'
    #        '"C"."CITY_CODE_" varchar, "C"."DISTRICT_NAME_" varchar, "C"."DISTRICT_CODE_" varchar,'
    #        '"C"."LAT_" varchar, "C"."LNG_" varchar, "C"."CREATE_TIME_" varchar, "C"."DEALTIME_" varchar,'
    #        '"C"."URL_" varchar, "C"."TEL_" varchar, "C"."BUSINESS_HOURS_" varchar, "C"."STATUS_" varchar,'
    #        '"C"."IMPORTANCE_" varchar) IMMUTABLE_ROWS = true')
    # p_client.create_new_table_phoenix(connection=connection, sql=sql)

    # Iterate over the ENTITY_CODE_ list.
    # self.code_list = ["ABCORGANIZE"]
    for entity_code in self.code_list:
        # status is only read by the disabled delete/archive steps below.
        status = False
        # Each entity has a module named after its code that provides
        # data_shuffle(data, province_list, city_list, area_list).
        module_name = __import__(entity_code)
        self.logger.info("开始进行 ENTITY_CODE_ {}".format(entity_code))
        self.remove_id_list = []
        self.copy_mongo_data_list = []
        self.branch_code_list = []
        # find_id = ""
        # ECITICORGANIZE is queried with a fixed data_id; every other
        # entity passes None.
        if entity_code == "ECITICORGANIZE":
            find_id = "5c3f48479bb3df1d97d762e1"
        else:
            find_id = None
        try:
            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code, data_id=find_id)
        except pymongo.errors.ServerSelectionTimeoutError:
            # Retry exactly once after a one-second pause; a second
            # failure propagates to the caller.
            sleep(1)
            mongo_data_list = self.get_data_from_mongo(
                m_client=m_client, collection=collection,
                entity_code=entity_code, data_id=find_id)

        # Clean the data and insert it into HBase.
        if mongo_data_list:
            once_count = 0
            # NOTE(review): find_count is overwritten for every entity, so
            # the final summary reports only the last entity's count --
            # confirm whether accumulation was intended.
            self.find_count = mongo_data_list.count()
            for data in mongo_data_list:
                data_id = data["_id"]
                # Disabled bookkeeping: remove_id_list / copy_mongo_data_list
                # used to track each record for later update and archive.
                try:
                    del data["_id"]
                    re_data = module_name.data_shuffle(
                        data, province_list, city_list, area_list)
                    if not re_data:
                        # An empty result counts as a bad record.
                        self.bad_count += 1
                        continue
                except Exception as e:
                    self.logger.exception("清洗错误,错误 _id 为{}, {}".format(
                        data_id, e))
                    continue
                print(data_id)

                # data_shuffle may return several rows (list) or one (dict).
                if isinstance(re_data, list):
                    for list_data in re_data:
                        area_data = ""
                        try:
                            area_data = self.shuffle_for_area(list_data)
                        except Exception as e:
                            self.logger.exception(
                                "_id:{} 获取经纬度失败, {}".format(data_id, e))
                            continue
                        # Insert into Phoenix/HBase.
                        if area_data:
                            try:
                                success_count = p_client.upsert_to_phoenix_by_one(
                                    connection=connection, data=area_data)
                                once_count += success_count
                                self.success_count += success_count
                            except Exception as e:
                                self.logger.exception(
                                    "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                        data_id, e))
                                continue
                elif isinstance(re_data, dict):
                    area_data = ""
                    try:
                        area_data = self.shuffle_for_area(re_data)
                    except urllib3.exceptions.NewConnectionError as e:
                        # No continue here: area_data stays "" so the
                        # upsert below is skipped anyway.
                        self.logger.exception("_id: {}获取经纬度失败, {}".format(
                            data_id, e))
                    except Exception as e:
                        self.logger.exception("_id: {}获取经纬度失败, {}".format(
                            data_id, e))
                        continue
                    # Insert into Phoenix/HBase.
                    if area_data:
                        try:
                            success_count = p_client.upsert_to_phoenix_by_one(
                                connection=connection, data=area_data)
                            once_count += success_count
                            self.success_count += success_count
                        except Exception as e:
                            self.logger.exception(
                                "HBase 插入 _id 为 {} 的数据失败, {}".format(
                                    data_id, e))
                            continue

                # Progress log; also fires while success_count is still 0.
                if self.success_count % 100 == 0:
                    self.logger.info("HBase 插入成功, 成功条数 {} 条".format(
                        self.success_count))
                # Disabled: every 50 successes, mark the processed ids with
                # {"d": 1} via m_client.update_to_mongodb and add the result
                # to self.remove_count.
            mongo_data_list.close()
            # Disabled: flush any remaining remove_id_list with {"d": 1}.
            if once_count > 0:
                status = True
                self.logger.info("HBase 插入成功, 成功条数 {}".format(once_count))
            else:
                continue
        # Disabled: when status is true, delete the processed records via
        # self.delete_data_from_mongo and archive copy_mongo_data_list into
        # "spider_data_old" via old_client.all_to_mongodb (retrying once on
        # ServerSelectionTimeoutError).

    # Close the connections.
    m_client.client_close()
    p_client.close_client_phoenix(connection=connection)
    self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
    self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
    self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
    self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
    self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
    self.logger.handlers.clear()
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_HYBG_ITJZ_YHBG",
                         mongo_collection="ZX_HYBG")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_GJJRJG_ZHRMGHGSWJ_ZSHYFX",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
from crm_scripts import GenericScript
from database._mongodb import MongoClient
from tools.web_api_of_baidu import get_lat_lng, get_area


def data_shuffle(data, ):
    """Build the output dict for one record by renaming its source keys.

    Every value is read with ``data.get`` so a missing source key yields
    ``None`` in the result.

    :param data: mapping holding the raw record.
    :return: a new dict; *data* itself is not modified.
    """
    # A missing CONTENT_ used to raise TypeError when sliced; it now maps to
    # None like every other absent field.
    content = data.get('CONTENT_')
    if content is not None:
        content = content[:501]  # keep at most the first 501 characters

    return {
        # NOTE(review): 'ACTIME_NAME_' looks like a typo for 'ACTIVE_NAME_';
        # left as is because consumers may depend on the key -- confirm.
        'ACTIME_NAME_': data.get('TITLE_'),
        'RELEASE_DATE_': data.get('PUBLISH_TIME_'),
        'ACTIVE_DESC_HTML_': data.get('HTML_'),
        'ACTIVE_DESC_TEXT_': content,
        'DATA_SOURCE_NAME_': data.get('SOURCE_NAME_'),
        'DATA_SOURCE_URL_': data.get('URL_'),
        'AMOUNT_OF_READING_': data.get('READ_NUM_'),
        # These two look up the empty-string key, so they are None unless
        # the record really has a '' key (presumably placeholders).
        'ACTIVE_KEYWORDS_': data.get(''),
        'ACTIVE_OBJECT_': data.get(''),
        'BANK_NAME_': data.get('BANK_NAME_'),
    }


if __name__ == '__main__':
    # Manual smoke run: shuffle and print the first two items returned by
    # MongoClient.main().
    main_mongo = MongoClient(entity_code="CRMJPFX_YXHD_SHYH",
                             mongo_collection="CRMJPFX_YXHD")
    data_list = main_mongo.main()
    for data in data_list[:2]:
        re_data = data_shuffle(data=data, )
        print(re_data)
class Entrust(object):
    """Move trust-product records from MongoDB into the Phoenix "ENTRUST" table.

    NOTE(review): the block nesting in this class was reconstructed from a
    whitespace-collapsed copy -- confirm it against the original file.
    """

    def __init__(self):
        """Open the MongoDB and Phoenix connections and reset the counters."""
        # Create the MongoDB client object.
        self.m_client = MongoClient(mongo_collection="JSENTRUST_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)
        # Create the Phoenix client object.
        self.p_client = PhoenixHbase(table_name="ENTRUST")
        # Connect to Phoenix.
        self.connection = self.p_client.connect_to_phoenix()
        self.logger = Logger().logger
        # Run statistics; only success_count and bad_count are changed by
        # the live code in run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        # _id of the record currently being processed.
        self.data_id = ""

    def data_shuffle(self, data):
        """Map one raw record onto the ENTRUST column names.

        The common columns are always filled; the remaining ones depend on
        ``data["ENTITY_CODE_"]``.  Columns a branch does not assign are
        absent from the result, and an entity code other than the four
        handled below yields only the common columns.

        :param data: raw record; every key read here must be present,
            otherwise KeyError propagates.
        :return: dict of column name to value.
        """
        re_data = dict()
        # HBase row key: "<ENTITY_CODE_>_<md5 hex digest of NAME_>".
        hash_m = hashlib.md5()
        hash_m.update(data["NAME_"].encode("utf-8"))
        hash_title = hash_m.hexdigest()
        row_key = str(data["ENTITY_CODE_"]) + "_" + str(hash_title)
        re_data["ID_"] = row_key
        re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
        re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
        re_data["CREATE_TIME_"] = data["DATETIME_"]
        re_data["STATUS_"] = "1"
        re_data["DEALTIME_"] = data["DEALTIME_"]
        re_data["URL_"] = data["URL_"]

        if data["ENTITY_CODE_"] == "CHINATRC":
            # PUB_DATE_ is a string that evaluates to a mapping with a
            # "time" entry.
            # NOTE(review): eval() runs arbitrary expressions from stored
            # data; consider ast.literal_eval if the value is a plain literal.
            pub_date = eval(data["PUB_DATE_"])
            # Drop the last three characters (presumably milliseconds ->
            # seconds; confirm) before handing the number to arrow.
            date = str(pub_date["time"])[:-3]
            t = arrow.get(int(date))
            # First ten characters of the rendered timestamp (presumably
            # YYYY-MM-DD).
            publish_date = str(t)[:10]
            period_code = publish_date.replace("-", "")
            re_data["PERIOD_CODE_"] = period_code
            re_data["CODE_"] = data["CODE_"]
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["FUNCTION_"] = data["FUNCTION_"]
            # PRO_START_ is converted the same way as PUB_DATE_ above
            # (same eval caveat applies).
            pro_date = eval(data["PRO_START_"])
            pro_date = str(pro_date["time"])[:-3]
            p_t = arrow.get(int(pro_date))
            product_date = str(p_t)[:10]
            re_data["PRO_START_"] = product_date
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["RUN_MODE_"] = data["RUN_MODE_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = publish_date
        elif data["ENTITY_CODE_"] == "TRUSTHEXUN":
            # PUB_DATE_ is used as-is; PERIOD_CODE_ is the same value
            # without dashes.
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"].replace(
                "至月", "")
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["CURRENCY_"] = data["CURRENCY_"]
            re_data["MANAGE_TYPE_"] = data["MANAGE_TYPE_"]
            re_data["SALE_TARGET_"] = data["SALE_TARGET_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["RESERVE_INFO_"] = data["RESERVE_INFO_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "YANGLEE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["INVEST_PERIOD_"] = data["INVEST_PERIOD_"]
            re_data["INDUSTRY_"] = data["INDUSTRY_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            # The source's STATUS_ becomes ENTRUST_STATUS_; the output
            # STATUS_ column stays the constant "1" set above.
            re_data["ENTRUST_STATUS_"] = data["STATUS_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["ISSUER_AREA_"] = data["ISSUER_AREA_"]
            re_data["TRUSTEESHIP_BANK_"] = data["TRUSTEESHIP_BANK_"]
            re_data["OTHER_INFO_"] = data["OTHER_INFO_"]
        elif data["ENTITY_CODE_"] == "TRUSTONE":
            re_data["PERIOD_CODE_"] = data["PUB_DATE_"].replace("-", "")
            re_data["NAME_"] = data["NAME_"]
            re_data["ISSUER_"] = data["ISSUER_"]
            re_data["PUB_DATE_"] = data["PUB_DATE_"]
            re_data["SCALE_"] = data["SCALE_"]
            re_data["YIELD_RATE_"] = data["YIELD_RATE_"]
            re_data["START_FUNDS_"] = data["START_FUNDS_"]
            re_data["INVEST_AREA_"] = data["INVEST_AREA_"]
            re_data["TERM_TYPE_"] = data["TERM_TYPE_"]
            re_data["INVEST_DIRECTION_"] = data["INVEST_DIRECTION_"]
            re_data["INVEST_MODE_"] = data["INVEST_MODE_"]
            re_data["PROFIT_TYPE_"] = data["PROFIT_TYPE_"]
            # Remove simple opening/closing tags such as <p> or </span>.
            re_data["RESERVE_INFO_"] = re.sub(r"</?\w*>", "",
                                              data["RESERVE_INFO_"])
        return re_data

    def run(self):
        """Read the records, clean each with data_shuffle and upsert it.

        Totals are logged at the end and the logger's handlers are cleared.
        """
        # Disabled: drop and re-create the Phoenix table.
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # table_sql = ('create table "ENTRUST" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."CREATE_TIME_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."DEALTIME_" varchar, "C"."URL_" varchar, "C"."AREA_CODE_" varchar, "C"."FUNCTION_" varchar,'
        #              '"C"."BANK_CODE_" varchar, "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar,'
        #              '"C"."PERIOD_CODE_" varchar, "C"."REMARK_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."CODE_" varchar, "C"."NAME_" varchar, "C"."ISSUER_" varchar, "C"."PRO_START_" varchar,'
        #              '"C"."INVEST_PERIOD_" varchar,"C"."RUN_MODE_" varchar, "C"."INDUSTRY_" varchar,'
        #              '"C"."PUB_DATE_" varchar, "C"."SCALE_" varchar, "C"."MONTH_" varchar, "C"."YIELD_RATE_" varchar,'
        #              '"C"."START_FUNDS_" varchar, "C"."PURPOSE_" varchar, "C"."ESTAB_ANNOUNCEMENT_" varchar,'
        #              '"C"."ENTRUST_STATUS_" varchar, "C"."DISTRIBU_MODE_" varchar, "C"."INVEST_AREA_" varchar,'
        #              '"C"."TERM_TYPE_" varchar, "C"."INVEST_DIRECTION_" varchar, "C"."INVEST_MODE_" varchar,'
        #              '"C"."CURRENCY_" varchar, "C"."MANAGE_TYPE_" varchar, "C"."SALE_TARGET_" varchar,'
        #              '"C"."PROFIT_TYPE_" varchar, "C"."ISSUER_AREA_" varchar, "C"."RESERVE_INFO_" varchar,'
        #              '"C"."TRUSTEESHIP_BANK_" varchar, "C"."OTHER_INFO_" varchar) IMMUTABLE_ROWS = true')
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        # The query is issued with a hard-coded data_id.
        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection, data_id="5c67307d9bb3df76b4229f79")
        # range(count + 100) only bounds the number of iterations; the loop
        # normally ends through StopIteration below.
        for i in range(mongo_data_list.count() + 100):
            try:
                data = mongo_data_list.__next__()
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                # Single retry; a second timeout or StopIteration raised
                # here is not caught.
                data = mongo_data_list.__next__()

            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))
            print(data["_id"])
            # todo remove and upsert data from mongo
            # Clean the record.  The try/except that used to log and skip
            # cleaning failures is disabled, so any exception raised by
            # data_shuffle aborts the run.
            re_data = self.data_shuffle(data=data)

            if re_data:
                # Upsert the cleaned row into HBase.
                try:
                    success_count = self.p_client.upsert_to_phoenix_by_one(
                        connection=self.connection, data=re_data)
                except jaydebeapi.DatabaseError as e:
                    self.logger.info("错误 id: {}, 错误信息 {}".format(
                        self.data_id, e))
                    continue
                # Disabled: mark the source record with {"d": 1} via
                # self.m_client.update_to_mongodb and count it in
                # self.remove_count.
                if success_count > 0:
                    # status is assigned but never read.
                    status = True
                    self.success_count += success_count
                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
            # NOTE(review): this else is paired with `if re_data` here, but
            # in the collapsed source it could equally belong to
            # `if success_count > 0` -- confirm.
            else:
                self.bad_count += 1
                continue

        mongo_data_list.close()
        # find_count, remove_count and old_count are never updated by the
        # live code above, so these three lines always report 0.
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_GJJRJG_ZGZXQYXXW_XXJL",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_HY_QCYJW_HGJJ",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""21st Century Economic Network - Business section, ZX_CJXW_ZYCJ_21SJJJW_SHY"""
import re

from database._mongodb import MongoClient


def data_shuffle(data):
    """Strip the copyright paragraph and the back-to-home link from HTML_.

    The record is modified in place and returned; a record without an
    "HTML_" key raises KeyError.
    """
    html = data["HTML_"]
    # Applied in this order: first the copyright <p>, then the home link.
    for pattern in (r"<p class=\"copyright\".*?</p>",
                    r"<a.*?返回21经济首页.*?</a>"):
        html = re.sub(pattern, "", html)
    data["HTML_"] = html
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_ZYCJ_21SJJJW_SHY",
                         mongo_collection="ZX_CJXW_ZYCJ")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_GJJRJG_ZGYJH_YJH",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return *data* unchanged; this entity currently needs no cleaning."""
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle and print the result.
    client = MongoClient(entity_code="ZX_CJXW_HY_ZGDZQYXH_HYTJ",
                         mongo_collection="ZX_CJXW_HY")
    for record in client.main():
        print(data_shuffle(record))
import os

from database._mongodb import MongoClient


def data_shuffle(data):
    """Normalise the product name and attach insurer and local PDF fields.

    The record is modified in place and returned.  "PRO_NAME_" must exist
    after the optional copy from "PRO_NAME", otherwise KeyError is raised.
    """
    insurers = ["光大永明人寿保险公司", "太平人寿保险公司", "中信保诚人寿保险公司"]

    # A non-empty legacy "PRO_NAME" value overrides "PRO_NAME_".
    if data.get("PRO_NAME"):
        data["PRO_NAME_"] = data["PRO_NAME"]

    # The insurer is recognised by the first two characters of the name.
    name_prefix = data["PRO_NAME_"][:2]
    for insurer in insurers:
        if insurer[:2] == name_prefix:
            data["COM_NAME_"] = insurer

    # The clause PDF is expected in a folder next to this script.
    script_dir = os.path.abspath(os.path.dirname(__file__))
    pdf_stem = data["PRO_NAME_"].replace("产品计划", "")
    data["LOCAL_PDF_PATH_"] = (script_dir + "/渤海代理保险合同条款/" + pdf_stem
                               + ".pdf")
    data["LOCAL_PDF_NAME_"] = data["PRO_NAME_"] + "条款"

    # Drop the PDF_ field only when it holds a truthy value.
    if data.get("PDF_"):
        data.pop("PDF_")
    return data


if __name__ == '__main__':
    # Manual smoke run: pass every item that MongoClient.main() returns
    # through data_shuffle (printing is disabled).
    client = MongoClient(entity_code="JRCP_BX_BHYH_GW_ALL",
                         mongo_collection="JRCP_BX")
    for record in client.main():
        data_shuffle(record)
"""中国民生银行 理财产品 JRCP_LCCP_ZGMSYH_GW_ALL""" from database._mongodb import MongoClient def data_shuffle(data): if "PRO_NAME_" not in data: return # 风险等级 if "SOURCE_RISK_LEVEL_CODE_" in data: if data["SOURCE_RISK_LEVEL_"] == "1": data["RISK_LEVEL_CODE_"] = "R1" elif data["SOURCE_RISK_LEVEL_"] == "2": data["RISK_LEVEL_CODE_"] = "R2" elif data["SOURCE_RISK_LEVEL_"] == "3": data["RISK_LEVEL_CODE_"] = "R3" elif data["SOURCE_RISK_LEVEL_"] == "4": data["RISK_LEVEL_CODE_"] = "R4" elif data["SOURCE_RISK_LEVEL_"] == "5": data["RISK_LEVEL_CODE_"] = "R5" if "START_FUNDS_" in data: data["START_FUNDS_"] = data["START_FUNDS_"].replace(",", "") return data if __name__ == '__main__': main_mongo = MongoClient(entity_code="JRCP_LCCP_ZGMSYH_GW_ALL", mongo_collection="JRCP_LCCP") data_list = main_mongo.main() for data in data_list: re_data = data_shuffle(data) print(re_data)
class JsInsuranceCcbData(object):
    """Copy bank insurance-product records from MongoDB into Phoenix/HBase.

    ``data_shuffle`` turns one raw MongoDB record into one flat row dict or a
    list of row dicts; ``run`` walks the whole collection and upserts every
    row through the Phoenix client.
    """

    def __init__(self):
        # MongoDB client and the collection the raw records are read from.
        self.m_client = MongoClient(mongo_collection="JSINSURANCE_CCBDATA")
        db, collection_list = self.m_client.client_to_mongodb()
        self.collection = self.m_client.get_check_collection(
            db=db, collection_list=collection_list)

        # MySQL is used once, to load the TYPE dictionary rows
        # (each row is read through its ITEM_LABEL_ / ITEM_VALUE_ keys).
        mysql_config = {
            "host": MYSQL_HOST_25,
            "port": MYSQL_PORT_25,
            "database": MYSQL_DATABASE_25,
            "user": MYSQL_USER_25,
            "password": MYSQL_PASSWORD_25,
            "table": MYSQL_TABLE_25
        }
        mysql_client = MysqlClient(**mysql_config)
        mysql_connection = mysql_client.client_to_mysql()
        self.type = mysql_client.search_area_code(
            sql=
            "select DICT_CODE_,ITEM_LABEL_,ITEM_VALUE_ from sys_dict_item where DICT_CODE_=\'TYPE\'",
            connection=mysql_connection)
        mysql_client.close_client(connection=mysql_connection)

        # Phoenix client and its connection (target table "INSURANCE").
        self.p_client = PhoenixHbase(table_name="INSURANCE")
        self.connection = self.p_client.connect_to_phoenix()

        self.logger = Logger().logger
        # Counters reported at the end of run().
        self.find_count = 0
        self.success_count = 0
        self.remove_count = 0
        self.old_count = 0
        self.bad_count = 0
        self.error_count = 0
        self.data_id = ""
        self.a = list()

    @staticmethod
    def _row_key(entity_code, name):
        """Return the row id ``<entity code>_<md5 hex digest of name>``."""
        digest = hashlib.md5(name.encode("utf-8")).hexdigest()
        return str(entity_code) + "_" + str(digest)

    def _match_types(self, text):
        """Return ``(labels, codes)`` for every TYPE entry found in *text*.

        An entry matches when its label without the last character is a
        substring of *text*.  Several matches are joined with ``"|"``; no
        match gives two empty strings.
        """
        hits = [item for item in self.type
                if item["ITEM_LABEL_"][:-1] in text]
        labels = "|".join(item["ITEM_LABEL_"] for item in hits)
        codes = "|".join(item["ITEM_VALUE_"] for item in hits)
        return labels, codes

    @staticmethod
    def _copy_present(source, target, keys):
        """Copy every key of *keys* that exists in *source* into *target*."""
        for key in keys:
            if key in source:
                target[key] = source[key]

    def _shuffle_bjb(self, data):
        """Split a BJBINSURANCE ``CONTET_`` blob into one row per product."""
        # NOTE(review): presumably "主险2:" belongs to the preceding field and
        # the extra separator is a scraping artefact - confirm with the crawler.
        data["CONTET_"] = data["CONTET_"].replace("|主险2:", "主险2:")
        fields = data["CONTET_"].split("|")

        # Positions of company names.  enumerate() is used instead of
        # list.index() so that a company name occurring twice still yields
        # two distinct sections.
        company_positions = [
            pos for pos, field in enumerate(fields) if field[-2:] == "公司"
        ]

        rows = list()
        for order, start in enumerate(company_positions):
            if order == len(company_positions) - 1:
                # The last section stops at the final field of the blob.
                stop = len(fields) - 1
            else:
                stop = company_positions[order + 1]

            j = start + 1
            # Safety cap: at most 100 products per company section.
            for _ in range(100):
                if j == stop:
                    break
                # Each product occupies six consecutive fields; only the
                # name (+0), type text (+1) and pay method (+3) are used.
                product_name = fields[j]
                type_label, type_code = self._match_types(fields[j + 1])
                rows.append({
                    "ID_": self._row_key(data["ENTITY_CODE_"], product_name),
                    "ENTITY_CODE_": data["ENTITY_CODE_"],
                    "ENTITY_NAME_": data["ENTITY_NAME_"].replace("模板", "产品"),
                    "BANK_CODE_": "BJB",
                    "BANK_NAME_": "北京银行",
                    "PERIOD_CODE_": data["DATETIME_"][:10].replace("-", ""),
                    "URL_": data["URL_"],
                    "PRODUCT_NAME_": product_name,
                    "TYPE_": type_label,
                    "TYPE_CODE_": type_code,
                    "PAY_METHOD_": fields[j + 3],
                    "COM_NAME_": fields[start],
                    "DEALTIME_": data["DEALTIME_"],
                    "CREATE_TIME_": data["DATETIME_"],
                    "STATUS_": "1",
                })
                j += 6
        return rows

    def _shuffle_cib(self, data):
        """Build one row per plan name found in a CIBINSURANCE product name."""
        rows = list()
        for name in re.findall(r".*?计划", data["PRODUCT_NAME_"]):
            type_label, type_code = self._match_types(name)
            rows.append({
                "ID_": self._row_key(data["ENTITY_CODE_"], name),
                "PRODUCT_NAME_": name,
                "ENTITY_CODE_": data["ENTITY_CODE_"],
                "ENTITY_NAME_": data["ENTITY_NAME_"],
                "BANK_CODE_": "CIB",
                "BANK_NAME_": "兴业银行",
                "PERIOD_CODE_": data["DATETIME_"][:10].replace("-", ""),
                "URL_": data["URL_"],
                "DEALTIME_": data["DEALTIME_"],
                "CREATE_TIME_": data["DATETIME_"],
                "STATUS_": "1",
                "TYPE_": type_label,
                "TYPE_CODE_": type_code,
            })
        return rows

    def _shuffle_insurance_name(self, data):
        """Build the row for a record keyed by ``INSURANCE_NAME_``."""
        row = {
            "ID_": self._row_key(data["ENTITY_CODE_"],
                                 data["INSURANCE_NAME_"]),
            "ENTITY_CODE_": data["ENTITY_CODE_"],
            "ENTITY_NAME_": data["ENTITY_NAME_"],
            "BANK_CODE_": data["ENTITY_CODE_"].replace("INSURANCE", ""),
            "BANK_NAME_": data["ENTITY_NAME_"].replace("保险产品", ""),
            "PRODUCT_NAME_": data["INSURANCE_NAME_"],
        }

        # Prefer INSURANCE_AGE_, fall back to AGE_ (the old code indexed
        # INSURANCE_AGE_ even when only AGE_ existed and raised KeyError).
        if "INSURANCE_AGE_" in data:
            row["AGE_"] = data["INSURANCE_AGE_"]
        elif "AGE_" in data:
            row["AGE_"] = data["AGE_"]

        if "TYPE_" not in data:
            # No explicit type: look for a type label in the entity name.
            row["TYPE_"], row["TYPE_CODE_"] = self._match_types(
                data["ENTITY_NAME_"])
        elif data["TYPE_"] == "财险":
            row["TYPE_"] = "财产险"
            row["TYPE_CODE_"] = "PROPERTY_INSURANCE"
        else:
            row["TYPE_"], row["TYPE_CODE_"] = self._match_types(data["TYPE_"])

        if "INSURANCE_DETAIL_" in data:
            row["PRODUCT_DETAIL_"] = data["INSURANCE_DETAIL_"]
        if "COMPANY_NAME_" in data:
            row["COM_NAME_"] = data["COMPANY_NAME_"]
        if "LIMIT_NUMBER_" in data:
            row["BUY_LIMIT_"] = data["LIMIT_NUMBER_"]
        row["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        if "CONTENT_" in data:
            row["CONTENT_"] = data["CONTENT_"]
        row["STATUS_"] = "1"
        row["CREATE_TIME_"] = data["DATETIME_"]
        row["URL_"] = data["URL_"]
        row["DEALTIME_"] = data["DEALTIME_"]
        return row

    def _shuffle_product_name(self, data):
        """Build the row for a record keyed by ``PRODUCT_NAME_``."""
        row = {
            "ID_": self._row_key(data["ENTITY_CODE_"], data["PRODUCT_NAME_"]),
            "ENTITY_CODE_": data["ENTITY_CODE_"],
            "ENTITY_NAME_": data["ENTITY_NAME_"],
            "PRODUCT_NAME_": data["PRODUCT_NAME_"],
        }
        self._copy_present(data, row, ("FEATURE_NAME_",))

        if "TYPE_" in data:
            if data["TYPE_"] == "财险":
                row["TYPE_"] = "财产险"
                row["TYPE_CODE_"] = "PROPERTY_INSURANCE"
            elif data["TYPE_"] == "100种疾病保障":
                row["TYPE_"] = "健康险"
                row["TYPE_CODE_"] = "HEALTH_INSURANCE"
            else:
                row["TYPE_"], row["TYPE_CODE_"] = self._match_types(
                    data["TYPE_"])

        self._copy_present(data, row, (
            "POLICY_DUTY_", "PRODUCT_CASE_", "BUY_LIMIT_", "ENSURE_PRICE_"))
        row["PERIOD_CODE_"] = data["DATETIME_"][:10].replace("-", "")
        self._copy_present(data, row, (
            "PRODUCT_PRICE_", "PRODUCT_ID_", "PRODUCT_CLAUSE_", "GENDER_",
            "AGE_", "COM_NAME_", "PAY_METHOD_", "PROBLEM_", "CLAIM_",
            "COMMENT_", "ENSURE_CONTENT_", "INSURE_INFO_", "RATE_INFO_",
            "SALE_SERVICE_"))
        row["STATUS_"] = "1"
        row["CREATE_TIME_"] = data["DATETIME_"]
        row["URL_"] = data["URL_"]
        row["DEALTIME_"] = data["DEALTIME_"]
        return row

    def data_shuffle(self, data):
        """Convert one raw MongoDB record into Phoenix row(s).

        :param data: raw record; must contain ``ENTITY_CODE_``
        :return: ``None`` for ``PAINSURANCE`` records and for records that
            carry neither ``INSURANCE_NAME_`` nor ``PRODUCT_NAME_``; a list
            of row dicts for ``BJBINSURANCE`` / ``CIBINSURANCE``; otherwise
            a single row dict
        :raises KeyError, IndexError: when a field the chosen branch relies
            on is missing (``run`` catches and logs these)
        """
        entity_code = data["ENTITY_CODE_"]
        if entity_code == "PAINSURANCE":
            return None
        if entity_code == "BJBINSURANCE":
            return self._shuffle_bjb(data)
        if entity_code == "CIBINSURANCE":
            return self._shuffle_cib(data)
        if "INSURANCE_NAME_" in data:
            return self._shuffle_insurance_name(data)
        if "PRODUCT_NAME_" in data:
            return self._shuffle_product_name(data)
        return None

    def run(self):
        """Read every MongoDB record, clean it and upsert the rows to Phoenix.

        Progress and a final summary are written to ``self.logger``.
        """
        # Table bootstrap, disabled; kept because it documents the schema.
        # # delete table
        # self.p_client.drop_table_phoenix(connection=self.connection)
        # # quit()
        #
        # # create table sql
        # table_sql = ('create table "INSURANCE" ("ID_" varchar primary key, "C"."ENTITY_CODE_" varchar,'
        #              '"C"."ENTITY_NAME_" varchar, "C"."AREA_CODE_" varchar,"C"."BANK_CODE_" varchar,'
        #              ' "C"."BANK_NAME_" varchar, "C"."UNIT_CODE_" varchar, "C"."PERIOD_CODE_" varchar, '
        #              '"C"."REMARK_" varchar, "C"."CREATE_TIME_" varchar, "C"."UPDATE_TIME_" varchar,'
        #              '"C"."TYPE_" varchar, "C"."URL_" varchar, "C"."DEALTIME_" varchar, "C".PRODUCT_CLAUSE_ varchar,'
        #              '"C"."SOURCE_" varchar, "C"."PRODUCT_NAME_" varchar, "C"."FEATURE_NAME_" varchar,'
        #              '"C"."POLICY_DUTY_" varchar, "C"."PRODUCT_CASE_" varchar, "C"."BUY_LIMIT_" varchar,'
        #              '"C"."ENSURE_PRICE_" varchar, "C"."PRODUCT_PRICE_" varchar, "C"."PRODUCT_ID_" varchar,'
        #              '"C"."GENDER_" varchar, "C"."AGE_" varchar, "C"."COM_NAME_" varchar, "C"."TYPE_CODE_" varchar,'
        #              '"C"."PAY_METHOD_" varchar, "C"."PRODUCT_DETAIL_" varchar, "C"."PROBLEM_" varchar,'
        #              '"C"."CLAIM_" varchar, "C"."COMMENT_" varchar, "C"."STATUS_" varchar,'
        #              '"C"."ENSURE_CONTENT_" varchar, "C"."INSURE_INFO_" varchar, "C"."RATE_INFO_" varchar,'
        #              '"C"."SALE_SERVICE_" varchar) IMMUTABLE_ROWS = true')
        #
        # # create table
        # self.p_client.create_new_table_phoenix(connection=self.connection, sql=table_sql)

        mongo_data_list = self.m_client.all_from_mongodb(
            collection=self.collection)

        for _ in range(mongo_data_list.count() + 100):
            try:
                data = next(mongo_data_list)
            except StopIteration:
                break
            except pymongo.errors.ServerSelectionTimeoutError as e:
                # One retry after a short pause.
                self.logger.info("MongoDB 超时, 正在重新连接, 错误信息 {}".format(e))
                time.sleep(3)
                data = next(mongo_data_list)

            self.find_count += 1
            self.data_id = data["_id"]
            if self.success_count % 100 == 0:
                self.logger.info("正在进行 _id 为 {} 的数据".format(self.data_id))

            # Clean the record; a failure is logged and the record skipped.
            try:
                re_data = self.data_shuffle(data=data)
            except Exception as e:
                self.logger.info("数据清洗失败 {}, id: {}".format(e, self.data_id))
                continue

            # Rows inserted for THIS record.  Reset on every iteration so a
            # previous record's count can never be added twice.
            inserted = 0
            if re_data:
                if isinstance(re_data, dict):
                    try:
                        inserted = self.p_client.upsert_to_phoenix_by_one(
                            connection=self.connection, data=re_data)
                    except jaydebeapi.DatabaseError as e:
                        self.logger.info("错误 id: {}, 错误信息 {}".format(
                            self.data_id, e))
                        continue
                elif isinstance(re_data, list):
                    for row in re_data:
                        try:
                            inserted += self.p_client.upsert_to_phoenix_by_one(
                                connection=self.connection, data=row)
                        except jaydebeapi.DatabaseError as e:
                            # A failing row does not stop the remaining rows.
                            self.logger.info("错误 id: {}, 错误信息 {}".format(
                                self.data_id, e))
                            continue

            # Marking processed records in MongoDB ({"d": 1}) is currently
            # disabled, which is why remove_count is never incremented.

            if inserted > 0:
                self.success_count += inserted
                if self.success_count % 10 == 0:
                    self.logger.info("HBase 插入成功 {} 条".format(
                        self.success_count))
            else:
                # The record produced no inserted row.
                self.bad_count += 1

        mongo_data_list.close()
        self.logger.info("本次共向 MongoDB 查取数据{}条".format(self.find_count))
        self.logger.info("本次共向 HBase 插入数据{}条".format(self.success_count))
        self.logger.info("本次共向 MongoDB 删除数据{}条".format(self.remove_count))
        self.logger.info("本次共向 MongoDB 插入数据{}条".format(self.old_count))
        self.logger.info("本次坏数据共 {} 条".format(self.bad_count))
        self.logger.handlers.clear()
def data_shuffle(data):
    """Extract the residence name from a rental-listing title.

    The record is modified in place: ``TITLE_`` receives the trimmed listing
    title and ``NAME_`` the residence name taken from it (or the trimmed
    title itself when no name can be extracted).

    :param data: record with a string ``NAME_`` field
    :return: the same (mutated) record
    """
    # Residence name: drop the listing-kind prefixes first.
    title = data['NAME_'].replace('整租·', '').replace('独栋·', '')

    # Cut the title one character before '室'
    # (presumably the room count, e.g. "2室" - TODO confirm).
    try:
        title = title[:title.index('室') - 1]
    except ValueError:
        # No '室' in the title: keep it as it is.
        pass
    data['NAME_'] = title

    if "·" in data["NAME_"] or "·" in data["NAME_"]:
        # "<two CJK chars><separator><name>|"
        house_name = re.findall(r"[\u4e00-\u9fa5]{2}[^\w]([\w()\-().,,]+)\|",
                                data["NAME_"])
    else:
        # "|<name>|"
        house_name = re.findall(r"\|([\w()\-().,,]+)\|", data["NAME_"])

    data["TITLE_"] = data["NAME_"]
    if house_name:
        data["NAME_"] = house_name[0]
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after cleaning.
    main_mongo = MongoClient(entity_code="WD_JZ_FJ_LIXQZL_FS",
                             mongo_collection="WD_JZ_FJ_FS")
    for record in main_mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Nanhai Rural Commercial Bank, official-site news - ZX_GWDT_NHYH_NHXW."""
import re

from database._mongodb import MongoClient


def data_shuffle(data):
    """Stamp the record with the bank's name and code.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same record with ``BANK_NAME_`` and ``BANK_CODE_`` set
    """
    # Disabled content/HTML clean-up, kept for reference:
    # if data["CONTENT_"]:
    #     data["CONTENT_"] = re.sub(r"/\*[^\u4e00-\u9fa5]+", "", data["CONTENT_"], count=1)
    # if data["HTML_"]:
    #     data["HTML_"] = re.sub("<p[^>]+align=\"center\">.*?</p>", "", data["HTML_"])
    data.update(BANK_NAME_="南海农商银行", BANK_CODE_="NRCB")
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after cleaning.
    mongo = MongoClient(entity_code="ZX_GWDT_NHYH_NHXW",
                        mongo_collection="ZX_GWDT")
    for record in mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Cleaning script for entity ZX_CJXW_GJJRJG_QGZXQYGZXT_LWJTGGS (collection ZX_CJXW_HY)."""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return the record unchanged; nothing is cleaned for this entity.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same object that was passed in
    """
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after (no-op) cleaning.
    mongo = MongoClient(entity_code="ZX_CJXW_GJJRJG_QGZXQYGZXT_LWJTGGS",
                        mongo_collection="ZX_CJXW_HY")
    for record in mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Cleaning script for entity ZX_CJXW_HY_DYSYW_SYDC (collection ZX_CJXW_HY)."""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Hand the record back exactly as received.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same object, unmodified
    """
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after (no-op) cleaning.
    mongo = MongoClient(entity_code="ZX_CJXW_HY_DYSYW_SYDC",
                        mongo_collection="ZX_CJXW_HY")
    for record in mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Entity ZX_ZCGG_SJS_ZJHL: records may arrive without CONTENT_."""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Make sure the record carries ``CONTENT_`` and ``HTML_``.

    A missing key is added with an empty string; an existing value is left
    untouched.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same (possibly extended) record
    """
    for required_key in ("CONTENT_", "HTML_"):
        data.setdefault(required_key, "")
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after cleaning.
    mongo = MongoClient(entity_code="ZX_ZCGG_SJS_ZJHL",
                        mongo_collection="ZX_ZCGG")
    for record in mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Cleaning script for entity ZX_CJXW_HY_ZGJCXXZW_YDYL (collection ZX_CJXW_HY)."""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Return the record unchanged; no cleaning step exists for this entity.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same object that was passed in
    """
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after (no-op) cleaning.
    mongo = MongoClient(entity_code="ZX_CJXW_HY_ZGJCXXZW_YDYL",
                        mongo_collection="ZX_CJXW_HY")
    for record in mongo.main():
        print(data_shuffle(record))
# -*- coding: utf-8 -*-
"""Entity ZX_GWDT_GSYH_GHKX: Industrial and Commercial Bank of China (ICBC)."""
from database._mongodb import MongoClient


def data_shuffle(data):
    """Stamp the record with the bank's name and code.

    :param data: one record as produced by ``MongoClient.main()``
    :return: the same record with ``BANK_NAME_`` and ``BANK_CODE_`` set
    """
    data.update(BANK_NAME_="中国工商银行", BANK_CODE_="ICBC")
    return data


if __name__ == '__main__':
    # Manual run: print every record of the entity after cleaning.
    mongo = MongoClient(entity_code="ZX_GWDT_GSYH_GHKX",
                        mongo_collection="ZX_GWDT")
    for record in mongo.main():
        print(data_shuffle(record))