def run_dim_cluster_main(syscode):
    """Entry point for dimension-cluster analysis of a single system."""
    conf = Config()
    logging.info("System {}: analysis started".format(syscode))
    input_helper, output_helper = dynamic_import(conf)
    input_conn, output_conn = get_input_output_conn(conf)
    tables_schedule = output_helper.get_all_fk_tables(output_conn, conf.output_schema)
    filter_fks = output_helper.get_all_fk_id_in_detail(output_conn, conf.output_schema)
    tables = [tup for tup in tables_schedule if tup[0] == syscode]
    logging.info("Number of tables to analyse: {}".format(len(tables)))
    run_analyse(conf, input_conn, output_conn, tables, filter_fks)
    logging.info("System {}: analysis finished".format(syscode))
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
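
# A hypothetical command-line wrapper (not part of the original project) showing how the
# entry point above could be invoked per system code; argparse and the "syscode" argument
# name are assumptions for illustration only.
def run_dim_cluster_cli():
    import argparse

    parser = argparse.ArgumentParser(description="Run dimension-cluster analysis for one system")
    parser.add_argument("syscode", help="system code, e.g. S01")
    args = parser.parse_args()
    run_dim_cluster_main(args.syscode)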
def pk_main(conf, sys_code, ori_table_code, etl_date, date_offset, alg):
    """
    Main entry point for primary-key analysis.
    :param conf: configuration object
    :param sys_code: system code
    :param ori_table_code: original table code
    :param etl_date: data date used by the functional-dependency analysis; used to verify candidate composite primary keys
    :param date_offset: date offset used by the functional-dependency analysis; used to verify candidate composite primary keys
    :param alg: functional-dependency analysis algorithm, used to verify candidate composite primary keys
    :return:
    """
    assert isinstance(conf, Config)
    logging.info("Table {}: primary-key analysis started".format(ori_table_code))
    input_conn, output_conn = get_input_output_conn(conf)
    analyse_table_pk(conf, input_conn, output_conn, sys_code, ori_table_code, etl_date, date_offset, alg)
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
    logging.info("Table {}: primary-key analysis finished".format(ori_table_code))
def pk_main_sql_processing(conf, sys_code, ori_table_code, alg, etl_date, date_offset, start_date_str):
    """
    Task launched in a single process; finds primary keys with SQL statements.
    :param conf: configuration object
    :param sys_code: system code
    :param ori_table_code: original table code
    :param alg: unload algorithm of the original table
    :param etl_date: unload date
    :param date_offset: date offset
    :param start_date_str: start time of the primary-key analysis
    :return:
    """
    input_conn, output_conn = get_input_output_conn(conf)
    logging.info("Table {}: primary-key analysis started; fewer than {} functional dependencies, so SQL lookup is used".format(
        ori_table_code, "10000"))
    analyse_table_pk_by_sql(conf, input_conn, output_conn, sys_code, ori_table_code,
                            etl_date, date_offset, alg, start_date_str)
    logging.info("Table {}: primary-key analysis finished".format(ori_table_code))
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
def analyse_table_fds_by_pandas(conf, sys_code, table_name, alg, etl_dates, start_date_str, fd_sample_size):
    """Analyse a subset of a table's functional dependencies with pandas."""
    logging.info("Table {}: analysing a subset of functional dependencies with pandas".format(table_name))
    import time
    st_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    assert isinstance(conf, Config)
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # 1. Sample the data
    if alg == '':
        alg = output_helper.get_tab_alg_single(output_conn, conf.output_schema, sys_code, table_name)
    if alg == "F5":
        data, size, col_num = input_helper.get_cols_sample(input_conn, table_name, fd_sample_size, etl_dates[-1])
    elif alg == "I":
        data, size, col_num = input_helper.get_cols_sample(input_conn, table_name, fd_sample_size, etl_dates)
    elif alg == "IU":
        trans_table_name = get_trans_table_name(output_conn, conf.output_schema, table_name)
        data, size, col_num = input_helper.get_cols_sample(input_conn, trans_table_name, fd_sample_size, etl_dates[-1])
    else:
        logging.warning("Table {} uses an unknown unload algorithm {}".format(table_name, alg))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return '004'
    if size < conf.min_records:
        logging.warning("Table {}: too little data!".format(table_name))
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '2')
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "001"
    df = pd.DataFrame(data)
    fds = analyse_table_mini_fds(df)
    ed_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    logging.info("Table {}: functional-dependency analysis started at {}".format(table_name, st_time))
    logging.info("Table {}: functional-dependency analysis completed at {}".format(table_name, ed_time))
    output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '5')
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
    return "000"
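
# analyse_table_mini_fds is defined elsewhere in the project; the sketch below is only an
# illustration of the idea behind a pandas-based FD check, not the real implementation:
# column A functionally determines column B when every value of A maps to exactly one
# value of B in the sample.
def _naive_single_column_fds(df):
    """Return [(determinant, dependent), ...] for single-column FDs that hold in a sample."""
    fds = []
    for lhs in df.columns:
        for rhs in df.columns:
            if lhs == rhs:
                continue
            # One distinct rhs value per lhs value means lhs -> rhs holds in the sample.
            if df.groupby(lhs)[rhs].nunique(dropna=False).max() == 1:
                fds.append((lhs, rhs))
    return fds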
def analyse_joint_fk(conf, main_table_code, sub_sys_code_list,
                     start_date_str=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())):
    """
    Joint (composite) foreign-key analysis.
    Supported modes:
        within a single system (main_table_code: loop over all tables in S01, the loop body runs concurrently; sub_sys_code: S01)
        between two systems (main_table_code: loop over all tables in S01, the loop body runs concurrently; sub_sys_code: S02)
        one system against all systems, itself included (main_table_code: loop over all tables in S01, the loop body runs concurrently; sub_sys_code: All)
        all systems against all systems, themselves included (main_table_code: loop over all tables, the loop body runs concurrently; sub_sys_code: All)
    :param conf: configuration object
    :param main_table_code: main system code
    :param sub_sys_code_list: list of sub system codes
    :param start_date_str: start time of the joint foreign-key analysis
    :return:
    """
    assert isinstance(conf, Config)
    assert isinstance(sub_sys_code_list, list)
    inceptor_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # 1. Look up the composite primary keys already analysed for the main system
    tables_pk = output_helper.get_tables_joint_pk(output_conn, conf.output_schema, main_table_code)
    # 2. For every composite key, fetch the feature of each key column and find the columns
    #    in the sub systems whose features match
    for sys_code, table_name in tables_pk:
        try:
            # Iterate over the composite primary keys (joint_pk) of this table
            for _, joint_pk in tables_pk[(sys_code, table_name)].items():
                # Composite keys with more than 3 or fewer than 2 columns are skipped and logged
                if len(joint_pk) > 3 or len(joint_pk) <= 1:
                    joint_pk_str = " , ".join(pk for pk in joint_pk)
                    logging.warning(
                        "System {}, table {}: the composite key on columns {} has more than 3 or fewer than 2 "
                        "columns and cannot be used for joint foreign-key analysis".format(
                            sys_code, table_name, joint_pk_str))
                    continue
                init_capacity = 0
                # Candidate foreign-key columns to check, keyed by primary-key column
                all_check_fk_cols = {}
                double_or_time_flg = False
                # Walk over every column of the composite primary key
                for col in joint_pk:
                    table_schema = sys_code
                    # Fetch the column feature of this primary-key column
                    pk_feature = output_helper.get_col_info_feature(
                        output_conn, sys_code, table_schema, table_name, col, conf.output_schema)
                    # TODO If any key column is of type DOUBLE, TIMESTAMP, DATE or TIME,
                    #      the composite key is excluded from joint foreign-key analysis
                    if pk_feature["COL_TYPE"].rstrip() in ('DOUBLE', 'TIMESTAMP', 'DATE', 'TIME'):
                        double_or_time_flg = True
                    # Initial capacity of the Bloom filter
                    init_capacity = int(pk_feature["COL_RECORDS"])
                    # TODO Find all columns in sub_sys_code matching the key column's feature,
                    #      excluding nullable candidate columns
                    check_fk_cols = output_helper.get_check_fk_col(
                        output_conn, pk_feature, conf.output_schema, sub_sys_code_list,
                        distinct_limit=True, nullable=False)
                    # Candidates for this key column; key is (fk_sys_code, fk_table_schema, fk_table_name),
                    # value is a list of (candidate foreign-key column, primary-key column) pairs
                    check_fk_cols_dict = {}
                    # Collect SYS_CODE, TABLE_SCHEMA, TABLE_CODE and COL_CODE of every matching column
                    for check_dict in check_fk_cols:
                        fk_sys_code = check_dict['SYS_CODE']
                        fk_table_schema = check_dict['TABLE_SCHEMA']
                        fk_table_name = check_dict['TABLE_CODE']
                        fk_col_name = check_dict['COL_CODE']
                        if (fk_sys_code, fk_table_schema, fk_table_name) not in check_fk_cols_dict:
                            check_fk_cols_dict[(fk_sys_code, fk_table_schema, fk_table_name)] = []
                        # key: (fk_sys_code, fk_table_schema, fk_table_name), value: [(fk_col_name, col)]
                        check_fk_cols_dict[(fk_sys_code, fk_table_schema, fk_table_name)].append((fk_col_name, col))
                    all_check_fk_cols[col] = check_fk_cols_dict
                check_fk_values_list = list(all_check_fk_cols.values())
                # 3. Intersect the candidate tables: only tables that contain matching columns for every
                #    key column can hold the composite foreign key (checks_tables)
                checks_tables = set(check_fk_values_list[0].keys()).intersection(set(check_fk_values_list[1].keys()))
                # If the composite key has more than two columns, keep intersecting
                if len(check_fk_values_list) > 2:
                    for i in range(2, len(check_fk_values_list)):
                        checks_tables = set(check_fk_values_list[i].keys()).intersection(checks_tables)
                # Composite keys containing DOUBLE, TIMESTAMP, DATE or TIME columns are not analysed
                if double_or_time_flg:
                    continue
                # The matching columns are spread over different tables with no common table,
                # so no joint foreign key can exist
                if not checks_tables:
                    logging.warning(
                        "The columns matching the composite key of table {} are spread over different tables "
                        "with no common table; no joint foreign key can be found".format(main_table_code))
                    no_intersection_res_code = output_helper.update_unfound_joint_fk_sche(
                        output_conn, conf.output_schema, main_table_code, start_date_str)
                    if no_intersection_res_code == -1:
                        sub_sys_str = " , ".join(sub_sys for sub_sys in sub_sys_code_list)
                        logging.error(
                            "Joint foreign-key analysis with {} as the main system and {} as the sub systems "
                            "found no joint foreign key, and updating the schedule table failed".format(
                                main_table_code, sub_sys_str))
                    continue
                logging.info("Primary key: table {}, columns {}; number of candidate foreign-key tables: {}".format(
                    table_name, joint_pk, len(checks_tables)))
                # 4. Build the Bloom filter: pull the composite-key values from ODS using the table name,
                #    columns, data date, date offset and unload algorithm
                capacity = init_capacity + conf.bloom_init_capacity
                # Unload algorithm of the table that owns the composite key
                table_alg = output_helper.get_tab_alg_single(output_conn, conf.output_schema, sys_code, table_name)
                # Data date and date offset of the table that owns the composite key
                etl_dates = None
                etl_date, date_offset = output_helper.get_tab_date_offset_single(
                    output_conn, conf.output_schema, sys_code, table_name)
                if etl_date and date_offset:
                    etl_dates = date_trans(etl_date, date_offset)
                else:
                    logging.error(
                        "Table {} has a composite primary key, but no unload date and date offset could be found; "
                        "joint foreign-key analysis cannot continue".format(table_name))
                    exit(-1)
                cursor = None
                if table_alg == "F5":
                    cursor = input_helper.get_mul_col_cursor(inceptor_conn, table_name, joint_pk, etl_dates[-1])
                elif table_alg == "I":
                    cursor = input_helper.get_mul_col_cursor(inceptor_conn, table_name, joint_pk, etl_dates)
                elif table_alg == "IU":
                    trans_table_code = output_helper.get_trans_table_name(output_conn, conf.output_schema, table_name)
                    cursor = input_helper.get_mul_col_cursor(inceptor_conn, trans_table_code, joint_pk, etl_dates[-1])
                else:
                    logging.error(
                        "Table {} uses the unsupported unload algorithm {}; the composite-key values cannot be read "
                        "for joint foreign-key analysis".format(table_name, table_alg))
                    close_db2_connection(output_conn)
                    close_odbc_connection(inceptor_conn)
                    exit(-1)
                # Put the composite-key values into the Bloom filter
                bloom = generate_mul_col_bloom(conf, capacity, cursor)
                # 5. Walk over the candidate foreign-key combinations
                joint_fks = []
                for fk_sys_code, fk_table_schema, fk_table_name in checks_tables:
                    # When analysing all->all or S01->S01, the candidate table may be the
                    # primary-key table itself; skip that case
                    if fk_sys_code == sys_code and fk_table_name == table_name:
                        continue
                    lists = []
                    # Collect [(fk1, pk1), (fk2, pk2), ...] for every key column
                    for col, check_dict in all_check_fk_cols.items():
                        lists.append(check_dict[(fk_sys_code, fk_table_schema, fk_table_name)])
                    # All combinations of candidate foreign-key columns
                    check_lists = comb_lists(lists)
                    # check_tuple: ((fk1, pk1), (fk2, pk2))
                    for check_tuple in check_lists:
                        # check_cols: [fk1, fk2]
                        pk_to_fk_dict = {p: f for f, p in check_tuple}
                        check_cols = [pk_to_fk_dict[p] for p in joint_pk]
                        # Guard against combinations such as [fk1, fk1]
                        if len(set(check_cols)) != len(check_cols):
                            continue
                        # Unload algorithm of the table that owns the candidate composite foreign key
                        fk_table_alg = output_helper.get_tab_alg_single(
                            output_conn, conf.output_schema, fk_sys_code, fk_table_name)
                        # Data date and date offset of the candidate foreign-key table
                        fk_etl_dates = None
                        fk_tb_etl_date, fk_tb_date_offset = output_helper.get_tab_date_offset_single(
                            output_conn, conf.output_schema, fk_sys_code, fk_table_name)
                        if fk_tb_etl_date and fk_tb_date_offset:
                            fk_etl_dates = date_trans(fk_tb_etl_date, fk_tb_date_offset)
                        else:
                            logging.error(
                                "Table {} has a candidate composite foreign key, but no unload date and date offset "
                                "could be found; joint foreign-key analysis cannot continue".format(fk_table_name))
                            close_db2_connection(output_conn)
                            close_odbc_connection(inceptor_conn)
                            exit(-1)
                        # Pull the candidate foreign-key values from ODS using the table name, columns,
                        # data date, date offset and unload algorithm
                        fk_cursor = None
                        if fk_table_alg == "F5":
                            fk_cursor = input_helper.get_mul_col_not_null_cursor(
                                inceptor_conn, fk_table_name, check_cols, fk_etl_dates[-1])
                        elif fk_table_alg == "I":
                            fk_cursor = input_helper.get_mul_col_not_null_cursor(
                                inceptor_conn, fk_table_name, check_cols, fk_etl_dates)
                        elif fk_table_alg == "IU":
                            fk_trans_table_code = output_helper.get_trans_table_name(
                                output_conn, conf.output_schema, fk_table_name)
                            fk_cursor = input_helper.get_mul_col_not_null_cursor(
                                inceptor_conn, fk_trans_table_code, check_cols, fk_etl_dates[-1])
                        else:
                            logging.error(
                                "While reading data from the candidate composite foreign-key table {}, the "
                                "unsupported unload algorithm {} was found".format(fk_table_name, fk_table_alg))
                            close_db2_connection(output_conn)
                            close_odbc_connection(inceptor_conn)
                            exit(-1)
                        # Compare against the Bloom filter: the share of candidate values contained in the filter
                        p = get_contains_percent_from_cursor(bloom, fk_cursor)
                        # Foreign-key threshold: the candidate is accepted once this share is reached
                        thr = conf.fk_check_threshold
                        # If the primary key has fewer records than this threshold, its table is considered small
                        if len(bloom) < conf.fk_little_data:
                            # Threshold used for tables with little data
                            thr = conf.fk_little_data_threshold
                        # The share of candidate values found in the Bloom filter reaches the threshold
                        if p >= thr:
                            tmp_joint_fk = []
                            for elem in check_tuple:
                                # Meaning of the tuple: column (elem[1]) of table (table_name) in system (sys_code)
                                # is referenced by column (elem[0]) of table (fk_table_name) in system (fk_sys_code)
                                tmp_joint_fk.append(((sys_code, table_name, elem[1]),
                                                     (fk_sys_code, fk_table_name, elem[0])))
                            joint_fks.append(tmp_joint_fk)
                # 6. Save the analysis result to the database
                if joint_fks:
                    res_code = output_helper.save_joint_fk_info(
                        output_conn, joint_fks, conf.output_schema, main_table_code, start_date_str)
                    if res_code == -1:
                        sub_sys_str = " , ".join(sub_sys for sub_sys in sub_sys_code_list)
                        logging.error(
                            "Joint foreign-key analysis with {} as the main system and {} as the sub systems found "
                            "joint foreign keys, but saving them to the database failed".format(
                                main_table_code, sub_sys_str))
                        close_db2_connection(output_conn)
                        close_odbc_connection(inceptor_conn)
                        return
                else:
                    res_code = output_helper.update_unfound_joint_fk_sche(
                        output_conn, conf.output_schema, main_table_code, start_date_str)
                    if res_code == -1:
                        sub_sys_str = " , ".join(sub_sys for sub_sys in sub_sys_code_list)
                        logging.error(
                            "Joint foreign-key analysis with {} as the main system and {} as the sub systems found "
                            "no joint foreign key, and updating the schedule table failed".format(
                                main_table_code, sub_sys_str))
                        close_db2_connection(output_conn)
                        close_odbc_connection(inceptor_conn)
                        return
        except Exception as ex:
            logging.warning(str(ex))
    # Close the database connections
    close_db2_connection(output_conn)
    close_odbc_connection(inceptor_conn)
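
# comb_lists is imported from a project utility module that is not shown here. Below is only
# a sketch of the contract the loop above relies on: given one candidate (fk_col, pk_col) list
# per primary-key column, produce every cross-combination, e.g.
#   comb_lists_sketch([[("f1", "p1"), ("f2", "p1")], [("f3", "p2")]])
#   -> [(("f1", "p1"), ("f3", "p2")), (("f2", "p1"), ("f3", "p2"))]
# This is an assumption about its behaviour, written with itertools.product.
def comb_lists_sketch(lists):
    """Cross-combine one candidate (fk_col, pk_col) list per primary-key column."""
    from itertools import product
    return [tuple(combo) for combo in product(*lists)]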
def analyse_table_fk(conf, sys_code, table_code, sys_fk, etl_dates, pk_alg, start_date_str):
    """
    Single-column foreign-key analysis.
    :param conf: configuration object
    :param sys_code: system code of the table that owns the primary key
    :param table_code: table code of the table that owns the primary key
    :param sys_fk: sub systems in which foreign keys are searched
    :param etl_dates: unload dates of the table that owns the primary key
    :param pk_alg: unload algorithm of the table that owns the primary key
    :param start_date_str: start time of the foreign-key analysis
    :return:
    """
    assert isinstance(conf, Config)
    schema = conf.output_schema
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # Look up the primary keys
    try:
        pk_list = output_helper.get_tables_pk(output_conn, schema, table_code)
    except Exception as e:
        logging.error("Primary-key lookup failed for table {}! {}".format(table_code, e))
        close_db2_connection(output_conn)
        close_odbc_connection(input_conn)
        return '001'
    fk_dict = {}
    if len(pk_list) == 0:
        logging.warning("Table {} has no single-column primary key".format(table_code))
        output_helper.save_fk_info(output_conn, fk_dict, conf.output_schema, sys_code, table_code, start_date_str, '3')
        close_db2_connection(output_conn)
        close_odbc_connection(input_conn)
        return '002'
    table_schema = sys_code
    pk_disqlt = []
    for pk in pk_list:
        logging.info("Searching foreign keys for primary key {}".format(table_code + '_' + pk))
        try:
            # Fetch the primary-key column feature
            pk_feature = output_helper.get_col_info_feature(output_conn, sys_code, sys_code, table_code, pk, schema)
            # Skip auto-increment columns
            if pk_feature["COL_AUTOINCRE"] == '1':
                pk_disqlt.append(pk)
                continue
            bloom_path = os.path.join(conf.bloom_path, "{}_{}_{}".format(sys_code, table_code, pk))
        except Exception as e:
            logging.error("Fetching the primary-key feature failed for {}_{}: {}!".format(table_code, pk, str(e)))
            close_db2_connection(output_conn)
            close_odbc_connection(input_conn)
            return '003'
        try:
            if os.path.exists(bloom_path):
                # Load the Bloom filter from the local cache
                bloom = ScalableBloomFilter.fromfile(f=open(bloom_path, 'rb'))
            else:
                # Build the Bloom filter for the primary key
                if pk_alg == "":
                    pk_alg = output_helper.get_tab_alg_single(output_conn, schema, sys_code, table_code)
                if pk_alg == "F5":
                    etl_date = etl_dates[-1]
                elif pk_alg == "I":
                    etl_date = etl_dates
                elif pk_alg == "IU":
                    trans_table_name = output_helper.get_trans_table_name(output_conn, schema, table_code)
                    table_code = trans_table_name
                    etl_date = etl_dates[-1]
                else:
                    logging.warning("Table {} uses an unknown unload algorithm {}".format(table_code, pk_alg))
                    close_db2_connection(output_conn)
                    close_odbc_connection(input_conn)
                    return '004'
                capacity = conf.bloom_init_capacity + int(pk_feature["COL_DISTINCT"])
                bloom = generate_bloom(conf, capacity,
                                       input_helper.get_col_cursor(input_conn, table_code, pk, etl_date))
                # Cache the Bloom filter so it can be loaded from disk next time
                bloom.tofile(f=open(bloom_path, 'wb'))
        except Exception as e:
            logging.error("Building the Bloom filter failed for {}_{}: {}!".format(table_code, pk, e))
            close_db2_connection(output_conn)
            close_odbc_connection(input_conn)
            return '005'
        try:
            # Very short keys tend to be matched by status-code columns by accident, so skip them
            if pk_feature['AVG_LEN'] and float(pk_feature['AVG_LEN']) < 2:
                pk_disqlt.append(pk)
                continue
            # Find the candidate foreign-key columns whose features match the primary key
            check_fk_cols = output_helper.get_check_fk_col(output_conn, pk_feature, schema, sys_fk)
            del pk_feature
            # Regroup the candidates to reduce the number of lookups
            check_fk_cols_dict = {}
            for check_dict in check_fk_cols:
                fk_sys_code = check_dict['SYS_CODE']
                fk_table_schema = check_dict['TABLE_SCHEMA']
                fk_table_code = check_dict['TABLE_CODE']
                fk_col_code = check_dict['COL_CODE']
                # Skip candidates outside the requested systems and columns of the primary-key table itself
                if fk_sys_code not in sys_fk or fk_table_code == table_code:
                    continue
                if (fk_sys_code, fk_table_schema, fk_table_code) not in check_fk_cols_dict:
                    check_fk_cols_dict[(fk_sys_code, fk_table_schema, fk_table_code)] = []
                check_fk_cols_dict[(fk_sys_code, fk_table_schema, fk_table_code)].append(fk_col_code)
            # Walk over the candidate foreign keys
            for fk_sys_code, fk_table_schema, fk_table_code in check_fk_cols_dict:
                # fk_alg is the unload algorithm of the candidate foreign-key table
                fk_alg = output_helper.get_tab_alg_single(output_conn, schema, fk_sys_code, fk_table_code)
                if fk_alg == "F5":
                    fk_etl_date = etl_dates[-1]
                elif fk_alg == "I":
                    fk_etl_date = etl_dates
                elif fk_alg == "IU":
                    trans_table_name = output_helper.get_trans_table_name(output_conn, schema, fk_table_code)
                    fk_table_code = trans_table_name
                    fk_etl_date = etl_dates[-1]
                else:
                    logging.warning("Table {} uses an unknown unload algorithm {}".format(fk_table_code, fk_alg))
                    close_db2_connection(output_conn)
                    close_odbc_connection(input_conn)
                    return '006'
                fk_check_cols = check_fk_cols_dict[(fk_sys_code, fk_table_schema, fk_table_code)]
                check_mul_limit_data = input_helper.get_mul_col_sample(
                    input_conn, fk_table_code, fk_check_cols, 500, fk_etl_date)
                for check_col in fk_check_cols:
                    check_limit_data, blank_count = remove_blank(check_mul_limit_data[check_col])
                    # Guard against hash collisions: if the deduplicated sample collapses to a single value,
                    # its hit rate in the Bloom filter would always be 100%
                    if len(set(check_limit_data)) < 3:
                        continue
                    p, _ = get_contains_percent(bloom, check_limit_data)
                    thr = conf.fk_check_threshold
                    if len(bloom) < conf.fk_little_data:
                        thr = conf.fk_little_data_threshold
                    if p >= thr:
                        check_all_data_cursor = input_helper.get_col_cursor(
                            input_conn, fk_table_code, check_col, fk_etl_date, True)
                        p, f = get_contains_percent_from_cursor(bloom, check_all_data_cursor)
                        if p >= thr:
                            if (fk_sys_code, fk_table_schema, fk_table_code, check_col) not in fk_dict:
                                fk_dict[(fk_sys_code, fk_table_schema, fk_table_code, check_col)] = []
                            fk_dict[(fk_sys_code, fk_table_schema, fk_table_code, check_col)].append(
                                (sys_code, table_schema, table_code, pk, len(bloom), p, f))
        except Exception as e:
            logging.error("Foreign-key analysis failed for table {}, key {}: {}!".format(table_code, pk, e))
            output_helper.save_fk_info(output_conn, fk_dict, conf.output_schema, sys_code, table_code, start_date_str, '5')
            close_db2_connection(output_conn)
            close_odbc_connection(input_conn)
            return '007'
    # Save the result to the database
    if len(fk_dict) != 0:
        return_code = output_helper.save_fk_info(
            output_conn, fk_dict, conf.output_schema, sys_code, table_code, start_date_str, '1')
        if return_code == 0:
            logging.info("Table {}: foreign-key analysis finished; foreign keys found and saved".format(table_code))
        elif return_code == -1:
            logging.error("Table {}: foreign-key analysis finished; foreign keys found, but saving them failed".format(table_code))
        else:
            logging.error("Table {}: foreign-key analysis finished; foreign keys found, but saving returned the "
                          "unknown status code {}".format(table_code, return_code))
    else:
        if len(pk_list) == len(pk_disqlt):
            output_helper.save_fk_info(output_conn, fk_dict, conf.output_schema, sys_code, table_code, start_date_str, '4')
            logging.warning("Table {}: every primary key is auto-increment or too short on average".format(table_code))
        else:
            output_helper.save_fk_info(output_conn, fk_dict, conf.output_schema, sys_code, table_code, start_date_str, '2')
            logging.warning("Table {} has no foreign-key relationship".format(table_code))
    close_db2_connection(output_conn)
    close_odbc_connection(input_conn)
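
# generate_bloom and get_contains_percent live in a project helper module that is not part of
# this file. The sketch below only illustrates the idea used above - put the primary-key values
# into a ScalableBloomFilter and measure which share of the candidate foreign-key values the
# filter claims to contain. It assumes the pybloom_live package that provides ScalableBloomFilter.
def bloom_contains_percent_sketch(pk_values, fk_values, initial_capacity=100000, error_rate=0.001):
    """Return the share of fk_values that are (probably) present among pk_values."""
    from pybloom_live import ScalableBloomFilter

    bloom = ScalableBloomFilter(initial_capacity=initial_capacity, error_rate=error_rate)
    for value in pk_values:
        bloom.add(value)
    if not fk_values:
        return 0.0
    hits = sum(1 for value in fk_values if value in bloom)
    # Bloom filters can return false positives, so this is an upper bound on the true share.
    return hits / len(fk_values)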
import logging

from configuration import Config
from utils.log_util import init_log
from utils.common_util import dynamic_import
from helper.same_cluster_helper import run_cluster
from dao import close_db2_connection, close_odbc_connection, get_input_output_conn

if __name__ == '__main__':
    init_log(log_path='../logs/same_cluster', level=logging.DEBUG)
    conf = Config()
    input_helper, output_helper = dynamic_import(conf)
    input_conn, output_conn = get_input_output_conn(conf)
    run_cluster(conf, output_conn)
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
def analyse_table_fds_by_spark(conf, sys_code, table_name, alg, etl_dates, start_date_str, fd_sample_size):
    """Analyse a table's functional dependencies with Spark."""
    logging.info("Table {}: analysing functional dependencies with Spark, sample size {}".format(table_name, fd_sample_size))
    import time
    st_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    assert isinstance(conf, Config)
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # Local file the data sampled from Hive is unloaded to (originally a CSV file)
    tmp_csv_file = os.path.abspath(os.path.join(conf.fd_tmp_path, "{}.tmp".format(table_name)))
    # Local path of the analysis result
    tmp_res_path = os.path.abspath(os.path.join(conf.fd_tmp_path, table_name)).replace("\\", "/")
    # Corresponding HDFS paths
    hdfs_tmp_csv_file = "/tmp/fd/%s.tmp" % table_name
    hdfs_tmp_res_path = "/tmp/fd/%s" % table_name
    logging.info("Starting functional-dependency analysis for table {}!".format(table_name))
    if not os.path.exists(tmp_res_path):
        # 1. Sample the data
        try:
            if alg == '':
                alg = output_helper.get_tab_alg_single(output_conn, conf.output_schema, sys_code, table_name)
            if alg == "F5":
                data, size, col_num = input_helper.get_cols_sample(input_conn, table_name, fd_sample_size, etl_dates[-1])
            elif alg == "I":
                data, size, col_num = input_helper.get_cols_sample(input_conn, table_name, fd_sample_size, etl_dates)
            elif alg == "IU":
                trans_table_name = get_trans_table_name(output_conn, conf.output_schema, table_name)
                data, size, col_num = input_helper.get_cols_sample(input_conn, trans_table_name, fd_sample_size, etl_dates[-1])
            else:
                logging.warning("Table {} uses an unknown unload algorithm {}".format(table_name, alg))
                close_odbc_connection(input_conn)
                close_db2_connection(output_conn)
                return '004'
        except Exception as e:
            logging.error("Table {}: sampling for functional-dependency analysis failed with {}".format(table_name, e))
        if size < conf.min_records:
            logging.warning("Table {}: too little data!".format(table_name))
            fds = []
            output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '2')
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "001"
        df = pd.DataFrame(data)
        # df.to_csv(tmp_csv_file, encoding='utf-8', sep='$', index=False)
        df.to_parquet(tmp_csv_file, compression='UNCOMPRESSED')
        del df
        if conf.spark_mode == 'yarn':
            cmd_hdfs = "hdfs dfs -put -f %s %s" % (tmp_csv_file, hdfs_tmp_csv_file)
            execute_command(cmd_hdfs)
            cmd_rm = "hdfs dfs -rm -r -f %s" % hdfs_tmp_res_path
            execute_command(cmd_rm)
            # cmd = "spark-submit --master yarn --deploy-mode client " + \
            #       "--driver-memory 4G --num-executors 12 --executor-cores 2 --executor-memory 3G " + \
            #       "--conf spark.default.parallelism=50 --conf spark.storage.memoryFraction=0.4 " + \
            #       "--conf spark.sql.shuffle.partitions=50 --conf spark.shuffle.memoryFraction=0.5 " + \
            #       "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
            #       "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
            #       "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
            #       "--inputFileHasHeader true " + \
            #       "--inputFileSeparator $"
            # cmd = "spark-submit --master yarn --deploy-mode client " + \
            #       "--driver-memory 16G --num-executors 6 --executor-cores 2 --executor-memory 10G " + \
            #       "--conf spark.default.parallelism=50 --conf spark.storage.memoryFraction=0.4 " + \
            #       "--conf spark.sql.shuffle.partitions=50 --conf spark.shuffle.memoryFraction=0.5 " + \
            #       "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
            #       "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
            #       "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
            #       "--inputFileHasHeader true " + \
            #       "--inputFileSeparator $"
            cmd = "spark-submit --master yarn --deploy-mode cluster " + \
                  "--driver-memory 20G --executor-cores 8 --executor-memory 20G --num-executors 3 " + \
                  "--conf spark.driver.maxResultSize=20G --conf spark.storage.memoryFraction=0.4 " + \
                  "--conf spark.shuffle.memoryFraction=0.5 --conf spark.shuffle.spill.compress=true " + \
                  "--conf spark.kryoserializer.buffer.max=128m --name FD_{} ".format(table_name) + \
                  "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_hdfs_jar_path) + \
                  "--inputFilePath {} ".format(hdfs_tmp_csv_file) + \
                  "--outputFilePath {} ".format(hdfs_tmp_res_path) + \
                  "--inputFileHasHeader true " + \
                  "--inputFileSeparator $ " + \
                  "--useParquet true"
        else:
            cmd = "spark-submit --master local[*] " + \
                  "--class com.bigdata.hyshf.main.Main {} ".format(conf.fd_jar_path) + \
                  "--inputFilePath file://{} ".format(tmp_csv_file) + \
                  "--outputFilePath file://{} ".format(os.path.abspath(tmp_res_path)) + \
                  "--inputFileHasHeader true " + \
                  "--inputFileSeparator $ " + \
                  "--useParquet true"
        timeout = 60 * 60
        res_int = execute_command(cmd)
        # res_int = execute_command(cmd, timeout=timeout)
        logging.debug("Spark exit code: {}".format(res_int))
    else:
        res_int = 0
    if res_int == 0 and conf.spark_mode == 'yarn':
        # logging.info("Table {}: Spark job finished".format(table_name))
        if os.path.exists(tmp_res_path + "/part-00000"):
            os.remove(tmp_res_path + "/part-00000")
            os.rmdir(tmp_res_path)
        cmd_hdfs = "hdfs dfs -get %s %s" % (hdfs_tmp_res_path, tmp_res_path)
        hdfs_to_local_res = execute_command(cmd_hdfs)
        if hdfs_to_local_res != 0:
            logging.error("Table {}: functional-dependency analysis finished, but pulling the result from HDFS failed".format(table_name))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return
    if res_int == 0:
        # Fix: there may be no functional dependency that meets the criteria
        try:
            fds = parse_result(tmp_res_path + "/part-00000")
            output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '1')
        except Exception as e:
            logging.error("Table {}: functional dependencies could not be saved: {}".format(table_name, e))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "005"
        ed_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        logging.info("Table {}: functional-dependency analysis started at {}".format(table_name, st_time))
        logging.info("Table {}: functional-dependency analysis completed at {}".format(table_name, ed_time))
        try:
            # Remove the temporary files
            if os.path.exists(tmp_res_path + "/part-00000"):
                os.remove(tmp_res_path + "/part-00000")
            if os.path.exists(tmp_res_path):
                os.rmdir(tmp_res_path)
        except Exception as e:
            logging.error("Table {}: removing the temporary files failed: {}".format(table_name, e))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            return "006"
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "000"
    elif res_int == -1:
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '3')
        logging.warning("Table {}: functional-dependency computation timed out".format(table_name))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "002"
    else:
        fds = []
        output_helper.save_table_fd(output_conn, sys_code, table_name, fds, conf.output_schema, start_date_str, '4')
        logging.error("Table {}: functional-dependency computation failed".format(table_name))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return "003"
def analyse_table_feature(conf, sys_code, table_code, alg, etl_dates,
                          start_date_str=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())):
    """
    Analyse the column features of a single table.
    :param conf: configuration object
    :param sys_code: system code
    :param table_code: table code
    :param alg: unload algorithm of the source table
    :param etl_dates: unload dates of the source table
    :param start_date_str: start time of the column-feature analysis for this table
    :return:
    """
    assert isinstance(conf, Config)
    assert isinstance(etl_dates, list)
    input_conn, output_conn = get_input_output_conn(conf)
    input_helper, output_helper = dynamic_import(conf)
    # Holds the column features
    features = {}
    # Holds the code values of code-like columns
    code_value_dict = {}
    size, data, col_num, distinct_col_count, count, distinct, max_len, min_len = \
        None, None, None, None, None, None, None, None
    # 1. Sample the data and count the table records
    try:
        if alg == "F5":
            data, size, col_num = input_helper.get_cols_sample(
                input_conn, table_code, conf.feature_sample_size, etl_dates[-1])
            count = input_helper.get_count(input_conn, table_code, etl_dates[-1])
        elif alg == "I":
            data, size, col_num = input_helper.get_cols_sample(
                input_conn, table_code, conf.feature_sample_size, etl_dates)
            count = input_helper.get_count(input_conn, table_code, etl_dates)
        elif alg == "IU":
            trans_table_code = get_trans_table_name(output_conn, conf.output_schema, table_code)
            data, size, col_num = input_helper.get_cols_sample(
                input_conn, trans_table_code, conf.feature_sample_size, etl_dates[-1])
            count = input_helper.get_count(input_conn, trans_table_code, etl_dates[-1])
        else:
            logging.error("Table {} uses the unsupported unload algorithm {}".format(table_code, alg))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            exit(-1)
    except Exception as e:
        logging.error("Table {}: sampling for column-feature analysis failed with {}".format(table_code, e))
    # If the sample is smaller than the column-feature analysis threshold, log it
    if size < conf.min_records:
        logging.warning("Table {}: the actual sample size {} is below the column-feature analysis threshold {}".format(
            table_code, size, conf.min_records))
        # Because the sample is below the threshold, set the schedule table status to 2
        res_code = output_helper.update_unana_feature_sche(output_conn, conf.output_schema, table_code, start_date_str)
        if res_code != 0:
            logging.error("Table {}: the sample is below the threshold and updating the schedule table failed".format(table_code))
        close_odbc_connection(input_conn)
        close_db2_connection(output_conn)
        return
    logging.info("Starting column-feature analysis for table {}".format(table_code))
    # Walk over every column of the table
    for col_name, col_data in data.items():
        # Check the column values
        if not isinstance(col_data[0], str):
            logging.warning("Table {}, column {} is not of string type; feature analysis is skipped".format(table_code, col_name))
            continue
        feature = Feature()
        # 2) Analyse the distinct count, the maximum and minimum value lengths,
        #    and whether the column holds only a default value
        if alg == "F5":
            distinct = input_helper.get_distinct_count(input_conn, table_code, col_name, etl_dates[-1])
            min_len, max_len = input_helper.get_min_max_length(input_conn, table_code, col_name, etl_dates[-1])
            distinct_col_count = input_helper.get_distinct_col_count(input_conn, table_code, col_name, etl_dates[-1])
        elif alg == "I":
            distinct = input_helper.get_distinct_count(input_conn, table_code, col_name, etl_dates)
            min_len, max_len = input_helper.get_min_max_length(input_conn, table_code, col_name, etl_dates)
            distinct_col_count = input_helper.get_distinct_col_count(input_conn, table_code, col_name, etl_dates)
        elif alg == "IU":
            trans_table_code = get_trans_table_name(output_conn, conf.output_schema, table_code)
            distinct = input_helper.get_distinct_count(input_conn, trans_table_code, col_name, etl_dates[-1])
            min_len, max_len = input_helper.get_min_max_length(input_conn, trans_table_code, col_name, etl_dates[-1])
            distinct_col_count = input_helper.get_distinct_col_count(input_conn, trans_table_code, col_name, etl_dates[-1])
        else:
            logging.error("Table {} uses the unsupported unload algorithm {}".format(table_code, alg))
            close_odbc_connection(input_conn)
            close_db2_connection(output_conn)
            exit(-1)
        if int(distinct_col_count) == 1:
            feature.default_value = True
        feature.records = count
        feature.distinct = distinct
        feature.max_len = max_len
        feature.min_len = min_len
        # 5) Infer further column features from the values themselves
        feature, code_value_set = infer_feature(conf, col_name, col_data, input_conn, table_code, alg,
                                                output_conn, etl_dates, feature=feature)
        # If the column is code-like, save its code values in code_value_dict
        if code_value_set:
            code_value_dict[col_name] = code_value_set
        features[col_name] = feature
    # 3. Save the results
    stat = output_helper.save_table_features(output_conn, sys_code, sys_code, table_code, features,
                                             conf.output_schema, start_date_str, col_num, code_value_dict)
    if stat != 0:
        logging.error("Table {}: saving the analysis result to the database failed".format(table_code))
    logging.info("Table {}: column-feature analysis finished".format(table_code))
    # Close the database connections
    close_odbc_connection(input_conn)
    close_db2_connection(output_conn)
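
# Feature is imported from a project model module that is not shown here. The sketch below
# only lists the attributes the function above actually assigns, as a rough illustration of
# the shape of that object; the real class defines more fields (infer_feature fills in others).
class FeatureSketch:
    """Minimal container mirroring the column-feature fields used above."""
    def __init__(self):
        self.default_value = False  # True when the column holds a single (default) value
        self.records = None         # total record count of the table
        self.distinct = None        # number of distinct values of the column
        self.max_len = None         # maximum value length
        self.min_len = None         # minimum value length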