def fd_main(sys_code, tab_code, etl_date, date_offset, alg, sample_size, start_date_str):
    """Entry point for functional-dependency (FD) analysis of one table.

    Resolves the ETL date range, checks the analysis-schedule table in the
    output DB2 database, and either skips the table (already analysed) or
    kicks off the FD analysis. Exits the process with -1 on an unsupported
    output database or when the table is already done.
    """
    dates = date_trans(etl_date, date_offset)
    cfg = Config()
    # Only DB2 is supported as the output database for now.
    if cfg.output_db != "db2":
        logging.error("输出配置数据库未适配 :{}".format(cfg.output_db))
        exit(-1)
    conn = get_db2_connect(cfg.output_db_url)
    # Check the recorded progress; tables already analysed are skipped.
    schedule = get_analysis_schedule_single(conn, cfg.output_schema, sys_code, tab_code)
    fd_flag = schedule['FD_SCHE']
    ibm_db.close(conn)
    if fd_flag == "1":
        logging.warning("该表已完成函数依赖分析:{}".format(tab_code))
        exit(-1)
    # Not finished yet: run (or resume, via status) the FD analysis.
    analyse_table_fds(cfg, sys_code, tab_code, alg, dates, start_date_str,
                      sample_size, status=fd_flag)
from utils.common_util import * import time import multiprocessing from main.fk_main import analyse_table_fk import ibm_db from dao.output.db2_helper import get_fk_sys init_log('../logs/fk', level=logging.DEBUG) if __name__ == "__main__": conf = Config() start_date_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) inceptor_conn = get_odbc_connect(conf.dsn) # 检查输出,已分析的表跳过分析步骤 output_conn = None if conf.output_db == "db2": output_conn = get_db2_connect(conf.output_db_url) import dao.output.db2_helper as output_helper else: logging.error("输出配置数据库未适配 :{}".format(conf.output_db)) exit(-1) # 获取所有外键分析系统 fk_sys_all = get_fk_sys(output_conn, conf.output_schema) # 获取配置表信息 analysis_conf_dict = output_helper.get_config_info(output_conn, conf.output_schema) # 读取全部表的分析进度情况 analysis_schedule_dict = output_helper.get_analysis_schedule( output_conn, conf.output_schema) # 用于存放待单一外键分析的字典 table_need_analysis_dict = {} for (sys_code, ori_table_code) in analysis_conf_dict:
def fd_merge_main(conf, sys_code, table_code, start_date_str):
    """Merge the two temporary functional-dependency (FD) result sets of one
    table into a single minimal FD set and persist it.

    :param conf: Config instance with output-database settings.
    :param sys_code: system code of the table.
    :param table_code: table code whose FD sets are merged.
    :param start_date_str: analysis start timestamp passed through to the save.
    :return: '001' if reading the temporary FD relations fails, '002' if the
        merge fails; otherwise None after attempting to save.
    NOTE(review): on the '001'/'002' early returns `conn` is never closed —
    looks like a connection leak; confirm whether callers clean up.
    """
    schema = conf.output_schema
    conn = None
    if conf.output_db == "db2":
        conn = get_db2_connect(conf.output_db_url)
        import dao.output.db2_helper as output_helper
    else:
        logging.error("输出配置数据库未适配 :{}".format(conf.output_db))
        exit(-1)
    try:
        # Each dict: key = right-hand side of an FD, value = tuple of
        # left-hand sides that determine that right-hand side.
        fd_dict_1, fd_dict_2 = output_helper.get_fd_tmp(
            conn, schema, sys_code, table_code)
    except Exception as e:
        logging.error("临时函数依赖关系读取失败 :{}:{}".format(table_code, e))
        return '001'
    # Only right-hand sides present in both temporary result sets are merged.
    right_cols = list(set(fd_dict_1.keys()) & set(fd_dict_2.keys()))
    merge_res = {}
    try:
        # Walk the intersection of FD right-hand sides.
        for right_col in tqdm(right_cols):
            # Left-hand sides agreed on by both runs are kept as-is.
            fd_intersect = set(fd_dict_1[right_col]) & set(
                fd_dict_2[right_col])
            left_col_list = list(fd_intersect)
            fd_diff_1 = set(fd_dict_1[right_col]) - fd_intersect
            fd_diff_2 = set(fd_dict_2[right_col]) - fd_intersect
            # Left-hand sides unique to one run are combined pairwise: two
            # candidates that share at least one column are unioned into a
            # new (sorted, deduplicated) candidate determinant.
            for fd_1 in list(fd_diff_1):
                for fd_2 in list(fd_diff_2):
                    fd_1 = set(fd_1)
                    fd_2 = set(fd_2)
                    if fd_1 & fd_2:
                        fd_new = fd_1 | fd_2
                        fd_new = list(fd_new)
                        fd_new.sort()
                        left_col_list.append(tuple(fd_new))
            left_col_list = list(set(left_col_list))
            # NOTE(review): debug print to stdout left in production code.
            print('{}:{}'.format(right_col, len(left_col_list)))
            # Sort by determinant size so supersets always follow subsets.
            # NOTE(review): if left_col_list is empty (possible when the two
            # runs share no determinants and none overlap pairwise), the
            # max() below raises and the whole merge returns '002'.
            left_col_list.sort(key=lambda i: len(i))
            # Minimization pass: drop any determinant that is a strict
            # superset of a smaller one already kept.
            # (A commented-out earlier implementation of this pass was
            # removed here as dead code.)
            fd_sub_num = 0
            left_cols = left_col_list.copy()
            left_cols_tmp = left_cols.copy()
            max_len = max([len(i) for i in left_cols])
            while True:
                fd_sub = left_cols[fd_sub_num]
                # Stop once the current candidate is maximal in size or we
                # have reached the last remaining candidate.
                if len(fd_sub) == max_len or fd_sub_num == (len(left_cols) - 1):
                    break
                fd_sub = set(fd_sub)
                for fd in left_cols[fd_sub_num + 1:]:
                    # Same size cannot be a strict superset.
                    if len(fd) == len(fd_sub):
                        continue
                    if fd_sub.issubset(set(fd)):
                        left_cols_tmp.remove(fd)
                # Continue scanning against the already-pruned list.
                left_cols = left_cols_tmp.copy()
                fd_sub_num += 1
            merge_res[right_col] = left_cols_tmp
    except Exception as e:
        logging.error("函数依赖关系合并失败 :{}:{}".format(table_code, e))
        return '002'
    code = output_helper.fd_merge_save(conn, schema, sys_code, table_code,
                                       merge_res, start_date_str)
    if code == 0:
        logging.info("函数依赖关系合并完成 :{}".format(table_code))
    # Close the connection regardless of the save status code.
    ibm_db.close(conn)
def dim_division(conf):
    """Run the dimension-division workflow end to end.

    Steps:
    1.  Read FD relations (only FD_LEVEL = 1 for now) and single-column FK
        relations from the output database.
    2.  Convert both result sets into pandas DataFrames.
    3.  Pre-process: drop duplicate rows.
    4.  Divide dimensions from the deduplicated FD/FK relations:
        4-1. Expand the FK relations; FK nodes that derive each other are
             merged into a single node, e.g. rows like
                 LEFT  RIGHT  RL
                 A/B   A/B    FK
             (self-referencing after the merge) will be dropped in 4-2.
        4-2. Drop FK rows (and rows in the combined relation frame) that
             point at themselves.
    5.  Find candidate dimension main nodes: nodes whose FK edges only go
        out, never in.
    6.  Arrange the FD and FK relations for the attribute/subset search.
    7.  For every candidate main node collect its attribute list (FD
        relations) and subset list (FK relations).
    8.  Check candidates against each other: a candidate is confirmed as a
        main node if it is an attribute of no other node, or of two or more.
    9.  For rejected candidates, find the main node they belong to.
    10. Merge main nodes of the same table that derive each other.
    11. Arrange which dimension every node belongs to.
    12. Save the division result.

    :param conf: Config instance with output-database settings.
    :return: None; the process exits with -1 on fatal errors.
    """
    assert isinstance(conf, Config)
    output_conn = None
    output_helper = None
    if conf.output_db == "db2":
        import dao.output.db2_helper as output_helper
        output_conn = get_db2_connect(conf.output_db_url)
    else:
        logging.error("输出配置数据库未适配:{}".format(conf.output_db))
        exit(-1)

    # Clear the previous division result before writing a new one.
    logging.info('开始删除旧的维度划分结果')
    del_result_code = output_helper.del_old_dim_dive_result(
        output_conn, conf.output_schema)
    if del_result_code == 0:
        logging.info('删除旧的维度划分结果完成')
    elif del_result_code == -1:
        logging.error('删除旧的维度划分结果失败')
        ibm_db.close(output_conn)
        exit(-1)
    else:
        logging.error('删除旧的维度划分结果返回未知的状态码{}'.format(del_result_code))
        ibm_db.close(output_conn)
        exit(-1)

    # 1. Read FD relations (FD_LEVEL = 1) and single-column FK relations.
    logging.info('开始读取数据')
    fd_dict_from_db = output_helper.get_function_dependency(
        output_conn, conf.output_schema)
    fk_dict_from_db = output_helper.get_single_fk_relation(
        output_conn, conf.output_schema)
    # FIXME validation shortcut: read FD/FK from the ANA schema to check the
    # program; remove later.
    # fd_dict_from_db = output_helper.get_fd_for_dim_dive(output_conn, "ANA")
    # fk_dict_from_db = output_helper.get_fk_for_dim_dive(output_conn, "ANA")

    # 2. Convert both result sets to DataFrames; 3. drop duplicates.
    fd_df = pd.DataFrame(fd_dict_from_db)
    fk_df = pd.DataFrame(fk_dict_from_db)
    fd_df = fd_df.drop_duplicates()
    fk_df = fk_df.drop_duplicates()
    # All field relations in one frame. pd.concat replaces the former
    # fd_df.append(fk_df): DataFrame.append was deprecated in pandas 1.4 and
    # removed in 2.0; with default arguments the result is identical.
    all_relation_df = pd.concat([fd_df, fk_df])
    logging.info('数据读取完成')

    # 4. Start dividing dimensions from the deduplicated FD/FK relations.
    # 4-1. Merge FK nodes that can derive each other into one node.
    logging.info('外键关系互推节点合并开始')
    unexpand_fk_dict = {}
    # key: FK left-hand node; value: list of right-hand nodes it points to.
    for index, row in fk_df.iterrows():
        if row['LEFT'] in unexpand_fk_dict.keys():
            unexpand_fk_dict[row['LEFT']].append(row['RIGHT'])
        else:
            unexpand_fk_dict[row['LEFT']] = [row['RIGHT']]
    # Expand to the full set of nodes each left-hand node can reach through
    # FK edges, so a mutual pair (A->B and B->A) expands to A->A.
    expand_fk_dict = expand_fk_relation(unexpand_fk_dict)
    # Fields that derive each other through the expanded FK relation.
    dfeo_fd_dict = get_derive_from_each_other_fk(expand_fk_dict)
    # Rewrite the relations, treating mutually-derivable nodes as one node.
    # Cached CSVs from a previous run are reused when present.
    if os.path.exists('../tmp/after_merge_fk_df.csv') and os.path.exists(
            '../tmp/after_merge_all_rela_df.csv'):
        logging.info('已存在修改完成的关系,直接读取')
        fk_df = pd.read_csv('../tmp/after_merge_fk_df.csv')
        all_relation_df = pd.read_csv('../tmp/after_merge_all_rela_df.csv')
    else:
        for key in dfeo_fd_dict.keys():
            fk_df.loc[fk_df.LEFT == key, 'LEFT'] = dfeo_fd_dict[key]
            fk_df.loc[fk_df.RIGHT == key, 'RIGHT'] = dfeo_fd_dict[key]
            all_relation_df.loc[all_relation_df.LEFT == key,
                                'LEFT'] = dfeo_fd_dict[key]
            all_relation_df.loc[all_relation_df.RIGHT == key,
                                'RIGHT'] = dfeo_fd_dict[key]
        fk_df.to_csv('../tmp/after_merge_fk_df.csv',
                     index_label='index_label')
        all_relation_df.to_csv('../tmp/after_merge_all_rela_df.csv',
                               index_label='index_label')
    logging.info('外键关系互推节点合并完成')

    # 4-2. Drop relations that point at themselves.
    logging.info('删除fk和所有关系中指向自身的fk关系开始')
    fk_drop_index = []
    for index, row in fk_df.iterrows():
        if row['LEFT'] == row['RIGHT']:
            fk_drop_index.append(index)
    fk_df = fk_df.drop(fk_drop_index, axis=0)
    all_rela_drop_index = []
    for index, row in all_relation_df.iterrows():
        if row['LEFT'] == row['RIGHT']:
            all_rela_drop_index.append(index)
    all_relation_df = all_relation_df.drop(all_rela_drop_index, axis=0)
    logging.info('已删除fk和所有关系中指向自身的fk关系')

    # 5. Candidate dimension main nodes: FK edges only leave them, never
    # enter. A cached pickle from a previous run is reused when present.
    logging.info('初步找出维度主节点开始')
    if os.path.exists('../tmp/candidate_dim_node_list.pickle'):
        with open('../tmp/candidate_dim_node_list.pickle', 'rb') as p:
            candidate_dim_node_list = pickle.load(p)
    else:
        candidate_dim_node_list = find_candidate_dim_node(fk_df)
        with open('../tmp/candidate_dim_node_list.pickle', 'wb') as p:
            pickle.dump(candidate_dim_node_list, p)
    logging.info('已初步找出维度主节点')

    # 6. Arrange the FD and FK relations for the attribute/subset search.
    # key: FD left-hand node; value: set of right-hand nodes it determines.
    after_arrange_fd_dict = {}
    # key: FK left-hand node; value: set of right-hand nodes it points to.
    after_arrange_fk_dict = {}
    for index, row in all_relation_df.iterrows():
        if row['RL'] == 'FD':
            if row['LEFT'] in after_arrange_fd_dict.keys():
                # Union: A determines B, A determines C, ...
                after_arrange_fd_dict[row['LEFT']] = after_arrange_fd_dict[
                    row['LEFT']] | {row['RIGHT']}
            else:
                after_arrange_fd_dict[row['LEFT']] = {row['RIGHT']}
        elif row['RL'] == 'FK':
            if row['LEFT'] in after_arrange_fk_dict.keys():
                # Union: A references B, A references C, ...
                after_arrange_fk_dict[row['LEFT']] = after_arrange_fk_dict[
                    row['LEFT']] | {row['RIGHT']}
            else:
                after_arrange_fk_dict[row['LEFT']] = {row['RIGHT']}
        else:
            logging.error("系统无法识别{}关系,无法进行维度划分".format(row['RL']))
            ibm_db.close(output_conn)
            exit(-1)
    logging.info('函数依赖关系和外键关系整理完毕')

    # All attribute relations: [(dim_node, attr1), (dim_node, attr2), ...]
    all_attr_rela_list = []
    # All subset relations: [(dim_node, subset1), (dim_node, subset2), ...]
    all_subset_rela_list = []
    # Per candidate: {'dim1': [[attr, ...], [subset, ...]], 'dim2': ...}
    all_dim_node_rela = {}
    # Resume from checkpoints written by a previous interrupted run.
    if os.path.exists('../tmp/all_attr_rela.pickle') and os.path.exists('../tmp/all_subset_rela.pickle') \
            and os.path.exists('../tmp/all_dim_node_rela.pickle'):
        with open('../tmp/all_attr_rela.pickle', 'rb') as p:
            all_attr_rela_list = pickle.load(p)
        with open('../tmp/all_subset_rela.pickle', 'rb') as p:
            all_subset_rela_list = pickle.load(p)
        with open('../tmp/all_dim_node_rela.pickle', 'rb') as p:
            all_dim_node_rela = pickle.load(p)

    # 7. Collect each candidate main node's attribute list (FD) and subset
    # list (FK).
    logging.info('开始寻找每个维度节点的属性列表和子集列表')
    for i in range(len(candidate_dim_node_list)):
        candidate_dim_node = candidate_dim_node_list[i]
        # Already analysed in a previous (checkpointed) run: skip.
        if candidate_dim_node in all_dim_node_rela.keys():
            continue
        attr_list, subset_list = find_attr_and_subset(candidate_dim_node,
                                                      after_arrange_fd_dict,
                                                      after_arrange_fk_dict)
        all_dim_node_rela[candidate_dim_node] = [attr_list, subset_list]
        for attr in attr_list:
            all_attr_rela_list.append((candidate_dim_node, attr))
        for subset in subset_list:
            all_subset_rela_list.append((candidate_dim_node, subset))
        all_attr_rela_list = list(set(all_attr_rela_list))
        all_subset_rela_list = list(set(all_subset_rela_list))
        # Checkpoint after every node so an interrupted run can resume via
        # the skip above.
        with open('../tmp/all_attr_rela.pickle', 'wb') as p:
            pickle.dump(all_attr_rela_list, p)
        with open('../tmp/all_subset_rela.pickle', 'wb') as p:
            pickle.dump(all_subset_rela_list, p)
        with open('../tmp/all_dim_node_rela.pickle', 'wb') as p:
            pickle.dump(all_dim_node_rela, p)
    logging.info('已找出每个维度节点的属性列表和子集列表')

    # 8. Decide which candidates really are main nodes: a candidate is a
    # main node if it is an attribute of no other node, or of two or more.
    dim_main_node_check_res, group_dict, group_num_dict = dim_main_node_check(
        candidate_dim_node_list, all_attr_rela_list, all_dim_node_rela)
    # 9. Map rejected candidates to their main node. In the result, value
    # True means the key is itself a main node; any other value is the main
    # node the key belongs to.
    candidate_node_find_main_node_res = candidate_node_find_main_node(
        dim_main_node_check_res)

    # 10. Merge main nodes of the same table that derive each other.
    # NOTE: the original code also built an `ori_fd_dict` from fd_df here,
    # but it was never read afterwards; removed as dead code.
    logging.info('开始合并相同维度')
    same_tab_dim_node_merge_res = same_tab_dim_node_merge(
        candidate_node_find_main_node_res, group_dict, group_num_dict)
    logging.info('已合并相同维度')

    # 11. Arrange which dimension every node belongs to.
    logging.info('开始整理所有节点所属的维度')
    res_dict = {
        'node': [],
        'dim': [],
        'orig_dim': [],
        'type': [],
        'del_flag': []
    }
    # Attribute nodes: [(dim_node, attr), (dim_node, attr), ...]
    for attr in all_attr_rela_list:
        res_dict['node'].append(attr[1])
        if same_tab_dim_node_merge_res[attr[0]] is True:
            # attr[0] is itself a dimension main node.
            res_dict['dim'].append(attr[0])
            res_dict['orig_dim'].append(attr[0])
        else:
            # attr[0] is not a main node, possibly because same-table main
            # nodes were merged; a leading '#' marks such a merged name.
            if same_tab_dim_node_merge_res[attr[0]][0] == '#':
                res_dict['dim'].append(
                    same_tab_dim_node_merge_res[attr[0]][1:])
                res_dict['orig_dim'].append(attr[0])
            else:
                res_dict['dim'].append(same_tab_dim_node_merge_res[attr[0]])
                res_dict['orig_dim'].append(
                    same_tab_dim_node_merge_res[attr[0]])
        res_dict['type'].append('attr')
        res_dict['del_flag'].append('1')
    # Subset nodes: [(dim_node, subset), (dim_node, subset), ...]
    for subset in all_subset_rela_list:
        if subset[1] not in res_dict['node']:
            # Not an attribute of any node: keep only the subset relation.
            res_dict['node'].append(subset[1])
            if same_tab_dim_node_merge_res[subset[0]] is True:
                res_dict['dim'].append(subset[0])
                res_dict['orig_dim'].append(subset[0])
            else:
                if same_tab_dim_node_merge_res[subset[0]][0] == '#':
                    res_dict['dim'].append(
                        same_tab_dim_node_merge_res[subset[0]][1:])
                    res_dict['orig_dim'].append(subset[0])
                else:
                    res_dict['dim'].append(
                        same_tab_dim_node_merge_res[subset[0]])
                    res_dict['orig_dim'].append(
                        same_tab_dim_node_merge_res[subset[0]])
            res_dict['type'].append('subset')
            res_dict['del_flag'].append('1')
        else:
            # Already recorded as an attribute: prefer the attribute
            # relation — flag the matching attr rows (del_flag '0';
            # node_arrange interprets the flag) instead of adding a
            # duplicate subset row.
            inds = [
                ind for ind in range(len(res_dict['node']))
                if res_dict['node'][ind] == subset[1]
                and res_dict['dim'][ind] == subset[0]
            ]
            del_flag = False
            for ind in inds:
                if res_dict['type'][ind] == 'attr':
                    res_dict['del_flag'][ind] = '0'
                    del_flag = True
            if not del_flag:
                # No matching attr row for this (node, dim) pair after all:
                # record the subset relation.
                res_dict['node'].append(subset[1])
                if same_tab_dim_node_merge_res[subset[0]] is True:
                    res_dict['dim'].append(subset[0])
                    res_dict['orig_dim'].append(subset[0])
                else:
                    if same_tab_dim_node_merge_res[subset[0]][0] == '#':
                        res_dict['dim'].append(
                            same_tab_dim_node_merge_res[subset[0]][1:])
                        res_dict['orig_dim'].append(subset[0])
                    else:
                        res_dict['dim'].append(
                            same_tab_dim_node_merge_res[subset[0]])
                        res_dict['orig_dim'].append(
                            same_tab_dim_node_merge_res[subset[0]])
                res_dict['type'].append('subset')
                res_dict['del_flag'].append('1')
    after_arrange_result_dict = node_arrange(res_dict)
    logging.info('节点归属维度修改完成')

    # 12. Save the division result.
    logging.info('保存维度划分结果')
    dim_division_result_df = pd.DataFrame(after_arrange_result_dict)
    dim_division_result_df = dim_division_result_df.drop_duplicates()
    result_code = output_helper.save_dim_division_result(
        output_conn, conf.output_schema, dim_division_result_df)
    if result_code == 0:
        # Successful save: the checkpoint/temp files are no longer needed.
        del_temp_file()
        logging.info('保存维度划分结果完成')
    elif result_code == -1:
        logging.error("维度划分结果保存数据库失败")
    else:
        logging.error("维度划分结果保存数据库返回不支持的状态码")
    # Always close the database connection at the end.
    ibm_db.close(output_conn)