def test_e2e(dod, number_jps=5):
    """Run a hard-coded view search and print key conflicts between views.

    Iterates over ``dod.virtual_schema_iterative_search`` for a fixed
    attribute/value query. The first yielded view becomes the reference:
    the key used for comparison is element ``[0][0]`` of what
    ``mva.most_likely_key`` returns for it. Every yielded view (including
    the reference itself) is then compared against the reference with
    ``mva.inconsistent_value_on_key``; a non-empty conflicting pair is
    printed.

    :param dod: object exposing ``virtual_schema_iterative_search``.
    :param number_jps: currently unused; the limit check it belonged to is
        disabled.
    """
    attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    values = ["968548423", "kimball", "Mechanical Engineering"]

    # Alternative query kept for manual experimentation:
    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    # NOTE(review): the counter increment and the `number_jps` cut-off are
    # disabled, so the label printed for every join path is always 0.
    jp_label = 0
    reference_view = None
    reference_key = None

    views = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for position, (view, _attrs_project) in enumerate(views):
        print(f"JP: {jp_label}")

        if position == 0:
            # The first view is the baseline all later views are checked against.
            reference_view = view
            reference_key = mva.most_likely_key(reference_view)[0][0]

        # Only the last element of the 4-tuple is inspected here.
        _, _, _, conflicts = mva.inconsistent_value_on_key(
            reference_view, view, key=reference_key)
        if len(conflicts) > 0:
            print(conflicts)
def rank_materializable_join_graphs(materializable_join_paths, table_path):
    """Score each join path by its join columns and rank by average score.

    Every hop of a join path is a 3-item tuple; only the second item (the
    left side) is used. Its table (``source_name``) is loaded once with
    ``dpu.get_dataframe(table_path[table] + "/" + table)`` and passed to
    ``mva.most_likely_key``, whose result is iterated as 3-item entries
    ``(column, <unused>, score)``. The hop's score is the score of the first
    entry whose column equals the left side's ``field_name``.

    :param materializable_join_paths: iterable of join paths (iterables of
        3-item hops).
    :param table_path: mapping from table name to the path prefix it is
        stored under.
    :return: list of ``(join_path, average_score, product_score)`` tuples,
        sorted by ``average_score`` in descending order.
    """
    ranked_keys_by_table = {}
    scored_paths = []

    for join_path in materializable_join_paths:
        hop_scores = []
        for _hop_filter, left, _right in join_path:
            table = left.source_name
            if table not in ranked_keys_by_table:
                # Load and analyse each table only once per call.
                frame = dpu.get_dataframe(table_path[table] + "/" + table)
                ranked_keys_by_table[table] = mva.most_likely_key(frame)

            wanted_field = left.field_name
            # NOTE(review): yields None when the field is not listed; the
            # aggregations below presumably cannot handle that -- confirm.
            hop_score = next(
                (score
                 for column, _, score in ranked_keys_by_table[table]
                 if wanted_field == column),
                None,
            )
            hop_scores.append(hop_score)

        average_score = np.average(np.asarray(hop_scores))
        # NOTE(review): reduce() has no initializer, so a join path with no
        # hops raises TypeError here.
        product_score = reduce(operator.mul, hop_scores)
        scored_paths.append((join_path, average_score, product_score))

    # Highest average first; ties keep their input order (stable sort).
    scored_paths.sort(key=operator.itemgetter(1), reverse=True)
    return scored_paths
def rank_materializable_join_paths_piece(materializable_join_paths,
                                         candidate_group, table_path, dod):
    """Re-rank join paths hop position by hop position.

    For every table in ``candidate_group`` the table is loaded with
    ``dpu.get_dataframe`` and analysed with ``mva.most_likely_key``; the
    position of each entry in that result (keyed by the entry's first
    element) is the field's rank. The hops of all join paths are then
    grouped by their position in the path, each group is sorted by the rank
    of the hop's second item (``x[1].source_name`` / ``x[1].field_name``),
    and output path ``i`` is assembled from the ``i``-th hop of every group.

    Note that, because each position is sorted independently, an output
    path can combine hops that came from different input paths.

    :param materializable_join_paths: sequence of join paths, each a
        sequence of hops. May be empty, in which case ``[]`` is returned.
    :param candidate_group: iterable of table names to rank fields for.
    :param table_path: mapping table name -> path prefix. Used as a cache:
        paths resolved through ``dod`` are stored back into it.
    :param dod: object whose ``aurum_api`` resolves paths for tables that
        are missing from ``table_path``.
    :return: list with one (possibly empty) list of hops per input path.
    :raises KeyError: if a hop refers to a table or field that was not
        ranked (i.e. not covered by ``candidate_group``).
    """
    # Rank the fields of every candidate table.
    table_field_rank = {}
    for table in candidate_group:
        if table in table_path:
            path = table_path[table]
        else:
            nid = dod.aurum_api.make_drs(table).data[0].nid
            path = dod.aurum_api.helper.get_path_nid(nid)
            table_path[table] = path  # remember the path for later calls
        table_df = dpu.get_dataframe(path + "/" + table)
        likely_keys_sorted = mva.most_likely_key(table_df)
        table_field_rank[table] = {
            payload[0]: rank
            for rank, payload in enumerate(likely_keys_sorted)
        }

    # 1) split: group hops by their position within the join path.
    # `default=0` keeps an empty input from raising; it simply yields [].
    num_jumps = max((len(jp) for jp in materializable_join_paths), default=0)
    jump_joins = [[] for _ in range(num_jumps)]
    for annotated_jp in materializable_join_paths:
        for position, hop in enumerate(annotated_jp):
            jump_joins[position].append(hop)

    def hop_rank(hop):
        # Rank of the field on the hop's second item within its table.
        return table_field_rank[hop[1].source_name][hop[1].field_name]

    # 2) sort each position independently, then 3) assemble: output path i
    # receives the i-th ranked hop of every position that has one.
    ranked_materialized_join_paths = [
        [] for _ in range(len(materializable_join_paths))
    ]
    for hops in jump_joins:
        for i, hop in enumerate(sorted(hops, key=hop_rank)):
            ranked_materialized_join_paths[i].append(hop)
    return ranked_materialized_join_paths
def test_e2e(dod, number_jps=5):
    """Run a hard-coded view search and print key conflicts between views.

    Iterates over ``dod.virtual_schema_iterative_search`` for a fixed
    attribute/value query. Each yielded view is projected with
    ``dpu.project`` (the projection itself is not used further). The first
    yielded view becomes the reference: the key used for comparison is
    element ``[0][0]`` of what ``mva.most_likely_key`` returns for it. Every
    yielded view (including the reference itself) is compared against the
    reference with ``mva.inconsistent_value_on_key``; a non-empty
    conflicting pair is printed.

    :param dod: object exposing ``virtual_schema_iterative_search``.
    :param number_jps: currently unused; the limit check it belonged to is
        disabled.
    """
    attrs = ["Subject", "Title", "Publisher"]
    values = [
        "",
        "Man who would be king and other stories",
        "Oxford university press, incorporated",
    ]

    # Alternative queries kept for manual experimentation:
    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]
    #
    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]
    #
    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]
    #
    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]
    #
    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    # NOTE(review): the counter increment and the `number_jps` cut-off are
    # disabled, so the label printed for every join path is always 0.
    jp_label = 0
    reference_view = None
    reference_key = None

    views = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for position, (view, attrs_project) in enumerate(views):
        print(f"JP: {jp_label}")

        # Result is not used below; the call is kept as in the original.
        dpu.project(view, attrs_project)

        if position == 0:
            # The first view is the baseline all later views are checked against.
            reference_view = view
            reference_key = mva.most_likely_key(reference_view)[0][0]

        # Only the last element of the 4-tuple is inspected here.
        _, _, _, conflicts = mva.inconsistent_value_on_key(
            reference_view, view, key=reference_key)
        if len(conflicts) > 0:
            print(conflicts)