def rank_materializable_join_graphs(materializable_join_paths, table_path):
    """Score join paths by how key-like their joining columns are.

    For every annotated join path, each hop's left column is scored with the
    key-likelihood ranking of its source table (computed once per table and
    cached). Returns a list of (join_path, avg_score, mul_score) tuples sorted
    by the average score, best first.
    """

    def _lookup_score(scored_keys, column):
        # scored_keys: iterable of (column_name, nunique, score) tuples.
        # Returns the score for `column`, or None when the column is absent.
        for name, _nunique, score in scored_keys:
            if name == column:
                return score

    def _avg(values):
        return np.average(np.asarray(values))

    def _product(values):
        return reduce(operator.mul, values)

    key_scores_by_table = dict()  # table name -> cached key-likelihood ranking
    ranked = []
    for join_path in materializable_join_paths:
        hop_scores = []
        for _filter, left, _right in join_path:
            source = left.source_name
            if source not in key_scores_by_table:
                # First time we see this table: load it and rank its likely keys.
                df = dpu.get_dataframe(table_path[source] + "/" + source)
                key_scores_by_table[source] = mva.most_likely_key(df)
            hop_scores.append(
                _lookup_score(key_scores_by_table[source], left.field_name))
        ranked.append((join_path, _avg(hop_scores), _product(hop_scores)))
    # Highest average hop score first.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def rank_materializable_join_paths_piece(materializable_join_paths, candidate_group, table_path, dod):
    """Rank, hop by hop, a set of join paths by key-likelihood of join columns.

    Splits the paths into per-hop ("jump") buckets, sorts each bucket by how
    key-like the hop's right column is in its source table, then reassembles
    per-path lists from the sorted buckets.

    :param materializable_join_paths: list of join paths (each a sequence of hops)
    :param candidate_group: tables whose key rankings must be computed
    :param table_path: table name -> directory path; extended in place for
        tables resolved through the API
    :param dod: object exposing aurum_api, used to resolve missing table paths
    :return: list of per-path hop lists, assembled from the sorted buckets
    """
    # Compute rank list of likely keys for each table.
    table_keys = dict()
    table_field_rank = dict()
    for table in candidate_group:
        if table in table_path:
            path = table_path[table]
        else:
            # Resolve the table's access path via the API and memoize it.
            nid = (dod.aurum_api.make_drs(table)).data[0].nid
            path = dod.aurum_api.helper.get_path_nid(nid)
            table_path[table] = path
        table_df = dpu.get_dataframe(path + "/" + table)
        likely_keys_sorted = mva.most_likely_key(table_df)
        table_keys[table] = likely_keys_sorted
        # Map field name -> rank position (0 == most likely key).
        field_rank = {payload[0]: i for i, payload in enumerate(likely_keys_sorted)}
        table_field_rank[table] = field_rank

    # 1) Split join paths into its pairs, then 2) sort each pair individually,
    # then 3) assemble again.
    # FIX: was `sorted([len(x) for x in ...])[-1]`, which raised IndexError on
    # an empty input list; max(..., default=0) keeps behavior for non-empty
    # input and yields an empty result for empty input.
    num_jumps = max((len(x) for x in materializable_join_paths), default=0)
    jump_joins = {i: [] for i in range(num_jumps)}

    # 1) split
    for annotated_jp in materializable_join_paths:
        for i, jp in enumerate(annotated_jp):
            jump_joins[i].append(jp)

    def field_to_rank(table, field):
        # Lower rank == more likely key.
        return table_field_rank[table][field]

    # 2) sort each jump bucket independently
    for jump, joins in jump_joins.items():
        joins = sorted(joins, key=lambda x: field_to_rank(x[1].source_name, x[1].field_name))
        jump_joins[jump] = joins

    # 3) assemble: i-th entry of every bucket goes to the i-th output path
    ranked_materialized_join_paths = [[] for _ in range(len(materializable_join_paths))]
    for jump, joins in jump_joins.items():
        for i, join in enumerate(joins):
            ranked_materialized_join_paths[i].append(join)

    return ranked_materialized_join_paths
def virtual_schema_iterative_search(self, list_attributes: [str], list_samples: [str], max_hops=2, debug_enumerate_all_jps=False):
    """Enumerate materializable views that cover the requested schema definition.

    Generator. Builds filters from aligned (attribute, sample) pairs, eagerly
    groups tables so each group covers as many filters as possible, then for
    each group searches join graphs (bounded by `max_hops`) connecting the
    group's tables. Each materializable join graph is yielded as a
    (materialized_virtual_schema, attrs_to_project, view_metadata) triple.

    :param list_attributes: attribute names of the desired view
    :param list_samples: one example value per attribute, aligned by index
    :param max_hops: maximum number of join hops explored by self.joinable
    :param debug_enumerate_all_jps: when True, only print join graphs found
        per candidate group instead of materializing them
    """
    # Align schema definition and samples
    assert len(list_attributes) == len(list_samples)
    sch_def = {attr: value for attr, value in zip(list_attributes, list_samples)}
    # Sorted for determinism across runs.
    sch_def = OrderedDict(sorted(sch_def.items(), key=lambda x: x[0], reverse=True))
    filter_drs = self.joint_filters(sch_def)

    # We group now into groups that convey multiple filters.
    # Obtain list of tables ordered from more to fewer filters.
    # table_fulfilled_filters maps:
    #   table -> [((filter_name, field_name_or_None), FilterType, filter_id), ...]
    table_fulfilled_filters = defaultdict(list)
    table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
    for filter, drs in filter_drs.items():  # NOTE: `filter` shadows the builtin
        drs.set_table_mode()
        # All these tables fulfill the filter above
        for table in drs:
            # table_fulfilled_filters[table].append(filter)
            if filter[1] == FilterType.ATTR:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:
                        table_nid[table] = c.nid
                        # Deduplicate on filter id.
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [id for _, _, id in table_fulfilled_filters[table]]:
                            table_fulfilled_filters[table].append(((filter[0], None), FilterType.ATTR, filter[2]))
            elif filter[1] == FilterType.CELL:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:  # filter in this column
                        table_nid[table] = c.nid
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [id for _, _, id in table_fulfilled_filters[table]]:
                            table_fulfilled_filters[table].append(((filter[0], c.field_name), FilterType.CELL, filter[2]))

    table_path = obtain_table_paths(table_nid, self)

    # sort by value len -> # fulfilling filters
    table_fulfilled_filters = OrderedDict(
        sorted(table_fulfilled_filters.items(),
               key=lambda el: (len({filter_id for _, _, filter_id in el[1]}), el[0]),
               reverse=True))  # len of unique filters, then lexico

    # Ordering filters for more determinism
    for k, v in table_fulfilled_filters.items():
        v = sorted(v, key=lambda el: (el[2], el[0][0]), reverse=True)  # sort by id, then filter_name
        table_fulfilled_filters[k] = v

    def eager_candidate_exploration():
        # Inner generator over (candidate_group, candidate_group_filters_covered)
        # tuples, greedily built from tables covering more filters to fewer.

        def covers_filters(candidate_filters, all_filters):
            # True when the candidate covers every unique filter id requested.
            all_filters_set = set([id for _, _, id in filter_drs.keys()])
            candidate_filters_set = set([id for _, _, id in candidate_filters])
            if len(candidate_filters_set) == len(all_filters_set):
                return True
            return False

        def compute_size_filter_ix(filters, candidate_group_filters_covered):
            # Number of *new* unique filter ids `filters` would contribute.
            new_fs_set = set([id for _, _, id in filters])
            candidate_fs_set = set([id for _, _, id in candidate_group_filters_covered])
            ix_size = len(new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
            return ix_size

        def clear_state():
            # NOTE(review): clears the very lists/sets previously yielded;
            # consumers appear to use each yielded value before advancing.
            candidate_group.clear()
            candidate_group_filters_covered.clear()

        # Eagerly obtain groups of tables that cover as many filters as possible
        backup = []  # incomplete groups, yielded last as a fallback
        go_on = True
        while go_on:
            candidate_group = []
            candidate_group_filters_covered = set()
            for i in range(len(list(table_fulfilled_filters.items()))):
                table_pivot, filters_pivot = list(table_fulfilled_filters.items())[i]
                # Eagerly add pivot
                candidate_group.append(table_pivot)
                candidate_group_filters_covered.update(filters_pivot)
                # Did it cover all filters?
                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                if covers_filters(candidate_group_filters_covered, filter_drs.items()):
                    candidate_group = sorted(candidate_group)
                    # print("1: " + str(table_pivot))
                    yield (candidate_group, candidate_group_filters_covered)  # early stop
                    # Cleaning
                    clear_state()
                    continue
                # Greedily extend the pivot with the remaining tables.
                for j in range(len(list(table_fulfilled_filters.items()))):
                    idx = i + j + 1
                    if idx == len(table_fulfilled_filters.items()):
                        break
                    table, filters = list(table_fulfilled_filters.items())[idx]
                    # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                    new_filters = compute_size_filter_ix(filters, candidate_group_filters_covered)
                    if new_filters > 0:  # add table only if it adds new filters
                        candidate_group.append(table)
                        candidate_group_filters_covered.update(filters)
                    if covers_filters(candidate_group_filters_covered, filter_drs.items()):
                        # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("2: " + str(table_pivot))
                        yield (candidate_group, candidate_group_filters_covered)
                        clear_state()
                        # Re-add the current pivot, only necessary in this case
                        candidate_group.append(table_pivot)
                        candidate_group_filters_covered.update(filters_pivot)
                candidate_group = sorted(candidate_group)
                # print("3: " + str(table_pivot))
                if covers_filters(candidate_group_filters_covered, filter_drs.items()):
                    yield (candidate_group, candidate_group_filters_covered)
                else:
                    # Keep incomplete groups around; yielded after full covers.
                    backup.append(([el for el in candidate_group], set([el for el in candidate_group_filters_covered])))
                # Cleaning
                clear_state()
            # before exiting, return backup in case that may be useful
            for candidate_group, candidate_group_filters_covered in backup:
                yield (candidate_group, candidate_group_filters_covered)
            go_on = False  # finished exploring all groups

    # Find ways of joining together each group
    cache_unjoinable_pairs = defaultdict(int)  # pair -> times found unjoinable
    for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
        print("")
        print("Candidate group: " + str(candidate_group))
        num_unique_filters = len({f_id for _, _, f_id in candidate_group_filters_covered})
        print("Covers #Filters: " + str(num_unique_filters))

        if len(candidate_group) == 1:
            # Single-table group: no joins needed, materialize the table directly.
            table = candidate_group[0]
            path = table_path[table]
            materialized_virtual_schema = dpu.get_dataframe(path + "/" + table)
            attrs_to_project = dpu.obtain_attributes_to_project(candidate_group_filters_covered)
            # Create metadata to document this view
            view_metadata = dict()
            view_metadata["#join_graphs"] = 1
            # -101010 serves as a placeholder node id for a joinless graph.
            view_metadata["join_graph"] = {"nodes": [{"id": -101010, "label": table}], "edges": []}
            yield materialized_virtual_schema, attrs_to_project, view_metadata
            continue  # to go to the next group

        # Pre-check
        # TODO: with a connected components index we can pre-filter many of those groups without checking
        # group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
        max_hops = max_hops  # NOTE(review): self-assignment, has no effect
        # We find the different join graphs that would join the candidate_group
        join_graphs = self.joinable(candidate_group, cache_unjoinable_pairs, max_hops=max_hops)
        if debug_enumerate_all_jps:
            for i, group in enumerate(join_graphs):
                print("Group: " + str(i))
                for el in group:
                    print(el)
            continue  # We are just interested in all JPs for all candidate groups

        # if not graphs skip next
        if len(join_graphs) == 0:
            print("Group: " + str(candidate_group) + " is Non-Joinable with max_hops=" + str(max_hops))
            continue

        # Now we need to check every join graph individually and see if it's materializable. Only once we've
        # exhausted these join graphs we move on to the next candidate group. We know already that each of the
        # join graphs covers all tables in candidate_group, so if they're materializable we're good.
        # materializable_join_graphs = []
        for jpg in join_graphs:
            # Obtain filters that apply to this join graph
            filters = set()
            for l, r in jpg:
                if l.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[l.source_name])
                if r.source_name in table_fulfilled_filters:
                    filters.update(table_fulfilled_filters[r.source_name])
            # TODO: obtain join_graph score for diff metrics. useful for ranking later
            # rank_materializable_join_graphs(materializable_join_paths, table_path, dod)
            is_join_graph_valid = self.is_join_graph_materializable(jpg, table_fulfilled_filters)
            if is_join_graph_valid:
                attrs_to_project = dpu.obtain_attributes_to_project(filters)
                materialized_virtual_schema = dpu.materialize_join_graph(jpg, self)
                # Create metadata to document this view
                view_metadata = dict()
                view_metadata["#join_graphs"] = len(join_graphs)
                # view_metadata["join_graph"] = self.format_join_paths_pairhops(jpg)
                view_metadata["join_graph"] = self.format_join_graph_into_nodes_edges(jpg)
                yield materialized_virtual_schema, attrs_to_project, view_metadata

    print("Finished enumerating groups")
    # Report pairs found unjoinable, most frequent first.
    cache_unjoinable_pairs = OrderedDict(sorted(cache_unjoinable_pairs.items(), key=lambda x: x[1], reverse=True))
    for k, v in cache_unjoinable_pairs.items():
        print(str(k) + " => " + str(v))
def virtual_schema_iterative_search(self, list_attributes: [str], list_samples: [str], debug_enumerate_all_jps=False):
    """Enumerate materializable views covering the requested schema definition.

    Generator yielding (materialized_virtual_schema, attrs_to_project) pairs.
    Variant of the search that enumerates both join paths covering the whole
    candidate group and per-pair join path groups that are combined into
    join graphs.

    NOTE(review): this method has the same name as the max_hops variant in
    this file -- whichever is defined later in the class shadows the other.
    Confirm which definition is meant to be active.

    :param list_attributes: attribute names of the desired view
    :param list_samples: one example value per attribute, aligned by index
    :param debug_enumerate_all_jps: when True, only print join paths/graphs
        found per candidate group instead of materializing them
    """
    # Align schema definition and samples
    assert len(list_attributes) == len(list_samples)
    sch_def = {attr: value for attr, value in zip(list_attributes, list_samples)}
    # Sorted for determinism across runs.
    sch_def = OrderedDict(sorted(sch_def.items(), key=lambda x: x[0], reverse=True))
    filter_drs = self.joint_filters(sch_def)

    # We group now into groups that convey multiple filters.
    # Obtain list of tables ordered from more to fewer filters.
    # table_fulfilled_filters maps:
    #   table -> [((filter_name, field_name_or_None), FilterType, filter_id), ...]
    table_fulfilled_filters = defaultdict(list)
    table_nid = dict()  # collect nids -- used later to obtain an access path to the tables
    for filter, drs in filter_drs.items():  # NOTE: `filter` shadows the builtin
        drs.set_table_mode()
        # All these tables fulfill the filter above
        for table in drs:
            # table_fulfilled_filters[table].append(filter)
            if filter[1] == FilterType.ATTR:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:
                        table_nid[table] = c.nid
                        # Deduplicate on filter id.
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [id for _, _, id in table_fulfilled_filters[table]]:
                            table_fulfilled_filters[table].append(((filter[0], None), FilterType.ATTR, filter[2]))
            elif filter[1] == FilterType.CELL:
                columns = [c for c in drs.data]  # copy
                for c in columns:
                    if c.source_name == table:  # filter in this column
                        table_nid[table] = c.nid
                        # if filter not in table_fulfilled_filters[table]:
                        if filter[2] not in [id for _, _, id in table_fulfilled_filters[table]]:
                            table_fulfilled_filters[table].append(((filter[0], c.field_name), FilterType.CELL, filter[2]))

    table_path = obtain_table_paths(table_nid, self)

    # sort by value len -> # fulfilling filters
    table_fulfilled_filters = OrderedDict(
        sorted(table_fulfilled_filters.items(),
               key=lambda el: (len({filter_id for _, _, filter_id in el[1]}), el[0]),
               reverse=True))  # len of unique filters, then lexico

    # Ordering filters for more determinism
    for k, v in table_fulfilled_filters.items():
        v = sorted(v, key=lambda el: (el[2], el[0][0]), reverse=True)  # sort by id, then filter_name
        table_fulfilled_filters[k] = v

    def eager_candidate_exploration():
        # Inner generator over (candidate_group, candidate_group_filters_covered)
        # tuples, greedily built from tables covering more filters to fewer.
        # Unlike the max_hops variant, this one yields every assembled group
        # (no backup list for incomplete covers).

        def covers_filters(candidate_filters, all_filters):
            # True when the candidate covers every unique filter id requested.
            all_filters_set = set([id for _, _, id in filter_drs.keys()])
            candidate_filters_set = set([id for _, _, id in candidate_filters])
            if len(candidate_filters_set) == len(all_filters_set):
                return True
            return False

        def compute_size_filter_ix(filters, candidate_group_filters_covered):
            # Number of *new* unique filter ids `filters` would contribute.
            new_fs_set = set([id for _, _, id in filters])
            candidate_fs_set = set([id for _, _, id in candidate_group_filters_covered])
            ix_size = len(new_fs_set.union(candidate_fs_set)) - len(candidate_fs_set)
            return ix_size

        def clear_state():
            # NOTE(review): clears the very lists/sets previously yielded;
            # consumers appear to use each yielded value before advancing.
            candidate_group.clear()
            candidate_group_filters_covered.clear()

        # Eagerly obtain groups of tables that cover as many filters as possible
        go_on = True
        while go_on:
            candidate_group = []
            candidate_group_filters_covered = set()
            for i in range(len(list(table_fulfilled_filters.items()))):
                table_pivot, filters_pivot = list(table_fulfilled_filters.items())[i]
                # Eagerly add pivot
                candidate_group.append(table_pivot)
                candidate_group_filters_covered.update(filters_pivot)
                # Did it cover all filters?
                # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                if covers_filters(candidate_group_filters_covered, filter_drs.items()):
                    candidate_group = sorted(candidate_group)
                    # print("1: " + str(table_pivot))
                    yield (candidate_group, candidate_group_filters_covered)  # early stop
                    # Cleaning
                    clear_state()
                    continue
                # Greedily extend the pivot with the remaining tables.
                for j in range(len(list(table_fulfilled_filters.items()))):
                    idx = i + j + 1
                    if idx == len(table_fulfilled_filters.items()):
                        break
                    table, filters = list(table_fulfilled_filters.items())[idx]
                    # new_filters = len(set(filters).union(candidate_group_filters_covered)) - len(candidate_group_filters_covered)
                    new_filters = compute_size_filter_ix(filters, candidate_group_filters_covered)
                    if new_filters > 0:  # add table only if it adds new filters
                        candidate_group.append(table)
                        candidate_group_filters_covered.update(filters)
                    if covers_filters(candidate_group_filters_covered, filter_drs.items()):
                        # if len(candidate_group_filters_covered) == len(filter_drs.items()):
                        candidate_group = sorted(candidate_group)
                        # print("2: " + str(table_pivot))
                        yield (candidate_group, candidate_group_filters_covered)
                        clear_state()
                        # Re-add the current pivot, only necessary in this case
                        candidate_group.append(table_pivot)
                        candidate_group_filters_covered.update(filters_pivot)
                candidate_group = sorted(candidate_group)
                # print("3: " + str(table_pivot))
                yield (candidate_group, candidate_group_filters_covered)
                # Cleaning
                clear_state()
            go_on = False  # finished exploring all groups

    # Find ways of joining together each group
    cache_unjoinable_pairs = defaultdict(int)  # pair -> times found unjoinable
    for candidate_group, candidate_group_filters_covered in eager_candidate_exploration():
        print("")
        print("Candidate group: " + str(candidate_group))
        num_unique_filters = len({f_id for _, _, f_id in candidate_group_filters_covered})
        print("Covers #Filters: " + str(num_unique_filters))

        if len(candidate_group) == 1:
            # Single-table group: no joins needed, materialize the table directly.
            table = candidate_group[0]
            path = table_path[table]
            materialized_virtual_schema = dpu.get_dataframe(path + "/" + table)
            attrs_to_project = dpu.obtain_attributes_to_project((candidate_group_filters_covered, None))
            yield materialized_virtual_schema, attrs_to_project
            continue  # to go to the next group

        # Pre-check
        # TODO: with a connected components index we can pre-filter many of those groups without checking
        group_with_all_relations, join_path_groups = self.joinable(candidate_group, cache_unjoinable_pairs)
        if debug_enumerate_all_jps:
            print("Join paths which cover candidate group:")
            for jp in group_with_all_relations:
                print(jp)
            print("Join graphs which cover candidate group: ")
            for i, group in enumerate(join_path_groups):
                print("Group: " + str(i))
                for el in group:
                    print(el)
            continue  # We are just interested in all JPs for all candidate groups

        # if not paths or graphs skip next
        if len(join_path_groups) == 0 and len(group_with_all_relations) == 0:
            print("Group: " + str(candidate_group) + " is Non-Joinable")
            continue

        # We first check if the group_with_all_relations is materializable
        materializable_join_paths = []
        if len(group_with_all_relations) > 0:
            join_paths = self.tx_join_paths_to_pair_hops(group_with_all_relations)
            annotated_join_paths = self.annotate_join_paths_with_filter(join_paths, table_fulfilled_filters, candidate_group)
            # Check JP materialization
            print("Found " + str(len(annotated_join_paths)) + " candidate join paths")
            valid_join_paths = self.verify_candidate_join_paths(annotated_join_paths)
            print("Found " + str(len(valid_join_paths)) + " materializable join paths")
            materializable_join_paths.extend(valid_join_paths)

        # We need that at least one JP from each group is materializable
        if len(materializable_join_paths) == 0 and len(join_path_groups) == 0:
            print("No join graphs for this candidate group")
            continue

        print("Processing join graphs...")
        # materializable_join_graphs: table-pair -> valid join paths for that pair
        materializable_join_graphs = dict()
        for k, v in join_path_groups.items():
            print("Pair: " + str(k))
            join_paths = self.tx_join_paths_to_pair_hops(v)
            annotated_join_paths = self.annotate_join_paths_with_filter(join_paths, table_fulfilled_filters, candidate_group)
            # Check JP materialization
            print("Found " + str(len(annotated_join_paths)) + " candidate join paths for join graph")
            # For each candidate join_path, check whether it can be materialized or not,
            # then show to user (or the other way around)
            valid_join_paths = self.verify_candidate_join_paths(annotated_join_paths)
            print("Found " + str(len(valid_join_paths)) + " materializable join paths for join graph")
            if len(valid_join_paths) > 0:
                materializable_join_graphs[k] = valid_join_paths
            else:
                # This pair is non-materializable, but there may be other groups of pairs that cover
                # the same tables, therefore we can only continue, we cannot determine at this point that
                # the group is non-materializable, not yet.
                continue

        # Verify whether the join_graphs cover the group or not
        covered_tables = set(candidate_group)
        for k, _ in materializable_join_graphs.items():
            (t1, t2) = k
            if t1 in covered_tables:
                covered_tables.remove(t1)
            if t2 in covered_tables:
                covered_tables.remove(t2)
        if len(covered_tables) > 0:
            # now we know there are not join graphs in this group, so we explicitly mark it as such
            materializable_join_graphs.clear()
            materializable_join_graphs = list()  # next block of processing expects a list
        else:
            # 1) find key-groups: combinations of (group_size - 1) pairs
            keygroups = defaultdict(list)
            current_id = 0
            for keygroup in itertools.combinations(list(materializable_join_graphs.keys()), len(candidate_group) - 1):
                for key in keygroup:
                    keygroups[current_id].append(materializable_join_graphs[key])
                current_id += 1
            # 2) for each key-group, enumerate all paths (cartesian product)
            unit_jp = []
            for _, keygroup in keygroups.items():
                # def unpack(packed_list):
                #     for el in packed_list:
                #         yield [v[0] for v in el]
                args = keygroup
                for comb in itertools.product(*args):
                    unit_jp.append(comb)
            # pack units into more compact format
            materializable_join_graphs = []  # TODO: note we are rewriting the type of a var in scope
            for unit in unit_jp:
                packed_unit = []
                for el in unit:
                    packed_unit.append(el[0])
                materializable_join_graphs.append(packed_unit)
        print("Processing join graphs...OK")

        # Merge join paths and join graphs, at this point the difference is meaningless
        # TODO: are paths necessarily contained in graphs? if so, simplify code above
        all_jgs = materializable_join_graphs + materializable_join_paths
        print("Processing materializable join paths...")

        # Sort materializable_join_paths by likely joining on key
        # NOTE(review): the rank_materializable_join_graphs defined in this
        # file takes two parameters; this three-argument call would raise
        # TypeError against that definition -- confirm the intended helper.
        all_jgs_scores = rank_materializable_join_graphs(all_jgs, table_path, self)

        clean_jp = []
        for annotated_jp, aggr_score, mul_score in all_jgs_scores:
            jp = []
            filters = set()
            for filter, l, r in annotated_jp:
                # To drag filters along, there's a leaf special tuple where r may be None
                # since we don't need it at this point anymore, we check for its existence and do not include it
                if r is not None:
                    jp.append((l, r))
                if filter is not None:
                    filters.update(filter)
            clean_jp.append((filters, jp))

        # NOTE(review): debug dump to a local file; presumably leftover
        # instrumentation -- confirm before shipping.
        import pickle
        with open("check_debug.pkl", 'wb') as f:
            pickle.dump(clean_jp, f)

        for mjp in clean_jp:
            attrs_to_project = dpu.obtain_attributes_to_project(mjp)
            # materialized_virtual_schema = dpu.materialize_join_path(mjp, self)
            materialized_virtual_schema = dpu.materialize_join_graph(mjp, self)
            yield materialized_virtual_schema, attrs_to_project

    print("Finished enumerating groups")
    # Report pairs found unjoinable, most frequent first.
    cache_unjoinable_pairs = OrderedDict(sorted(cache_unjoinable_pairs.items(), key=lambda x: x[1], reverse=True))
    for k, v in cache_unjoinable_pairs.items():
        print(str(k) + " => " + str(v))