def generate_candidate_solution(pairwise_max_rdc, table_index_dict, prep,
                                max_budget, schema, max_no_relationships,
                                rdc_threshold):
    """Assemble one candidate SPN ensemble for the given schema.

    The ensemble is represented as a set of
    ``(frozenset(relationship_identifiers), frozenset(table_names))`` pairs.
    It is seeded with every single relationship whose mean pairwise RDC
    exceeds ``rdc_threshold`` plus every table not covered by such a
    relationship, and then greedily extended with random larger joins until
    five candidates in a row are rejected or adding one would exceed
    ``max_budget``.

    Args:
        pairwise_max_rdc: precomputed pairwise RDC values, consumed by
            ``candidate_rdc_sum_means`` together with ``table_index_dict``.
        table_index_dict: mapping used to index into ``pairwise_max_rdc``.
        prep: data-preparation object passed through to ``learning_cost``.
        max_budget: upper bound on the accumulated learning cost of the
            randomly added joins.
        schema: schema graph providing ``relationships`` and ``tables``.
        max_no_relationships: maximum number of relationships in a random join.
        rdc_threshold: minimum mean RDC for a candidate to be considered useful.

    Returns:
        Tuple of (frozenset of ensemble candidates, total learning cost of the
        randomly added joins). Note the cost of the seed candidates is not
        counted (the corresponding additions are commented out below).
    """
    spn_relationships_list = set()
    all_merged_tables = set()
    learning_costs = 0

    # Basis of either binary or single SPNs:
    # create every relationship above threshold.
    for relationship_obj in schema.relationships:
        relationship_list = [relationship_obj.identifier]
        merged_tables = [relationship_obj.start, relationship_obj.end]
        if candidate_rdc_sum_means(pairwise_max_rdc, table_index_dict,
                                   [(relationship_list, merged_tables)]) > rdc_threshold:
            # learning_costs += learning_cost(prep, [relationship_list])
            all_merged_tables.update(merged_tables)
            spn_relationships_list.add(
                (frozenset(relationship_list), frozenset(merged_tables)))

    # Add remaining single tables (tables not covered by any accepted
    # relationship above) as single-table SPN candidates.
    for table in {table.table_name for table in schema.tables}.difference(all_merged_tables):
        # learning_costs += learning_cost(prep, None, single_table=table)
        spn_relationships_list.add((frozenset(), frozenset([table])))

    # In addition randomly select larger joins; stop after five consecutive
    # rejections.
    rejected_candidates = 0
    while rejected_candidates < 5:
        no_joins = randint(2, max_no_relationships)
        relationship_list, merged_tables = create_random_join(schema, no_joins)

        # Already in ensemble.
        if (frozenset(relationship_list), frozenset(merged_tables)) in spn_relationships_list:
            rejected_candidates += 1
            continue

        # Does not offer any benefit.
        if candidate_rdc_sum_means(pairwise_max_rdc, table_index_dict,
                                   [(relationship_list, merged_tables)]) <= rdc_threshold:
            rejected_candidates += 1
            continue

        # Estimate the cost only for candidates that survived the cheap
        # rejection checks above. NOTE(review): this assumes learning_cost is
        # a side-effect-free estimator — confirm before relying on the
        # reordering.
        current_costs = learning_cost(prep, [relationship_list])

        # Cannot be added because of budget; further (random) candidates
        # would not help either, so stop entirely.
        if learning_costs + current_costs > max_budget:
            break

        # Can be added.
        all_merged_tables.update(merged_tables)
        learning_costs += current_costs
        spn_relationships_list.add(
            (frozenset(relationship_list), frozenset(merged_tables)))

    return frozenset(spn_relationships_list), learning_costs
def prepare_sample_hdf(schema, hdf_path, max_table_data, sample_size):
    """Create down-sampled HDF copies of every table in ``schema``.

    Starting from a start table chosen via ``prep._find_start_table`` on a
    maximal random join, this draws ``sample_size`` rows and then repeatedly
    walks relationships into not-yet-sampled neighbour tables, joining each
    neighbour against the already-sampled side so the samples stay
    join-consistent. Each sampled table is written to
    ``{hdf_path}/{table}_sampled.hdf`` and an adjusted copy of the metadata
    is pickled to ``{hdf_path}/meta_data_sampled.pkl``.

    NOTE(review): relies on private ``JoinDataPreparator`` methods
    (``_find_start_table``, ``_get_table_data``) — semantics assumed from
    usage, confirm against their definitions. Assumes the schema's
    relationship graph is connected; otherwise the ``while`` loop below would
    never terminate.
    """
    meta_data_path = hdf_path + '/meta_data.pkl'
    prep = JoinDataPreparator(meta_data_path, schema, max_table_data=max_table_data)
    # Work on a deep copy so the original metadata on `prep` stays untouched.
    new_meta_data = copy.deepcopy(prep.table_meta_data)

    def correct_meta_data(table):
        # Point the metadata at the '<table>_sampled' HDF file and reset
        # relationship bookkeeping: incoming relationship entries are dropped
        # (set to None), outgoing ones get their 'length' zeroed.
        new_meta_data[table]['hdf_path'] = new_meta_data[table][
            'hdf_path'].replace(table, table + '_sampled')
        incoming_relationships = find_relationships(schema, table, incoming=True)
        for relationship_obj in incoming_relationships:
            new_meta_data[table][relationship_obj.identifier] = None
        outgoing_relationships = find_relationships(schema, table, incoming=False)
        for relationship_obj in outgoing_relationships:
            new_meta_data[table][relationship_obj.identifier]['length'] = 0

    # find first table and sample
    max_join_relationships, _ = create_random_join(schema, len(schema.relationships))
    start_table, _ = prep._find_start_table(max_join_relationships, 1)
    logger.debug(f"Creating sample for {start_table}")
    sampled_tables = {start_table}
    # Cache of table name -> sampled DataFrame, used to join neighbours below.
    df_sample_cache = dict()
    df_full_samples, _, _, _ = prep.generate_n_samples(
        sample_size, single_table=start_table, drop_redundant_columns=False)
    df_sample_cache[start_table] = df_full_samples
    df_full_samples.to_hdf(f'{hdf_path}/{start_table}_sampled.hdf', key='df',
                           format='table')
    correct_meta_data(start_table)

    # Expand outward over relationships until every table has been sampled.
    while len(sampled_tables) < len(schema.tables):
        for relationship_obj in schema.relationships:
            # Only consider frontier edges: exactly one endpoint sampled.
            if (relationship_obj.start in sampled_tables
                    and not relationship_obj.end in sampled_tables) or (
                    relationship_obj.start not in sampled_tables
                    and relationship_obj.end in sampled_tables):
                if relationship_obj.start in sampled_tables and not relationship_obj.end in sampled_tables:
                    # outgoing edge, e.g. lineorders joined, join date
                    next_joined_table = relationship_obj.end
                    logger.debug(f"Creating sample for {next_joined_table}")
                    next_table_data = prep._get_table_data(
                        prep.table_meta_data[next_joined_table]['hdf_path'],
                        next_joined_table)
                    # Qualified column names: '<table>.<attribute>'.
                    left_attribute = relationship_obj.end + '.' + relationship_obj.end_attr
                    right_attribute = relationship_obj.start + '.' + relationship_obj.start_attr
                    df_samples = df_sample_cache[relationship_obj.start]
                    # Index both sides on the join attribute, then merge the
                    # sampled side's key column against the neighbour's index.
                    df_samples = df_samples.set_index(right_attribute, drop=False)
                    next_table_data = next_table_data.set_index(left_attribute, drop=False)
                    next_table_data = df_samples.merge(next_table_data,
                                                       right_index=True,
                                                       left_on=right_attribute)
                    # only keep rows with join partner
                    next_table_data = next_table_data[next_table_data[
                        relationship_obj.end + '.'
                        + relationship_obj.multiplier_attribute_name] > 0]
                elif relationship_obj.start not in sampled_tables and relationship_obj.end in sampled_tables:
                    # Incoming edge: mirror of the branch above with the
                    # sampled side being the relationship end.
                    next_joined_table = relationship_obj.start
                    logger.debug(f"Creating sample for {next_joined_table}")
                    next_table_data = prep._get_table_data(
                        prep.table_meta_data[next_joined_table]['hdf_path'],
                        next_joined_table)
                    left_attribute = relationship_obj.end + '.' + relationship_obj.end_attr
                    right_attribute = relationship_obj.start + '.' + relationship_obj.start_attr
                    df_samples = df_sample_cache[relationship_obj.end]
                    df_samples = df_samples.set_index(left_attribute, drop=False)
                    # df_samples.index.name = None
                    next_table_data = next_table_data.set_index(
                        right_attribute, drop=False)
                    next_table_data = df_samples.merge(next_table_data,
                                                       right_index=True,
                                                       left_on=left_attribute)
                    # only keep rows with join partner
                    next_table_data = next_table_data[next_table_data[
                        relationship_obj.end + '.'
                        + relationship_obj.multiplier_attribute_name] > 0]
                # Cap the joined result at sample_size rows.
                if len(next_table_data) > sample_size:
                    next_table_data = next_table_data.sample(sample_size)
                # only keep columns of interest
                del_cols = []
                for col in next_table_data.columns:
                    if col not in prep.table_meta_data[next_joined_table][
                            'relevant_attributes_full']:
                        del_cols.append(col)
                next_table_data.drop(columns=del_cols, inplace=True)
                df_sample_cache[next_joined_table] = next_table_data
                next_table_data.to_hdf(
                    f'{hdf_path}/{next_joined_table}_sampled.hdf', key='df',
                    format='table')
                correct_meta_data(next_joined_table)
                sampled_tables.add(next_joined_table)

    # different meta data
    with open(hdf_path + '/meta_data_sampled.pkl', 'wb') as f:
        pickle.dump(new_meta_data, f, pickle.HIGHEST_PROTOCOL)