def test_e2e(dod, number_jps=5):
    """Interactively page through the views found for a hard-coded query.

    Calls ``dod.virtual_schema_iterative_search`` with the ``attrs`` /
    ``values`` preset selected below.  For every item the search yields, the
    join path is projected with ``dpu.project``, ``head(10)`` of the
    projection is printed, and the function blocks on ``input()`` until the
    user confirms.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)`` that yields
            ``(join path, attributes to project, metadata)`` triples.
        number_jps: Currently unused.  The cap on the number of join paths is
            disabled (see the commented-out check inside the loop); the
            parameter is kept so existing callers keep working.
    """
    # Alternative query presets used during manual testing.  Uncomment one
    # attrs/values pair (and comment out the active one) to try it.
    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]

    # # cannot search for numbers
    # attrs = ["s_name", "s_address", "ps_availqty"]
    # values = ["Supplier#000000001", "N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ", "7340"]

    # attrs = ["s_name", "s_address", "ps_comment"]
    # values = ["Supplier#000000001", "N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ",
    #           "dly final packages haggle blithely according to the pending packages. slyly regula"]

    # attrs = ["n_name", "s_name", "c_name", "o_clerk"]
    # values = ["CANADA", "Supplier#000000013", "Customer#000000005", "Clerk#000000400"]

    # attrs = ["o_clerk", "o_orderpriority", "n_name"]
    # values = ["Clerk#000000951", "5-LOW", "JAPAN"]

    # attrs = ["Subject", "Title", "Publisher"]
    # values = ["", "Man who would be king and other stories", "Oxford university press, incorporated"]

    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]

    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]

    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]

    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    # attrs = ["Neighborhood ", "Total Population ", "Graduate Degree %"]
    # values = ["Cambridgeport", "", ""]

    attrs = ["Email Address", "Department Full Name"]
    values = ["*****@*****.**", ""]

    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)

    # enumerate() supplies the running index.  The previous hand-maintained
    # counter was never incremented, so every join path was labelled "JP: 0".
    for i, (mjp, attrs_project, _metadata) in enumerate(search):
        print("JP: " + str(i))
        # Disabled cap on the number of join paths:
        # print(mjp.head(2))
        # if i > number_jps:
        #     break
        proj_view = dpu.project(mjp, attrs_project)
        print(str(proj_view.head(10)))
        print("")
        input("Press any key to continue...")
def test_e2e(dod, number_jps=5):
    """Compare every view found for a hard-coded query against the first one.

    The first join path yielded by ``dod.virtual_schema_iterative_search``
    is kept as the reference, and its key column is taken from the first
    entry returned by ``mva.most_likely_key``.  Every yielded join path is
    then passed, together with the reference, to
    ``mva.inconsistent_value_on_key``; a non-empty conflicting pair is
    printed.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search(attrs, values,
            debug_enumerate_all_jps=...)``.  This variant unpacks each
            yielded item as a ``(join path, attributes to project)`` pair.
        number_jps: Currently unused.  The cap on the number of join paths is
            disabled (see the commented-out check inside the loop); the
            parameter is kept so existing callers keep working.
    """
    # Alternative query presets used during manual testing.  Uncomment one
    # attrs/values pair (and comment out the active one) to try it.
    # attrs = ["Mit Id", "Krb Name", "Hr Org Unit Title"]
    # values = ["968548423", "kimball", "Mechanical Engineering"]

    attrs = ["Subject", "Title", "Publisher"]
    values = [
        "",
        "Man who would be king and other stories",
        "Oxford university press, incorporated",
    ]

    # attrs = ["Iap Category Name", "Person Name", "Person Email"]
    # # values = ["", "Meghan Kenney", "*****@*****.**"]
    # values = ["Engineering", "", ""]

    # attrs = ["Building Name Long", "Ext Gross Area", "Building Room", "Room Square Footage"]
    # values = ["", "", "", ""]

    # attrs = ["c_name", "c_phone", "n_name", "l_tax"]
    # values = ["Customer#000000001", "25-989-741-2988", "BRAZIL", ""]

    # attrs = ["Last Name", "Building Name", "Bldg Gross Square Footage", "Department Name"]
    # values = ["Madden", "Ray and Maria Stata Center", "", "Dept of Electrical Engineering & Computer Science"]

    first_mjp = None
    most_likely_key = None

    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)

    # enumerate() supplies the running index.  The previous hand-maintained
    # counter was never incremented, so every join path was labelled "JP: 0".
    for i, (mjp, attrs_project) in enumerate(search):
        print("JP: " + str(i))
        # Disabled cap on the number of join paths:
        # print(mjp.head(2))
        # if i > number_jps:
        #     break

        # NOTE(review): the projection is not read below; the call is kept
        # in case dpu.project has effects callers rely on -- confirm.
        proj_view = dpu.project(mjp, attrs_project)

        if i == 0:
            # The first join path becomes the reference for all comparisons.
            first_mjp = mjp
            most_likely_keys_info = mva.most_likely_key(first_mjp)
            # presumably candidates are ordered best-first -- TODO confirm
            most_likely_key = most_likely_keys_info[0][0]

        # NOTE(review): on the first iteration the reference is compared
        # against itself.
        _, _, _, conflicting_pair = mva.inconsistent_value_on_key(
            first_mjp, mjp, key=most_likely_key)
        if len(conflicting_pair) > 0:
            print(str(conflicting_pair))
def findvs():
    """Handle a POST that starts a new view search and return its first view.

    The request JSON must carry a ``payload`` field holding a JSON-encoded
    object.  Each key of that object is read positionally: character 0 is
    the row index and character 2 is the column index (so only single-digit
    indices are supported).  Cells in row 0 are attribute names; cells in
    any other row are sample values.  Only one sample row is kept, so later
    rows overwrite earlier ones column by column.

    Side effects:
        Rebinds the module globals ``view_generator`` (a fresh iterator over
        the search, created on every call) and ``matview`` (the projected
        view that was returned).

    Returns:
        ``jsonify`` response with keys ``"view"`` (HTML of the first 10 rows),
        ``"analysis"`` and ``"joingraph"``.  When the search yields no view at
        all, the same ``"no-more-views"`` response that ``next_view`` sends.
        ``None`` when the request method is not POST.
    """
    global view_generator, matview

    if request.method != 'POST':
        return None

    json_request = request.get_json()
    payload = json.loads(json_request['payload'])

    # Prepare input parameters to DoD.
    # One slot per row-0 cell gives the number of attributes ...
    list_attributes = ["" for key in payload if key[0] == "0"]
    # ... and the sample template has the same width (1 row only for now).
    list_samples = [""] * len(list_attributes)
    for key, cell in payload.items():
        row_idx = int(key[0])
        col_idx = int(key[2])
        if row_idx == 0:
            list_attributes[col_idx] = cell
        else:
            list_samples[col_idx] = cell

    # Always create a new view_generator: every call is treated as a new search.
    view_generator = iter(
        dod.virtual_schema_iterative_search(list_attributes, list_samples, {}))

    try:
        mvs, attrs_to_project, view_metadata = next(view_generator)
    except StopIteration:
        # The search produced nothing.  Reply the way next_view() does on an
        # exhausted generator instead of letting StopIteration escape.
        print("finished exploring views")
        return jsonify({"view": "no-more-views", "analysis": 'no'})

    proj_view = dpu.project(mvs, attrs_to_project)
    analysis = obtain_view_analysis(proj_view)
    html_dataframe = proj_view.head(10).to_html()

    matview = proj_view

    return jsonify({
        "view": html_dataframe,
        "analysis": analysis,
        "joingraph": view_metadata
    })
def next_view():
    """Handle a POST asking for the next view of the search in progress.

    Pulls the next item from the module-global ``view_generator`` (this
    function does not create it), projects it with ``dpu.project`` and
    returns a sample of it.

    Side effects:
        Rebinds the module global ``matview`` to the projected view.

    Returns:
        ``jsonify`` response with keys ``"view"`` (HTML of the first 10 rows),
        ``"analysis"`` and ``"joingraph"``; a ``"no-more-views"`` response
        once the generator is exhausted; ``None`` when the request method is
        not POST.
    """
    global view_generator, matview

    if request.method != 'POST':
        return

    try:
        candidate = next(view_generator)
    except StopIteration:
        print("finished exploring views")
        return jsonify({"view": "no-more-views", "analysis": 'no'})

    join_result, projection_attrs, join_graph = candidate
    projected = dpu.project(join_result, projection_attrs)
    view_analysis = obtain_view_analysis(projected)
    preview_html = projected.head(10).to_html()

    # Remember the full projected view, not just the 10-row preview.
    matview = projected

    response_body = {
        "view": preview_html,
        "analysis": view_analysis,
        "joingraph": join_graph
    }
    return jsonify(response_body)
def run_dod(dod, attrs, values, output_path, max_hops=2, name=None):
    """Run one search, optionally store every view as CSV, and print stats.

    Every join path yielded by ``dod.virtual_schema_iterative_search`` is
    projected with ``dpu.project``.  When ``output_path`` is not ``None`` the
    projection is written to ``<output_path>/view_<i>``.  Afterwards the
    elapsed wall-clock time is stored under ``perf_stats['et_runtime']`` and a
    report delimited by ``#$#`` marker lines is printed.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search``.
        attrs: Attribute names passed through to the search.
        values: Sample values passed through to the search.
        output_path: Directory prefix for the CSV files, or ``None`` to skip
            writing.
        max_hops: Forwarded to the search as ``max_hops``.
        name: Label printed on the first marker line of the report.
    """
    # NOTE(review): filled in below but not read or returned by this function.
    view_metadata_mapping = dict()
    i = 0
    # Handed to the search, then read back below for the join-graph totals.
    perf_stats = dict()
    st_runtime = time.time()
    for mjp, attrs_project, metadata in dod.virtual_schema_iterative_search(
            attrs, values, perf_stats, max_hops=max_hops,
            debug_enumerate_all_jps=False):
        proj_view = dpu.project(mjp, attrs_project)

        if output_path is not None:
            view_path = output_path + "/view_" + str(i)
            proj_view.to_csv(view_path, encoding='latin1', index=False)  # always store this
            # store metadata associated to that view
            view_metadata_mapping[view_path] = metadata

        i += 1
    et_runtime = time.time()
    perf_stats['et_runtime'] = (et_runtime - st_runtime)

    print("#$# " + str(name))
    print("#$# ")
    print("")
    pp.pprint(perf_stats)
    # The totals are only printed when the search recorded them.  Indexing
    # unconditionally raised KeyError when the keys were absent.
    if 'num_join_graphs_per_candidate_group' in perf_stats:
        total_join_graphs = sum(
            perf_stats['num_join_graphs_per_candidate_group'])
        print("Total join graphs: " + str(total_join_graphs))
    if 'materializable_join_graphs' in perf_stats:
        total_materializable_join_graphs = sum(
            perf_stats['materializable_join_graphs'])
        print("Total materializable join graphs: " +
              str(total_materializable_join_graphs))
    print("")
    print("Total views: " + str(i))
    print("#$# ")
def main(args):
    """Command-line entry point: run a search and print/store each view.

    Builds a ``DoD`` instance from the network deserialized from
    ``args.model_path``, splits ``args.list_attributes`` and
    ``args.list_values`` on ``";"`` and iterates over the search results.
    For each result, ``head(10)`` of the projected view and the metadata are
    printed.  When ``args.output_path`` is set, the projected view is written
    to ``<output_path>/view_<i>``, and the unprojected join path additionally
    to ``<output_path>/raw_view_<i>`` when ``args.full_view`` is truthy.

    Args:
        args: Parsed arguments providing ``model_path``, ``separator``,
            ``list_attributes``, ``list_values``, ``output_path``,
            ``full_view`` and ``interactive`` (compared against the string
            ``"True"``).

    Raises:
        ValueError: If the number of attributes and values differ.
    """
    model_path = args.model_path
    separator = args.separator

    store_client = StoreHandler()
    network = fieldnetwork.deserialize_network(model_path)
    dod = DoD(network=network, store_client=store_client, csv_separator=separator)

    attrs = args.list_attributes.split(";")
    values = args.list_values.split(";")
    print(attrs)
    print(values)
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(attrs) != len(values):
        raise ValueError(
            "list_attributes and list_values must have the same number of "
            "';'-separated entries (got %d and %d)" % (len(attrs), len(values)))

    search = dod.virtual_schema_iterative_search(
        attrs, values, debug_enumerate_all_jps=False)
    for i, (mjp, attrs_project, metadata) in enumerate(search):
        print("JP: " + str(i))
        proj_view = dpu.project(mjp, attrs_project)
        print(str(proj_view.head(10)))
        print("Metadata")
        print(metadata)
        if args.output_path:
            if args.full_view:
                mjp.to_csv(args.output_path + "/raw_view_" + str(i),
                           encoding='latin1', index=False)
            proj_view.to_csv(args.output_path + "/view_" + str(i),
                             encoding='latin1', index=False)  # always store this

        # The flag is compared as text, not as a bool.
        if args.interactive == "True":
            print("")
            input("Press any key to continue...")
def test_e2e(dod, attrs, values, number_jps=5, output_path=None,
             full_view=False, interactive=False):
    """Run the core search, report statistics, then terminate the process.

    Every join path yielded by ``dod.virtual_schema_iterative_search`` (called
    with ``max_hops=2``) is projected with ``dpu.project``.  When
    ``output_path`` is given, the projection is written to
    ``<output_path>/view_<i>``, and the unprojected join path additionally to
    ``<output_path>/raw_view_<i>`` when ``full_view`` is true.  After the loop
    the elapsed time is stored in ``perf_stats['runtime']``, the statistics
    are printed, and ``SystemExit`` is raised.

    The "Run 4C" section at the end is therefore unreachable; it is kept
    as-is so it can be re-enabled by removing the ``raise``.

    Args:
        dod: Object exposing ``virtual_schema_iterative_search``.
        attrs: Attribute names passed through to the search.
        values: Sample values passed through to the search.
        number_jps: Currently unused.  The cap on the number of join paths is
            disabled (see the commented-out check inside the loop).
        output_path: Directory prefix for the CSV files, or ``None`` to skip
            writing.
        full_view: Also store the unprojected join path.
        interactive: Wait for user confirmation after every view.

    Raises:
        SystemExit: Always, once the core search has been reported.
    """
    ###
    # Run Core DoD
    ###
    # NOTE(review): filled in below but not read or returned by this function.
    view_metadata_mapping = dict()
    i = 0
    # Handed to the search, then read back below for the join-graph totals.
    perf_stats = dict()
    st_runtime = time.time()
    for mjp, attrs_project, metadata in dod.virtual_schema_iterative_search(
            attrs, values, perf_stats, max_hops=2,
            debug_enumerate_all_jps=False):
        print("JP: " + str(i))
        # Disabled cap on the number of join paths:
        # print(mjp.head(2))
        # if i > number_jps:
        #     break
        proj_view = dpu.project(mjp, attrs_project)
        # print(str(proj_view.head(5)))
        # print("Metadata")
        # print(metadata)

        if output_path is not None:
            if full_view:
                raw_view_path = output_path + "/raw_view_" + str(i)
                mjp.to_csv(raw_view_path, encoding='latin1', index=False)
            view_path = output_path + "/view_" + str(i)
            proj_view.to_csv(view_path, encoding='latin1', index=False)  # always store this
            # store metadata associated to that view
            view_metadata_mapping[view_path] = metadata

        i += 1

        if interactive:
            print("")
            input("Press any key to continue...")

    et_runtime = time.time()
    perf_stats['runtime'] = (et_runtime - st_runtime)
    pp.pprint(perf_stats)
    if 'num_join_graphs_per_candidate_group' in perf_stats:
        total_join_graphs = sum(
            perf_stats['num_join_graphs_per_candidate_group'])
        print("Total join graphs: " + str(total_join_graphs))
    if 'materializable_join_graphs' in perf_stats:
        total_materializable_join_graphs = sum(
            perf_stats['materializable_join_graphs'])
        print("Total materializable join graphs: " +
              str(total_materializable_join_graphs))

    print("Total views: " + str(i))

    # Stop here.  SystemExit is raised directly rather than through the
    # `exit()` helper, which the `site` module installs for interactive use
    # and which is not available when Python runs with -S.
    raise SystemExit

    ###
    # Run 4C  (unreachable while the raise above is in place)
    ###
    # return
    groups_per_column_cardinality = v4c.main(output_path)

    for k, v in groups_per_column_cardinality.items():
        compatible_groups = v['compatible']
        contained_groups = v['contained']
        complementary_group = v['complementary']
        contradictory_group = v['contradictory']

        print("Compatible views: " + str(len(compatible_groups)))
        print("Contained views: " + str(len(contained_groups)))
        print("Complementary views: " + str(len(complementary_group)))
        print("Contradictory views: " + str(len(contradictory_group)))