def run_quora_blocking(sampler="iterative", lsh_args=None, sequential_args=None):
    if sampler not in ("iterative", "naive"):
        raise ValueError("Sampler should be iterative or naive (completely random).")

    # Load the training set according to the sampler.
    em.del_catalog()
    lhs_table = em.read_csv_metadata(
        "../data/processed_quora/quora_" + sampler + "_X_train_lhs.csv"
    ).rename(columns={"Unnamed: 0": "id_lhs"}).sample(n=1500, random_state=52)
    rhs_table = em.read_csv_metadata(
        "../data/processed_quora/quora_" + sampler + "_X_train_rhs.csv"
    ).rename(columns={"Unnamed: 0": "id_rhs"}).sample(n=1500, random_state=52)
    y_train = pd.read_csv("../data/processed_quora/quora_" + sampler + "_y_train.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")
    n_train = lhs_table.shape[0]

    # Blocking
    blocking_cols = ["question1", "question2"]
    feature_cols = [["question1"], ["question2"]]
    id_names = ["qid1", "qid2"]
    lsh_blocking_col_ids = 1

    print("Blocking Train Set of Quora using LSH only.")
    candidates = lsh_blocking(lhs_table, rhs_table, lsh_blocking_col_ids, 2,
                              ["qid1", "qid2"],
                              char_ngram=lsh_args["char_ngram"],
                              seeds=lsh_args["seeds"],
                              bands=lsh_args["bands"])
    print(f"Generated candidate set has {candidates.shape[0]} rows")

    # The rest of the pipeline is not implemented yet; raise (not return) the error.
    raise NotImplementedError

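# Hypothetical usage sketch (not from the source): run_quora_blocking reads
# exactly three keys out of lsh_args, so a minimal call might look like the
# lines below. The numeric settings are illustrative assumptions, and the
# call still ends in NotImplementedError because the pipeline is unfinished.
# run_quora_blocking(sampler="iterative",
#                    lsh_args={"char_ngram": 5, "seeds": 100, "bands": 5})
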
def setup(self):
    path_for_A = os.sep.join([datasets_path, 'music', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'music', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'Sno')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'Sno')
        # Note: the column is spelled 'CopyRight' in A but 'Copyright' in B.
        l_output_attrs = ['Album_Name', 'Artist_Name', 'CopyRight', 'Released',
                          'Song_Name', 'Time']
        r_output_attrs = ['Album_Name', 'Artist_Name', 'Copyright', 'Released',
                          'Song_Name', 'Time']
        C = ob.block_tables(A, B, 'Album_Name', 'Album_Name',
                            rem_stop_words=True,
                            l_output_attrs=l_output_attrs,
                            r_output_attrs=r_output_attrs)
        self.D = ob.block_candset(C, 'Artist_Name', 'Artist_Name',
                                  rem_stop_words=True)
    except AssertionError:
        print("Dataset 'music' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    p = mg.get_install_path()
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                      'color', 'model_year']
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'id')
        C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                            l_output_attrs, r_output_attrs)
        self.D = ab.block_candset(C, 'model_year', 'model_year')
        bb.set_black_box_function(_bikes_function)
    except AssertionError:
        print("Dataset 'bikes' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(
        self.A, self.B, validate_inferred_attr_types=False)
    self.rb = em.RuleBasedBlocker()

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'id')
        l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                          'color', 'model_year']
        r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                          'color', 'model_year']
        self.C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                                 l_output_attrs, r_output_attrs)
    except AssertionError:
        print("Dataset 'bikes' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'model_year'
    self.r_block_attr = 'model_year'

def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1,
                             allow_missing=True)
    validate_metadata(C)
    validate_data(C, expected_ids_4)
    D = self.ob.block_candset(C, l_overlap_attr_2, r_overlap_attr_2,
                              rem_stop_words=True, overlap_size=4)
    validate_metadata_two_candsets(C, D)
    validate_data(D, expected_ids_2)

def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.feature_table = em.get_features_for_blocking(self.A, self.B)
    self.rb = em.RuleBasedBlocker()

def test_ab_block_tuples_wi_missing_values_disallow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    assert_equal(self.ab.block_tuples(A.loc[0], B.loc[0],
                                      l_block_attr_1, r_block_attr_1), True)
    assert_equal(self.ab.block_tuples(A.loc[1], B.loc[2],
                                      l_block_attr_1, r_block_attr_1), False)
    assert_equal(self.ab.block_tuples(A.loc[2], B.loc[1],
                                      l_block_attr_1, r_block_attr_1), True)
    assert_equal(self.ab.block_tuples(A.loc[0], B.loc[1],
                                      l_block_attr_1, r_block_attr_1), True)
    assert_equal(self.ab.block_tuples(A.loc[2], B.loc[2],
                                      l_block_attr_1, r_block_attr_1), True)

def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
    self.feature_table = em.get_features_for_matching(
        self.A, self.B, validate_inferred_attr_types=False)
    self.brm = em.BooleanRuleMatcher()

def read_tables(params):
    global table_A, table_B
    basedir = params['basedir']
    apath = os.path.join(basedir, params['apath'])
    bpath = os.path.join(basedir, params['bpath'])
    table_A = em.read_csv_metadata(apath, key='id')
    table_B = em.read_csv_metadata(bpath, key='id')

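# A minimal sketch of the params dict read_tables expects; the three keys
# match what the function reads, while the directory and file names are
# illustrative assumptions.
# example_params = {'basedir': 'datasets',
#                   'apath': 'A.csv',   # left table with an 'id' key column
#                   'bpath': 'B.csv'}   # right table with an 'id' key column
# read_tables(example_params)  # fills the table_A / table_B globals
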
def load_known_dataset(dataset_name):
    dataset_dtls = configs.er_dataset_details[dataset_name]
    l_file_name = dataset_dtls["dataset_folder_path"] + dataset_dtls["ltable_file_name"]
    r_file_name = dataset_dtls["dataset_folder_path"] + dataset_dtls["rtable_file_name"]
    # Assumption: key name is always "id".
    A = em.read_csv_metadata(l_file_name, key="id", encoding='utf-8')
    B = em.read_csv_metadata(r_file_name, key="id", encoding='utf-8')
    return A, B

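# load_known_dataset assumes configs.er_dataset_details maps a dataset name
# to its folder and file names. An illustrative (hypothetical) entry, not the
# real config:
# configs.er_dataset_details = {
#     "example_dataset": {
#         "dataset_folder_path": "datasets/example_dataset/",
#         "ltable_file_name": "tableA.csv",
#         "rtable_file_name": "tableB.csv",
#     },
# }
# A, B = load_known_dataset("example_dataset")
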
def setUp(self):
    self.A = em.read_csv_metadata(path_for_A)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_for_B)
    em.set_key(self.B, 'ID')
    self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
    self.feature_table = em.get_features_for_matching(
        self.A, self.B, validate_inferred_attr_types=False)
    self.C['neg_trig_labels'] = [1] * 15
    self.C['pos_trig_labels'] = [0] * 15
    self.mt = em.MatchTrigger()

def main():
    A = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    B = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')

    # Block on equality of the 'directors' attribute.
    ab = em.AttrEquivalenceBlocker()
    shared_attributes = ['title', 'directors', 'release_year', 'languages']
    C = ab.block_tables(A, B, 'directors', 'directors',
                        l_output_attrs=shared_attributes,
                        r_output_attrs=shared_attributes)

    # Take a sample of 100 candidate pairs and label them.
    S = em.sample_table(C, 100)
    print(S)
    G = em.label_table(S, label_column_name='gold_labels')
    train_test = em.split_train_test(G, train_proportion=0.5)
    train, test = train_test['train'], train_test['test']

    # Get features for matching.
    match_f = em.get_features_for_matching(A, B)
    H = em.extract_feature_vecs(train,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    H.fillna(value=0, inplace=True)
    print(H)

    # Specify a matcher and perform matching.
    dt = em.DTMatcher(max_depth=5)  # a decision tree matcher

    # Train the matcher.
    dt.fit(table=H,
           exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'ltable_title',
                          'rtable_title', 'gold_labels'],
           target_attr='gold_labels')

    # Predict on the held-out test pairs.
    F = em.extract_feature_vecs(test,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    F.fillna(value=0, inplace=True)
    print(F)
    pred_table = dt.predict(table=F,
                            exclude_attrs=['_id', 'ltable_id', 'rtable_id',
                                           'ltable_title', 'rtable_title',
                                           'gold_labels'],
                            target_attr='predicted_labels',
                            return_probs=True,
                            probs_attr='proba',
                            append=True,
                            inplace=True)
    print(pred_table)

    # Evaluate the predictions against the gold labels.
    eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels')
    em.print_eval_summary(eval_summary)

def setup(self):
    path_for_a = os.sep.join([DATASET_PATH, 'books', 'A.csv'])
    path_for_b = os.sep.join([DATASET_PATH, 'books', 'B.csv'])
    try:
        self.A = em.read_csv_metadata(path_for_a)
        self.B = em.read_csv_metadata(path_for_b)
        self.size = 2000
        self.y_param = 2
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def test_ob_block_tables_wi_missing_values_disallow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
    validate_metadata(C)
    validate_data(C, expected_ids_1)

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'citations', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'citations', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'citations' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'beer', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'beer', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'Label')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'Label')
        bb.set_black_box_function(_beer_function)
    except AssertionError:
        print("Dataset 'beer' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
        self.feature_table = mg.get_features_for_blocking(self.A, self.B)
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def test_ob_block_tuples_wi_missing_vals_disallow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    # DataFrame.ix was removed from pandas; .loc is the equivalent here.
    assert_equal(self.ob.block_tuples(A.loc[1], B.loc[3],
                                      l_overlap_attr_1, r_overlap_attr_1), True)
    assert_equal(self.ob.block_tuples(A.loc[3], B.loc[2],
                                      l_overlap_attr_1, r_overlap_attr_1), True)
    assert_equal(self.ob.block_tuples(A.loc[3], B.loc[3],
                                      l_overlap_attr_1, r_overlap_attr_1), True)

def test_ab_block_tables_wi_missing_values_allow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1,
                             l_output_attrs, r_output_attrs,
                             l_output_prefix, r_output_prefix, True)
    validate_metadata(C, l_output_attrs, r_output_attrs,
                      l_output_prefix, r_output_prefix)
    validate_data(C, expected_ids_3)

def test_ab_block_candset_wi_missing_values_disallow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1)
    validate_metadata(C)
    validate_data(C, expected_ids_4)
    D = self.ab.block_candset(C, l_block_attr_2, r_block_attr_2)
    validate_metadata_two_candsets(C, D)
    validate_data(D, [('a5', 'b5')])

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    self.l_block_attr = 'PHONENUMBER'
    self.r_block_attr = 'PHONENUMBER'
    self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    self.l_block_attr = 'Year'
    self.r_block_attr = 'Year'
    self.l_output_attrs = ['Title', 'Year', 'Episodes']
    self.r_output_attrs = ['Title', 'Year', 'Episodes']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    self.l_block_attr = 'Brand'
    self.r_block_attr = 'Brand'
    self.l_output_attrs = ['Brand', 'Amazon_Price']
    self.r_output_attrs = ['Brand', 'Price']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    path_for_C = os.sep.join([datasets_path, 'books', 'C.csv'])
    try:
        self.A = em.read_csv_metadata(path_for_A)
        em.set_key(self.A, 'ID')
        self.B = em.read_csv_metadata(path_for_B)
        em.set_key(self.B, 'ID')
        self.C = em.read_csv_metadata(path_for_C)
        cm.set_candset_properties(self.C, '_id', 'ltable_ID', 'rtable_ID',
                                  self.A, self.B)
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit  # bail out on a missing dataset, like the other setups

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        self.C = snb.block_tables(A, B, 'Author', 'Author',
                                  ['Title', 'Author', 'ISBN13', 'Publisher'],
                                  ['Title', 'Author', 'ISBN13', 'Publisher'])
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'ISBN13'
    self.r_block_attr = 'ISBN13'

def load_data(left_file_name, right_file_name, label_file_name, blocking_fn,
              include_self_join=False):
    A = em.read_csv_metadata(left_file_name, key="id", encoding='iso-8859-1')
    B = em.read_csv_metadata(right_file_name, key="id", encoding='iso-8859-1')
    try:
        G = pd.read_csv(label_file_name)
    except FileNotFoundError:  # narrow the original bare except: no labels available
        G = None
    C = blocking_fn(A, B)
    if include_self_join:
        C_A = blocking_fn(A, A)
        C_B = blocking_fn(B, B)
        return A, B, G, C, C_A, C_B
    return A, B, G, C

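# Sketch of a blocking_fn that load_data can accept: any callable that takes
# the two tables and returns a candidate set works. This one uses
# py_entitymatching's OverlapBlocker on a hypothetical 'title' attribute; the
# attribute and file names below are illustrative assumptions.
def _overlap_title_blocker(left, right):
    ob = em.OverlapBlocker()
    return ob.block_tables(left, right, 'title', 'title', overlap_size=2)

# A, B, G, C = load_data('A.csv', 'B.csv', 'labels.csv', _overlap_title_blocker)
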
def setup(self):
    p = mg.get_install_path()
    path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        self.C = ab.block_tables(A, B, 'Brand', 'Brand',
                                 ['Brand', 'Amazon_Price'],
                                 ['Brand', 'Price'])
        bb.set_black_box_function(_electronics_function)
    except AssertionError:
        print("Dataset 'electronics' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        self.C = ab.block_tables(A, B, 'Year', 'Year',
                                 ['Title', 'Year', 'Episodes'],
                                 ['Title', 'Year', 'Episodes'])
    except AssertionError:
        print("Dataset 'anime' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'Episodes'
    self.r_block_attr = 'Episodes'

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
    self.l_block_attr = 'Author'
    self.r_block_attr = 'Author'
    self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                           'Publication_Date']
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'ID')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'ID')
    except AssertionError:
        print("Dataset 'books' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit

def read_files():
    # Read in the data files.
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    # Candidate pairs that survived the blocking step.
    C = em.read_csv_metadata(FOLDER + 'C.csv', key='_id', ltable=A, rtable=B,
                             fk_ltable='l_id', fk_rtable='r_id')
    # Labeled data.
    G = em.read_csv_metadata(FOLDER + 'G.csv', key='_id', ltable=A, rtable=B,
                             fk_ltable='l_id', fk_rtable='r_id')
    return A, B, C, G

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'ID')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'ID')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                 l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                 r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.44'],
                         feature_table)
    except AssertionError:
        print("Dataset 'restaurants' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
    try:
        A = mg.read_csv_metadata(path_for_A)
        mg.set_key(A, 'record_id')
        B = mg.read_csv_metadata(path_for_B)
        mg.set_key(B, 'record_id')
        ob = mg.OverlapBlocker()
        self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                 rem_stop_words=True,
                                 l_output_attrs=['title', 'author', 'publisher', 'date'],
                                 r_output_attrs=['title', 'author', 'publisher', 'date'])
        feature_table = mg.get_features_for_blocking(A, B)
        self.rb = mg.RuleBasedBlocker()
        self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'],
                         feature_table)
    except AssertionError:
        print("Dataset 'ebooks' not found. Please visit the project "
              "website to download the dataset.")
        raise SystemExit

def setup(self):
    p = mg.get_install_path()
    path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
    path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
    try:
        self.A = mg.read_csv_metadata(path_for_A)
        mg.set_key(self.A, 'id')
        self.B = mg.read_csv_metadata(path_for_B)
        mg.set_key(self.B, 'id')
    except AssertionError:
        print("Dataset 'bikes' not found. Please visit the project"
              " website to download the dataset.")
        raise SystemExit
    self.l_block_attr = 'city_posted'
    self.r_block_attr = 'city_posted'
    self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                           'color', 'model_year']
    self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                           'color', 'model_year']

def test_ab_block_tuples_wi_missing_values_allow_missing(self):
    path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_A_wi_missing_vals.csv'])
    path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                          'table_B_wi_missing_vals.csv'])
    A = em.read_csv_metadata(path_a)
    em.set_key(A, 'ID')
    B = em.read_csv_metadata(path_b)
    em.set_key(B, 'ID')
    # DataFrame.ix was removed from pandas; .loc is the equivalent here.
    assert_equal(self.ab.block_tuples(A.loc[0], B.loc[0], l_block_attr_1,
                                      r_block_attr_1, allow_missing=True),
                 False)
    assert_equal(self.ab.block_tuples(A.loc[1], B.loc[2], l_block_attr_1,
                                      r_block_attr_1, allow_missing=True),
                 False)
    assert_equal(self.ab.block_tuples(A.loc[2], B.loc[1], l_block_attr_1,
                                      r_block_attr_1, allow_missing=True),
                 False)
    assert_equal(self.ab.block_tuples(A.loc[0], B.loc[1], l_block_attr_1,
                                      r_block_attr_1, allow_missing=True),
                 False)
    assert_equal(self.ab.block_tuples(A.loc[2], B.loc[2], l_block_attr_1,
                                      r_block_attr_1, allow_missing=True),
                 True)

def setUp(self):
    self.A = em.read_csv_metadata(path_a)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_b)
    em.set_key(self.B, 'ID')
    self.ob = em.OverlapBlocker()

def setUp(self):
    self.A = em.read_csv_metadata(path_a)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_b)
    em.set_key(self.B, 'ID')
    self.ab = em.AttrEquivalenceBlocker()

def setUp(self):
    self.A = em.read_csv_metadata(path_a)
    em.set_key(self.A, 'ID')
    self.B = em.read_csv_metadata(path_b)
    em.set_key(self.B, 'ID')
    self.bb = em.BlackBoxBlocker()