Code Example #1
def run_quora_blocking(sampler="iterative", lsh_args=None, sequential_args=None):
    if sampler not in ("iterative", "naive"):
        raise ValueError("Sampler should be iterative or naive (completely random).")

    # Load Training Set according to sampler
    em.del_catalog()
    lhs_table = em.read_csv_metadata("../data/processed_quora/quora_" + sampler + "_X_train_lhs.csv").rename(columns = {"Unnamed: 0":"id_lhs"}).sample(n = 1500, random_state = 52)
    rhs_table = em.read_csv_metadata("../data/processed_quora/quora_" + sampler + "_X_train_rhs.csv").rename(columns = {"Unnamed: 0":"id_rhs"}).sample(n = 1500, random_state = 52)
    y_train = pd.read_csv("../data/processed_quora/quora_" + sampler + "_y_train.csv")
    em.del_catalog()
    em.set_key(lhs_table, "id_lhs")
    em.set_key(rhs_table, "id_rhs")

    n_train  = lhs_table.shape[0]

    # Blocking
    blocking_cols = ["question1","question2"]
    feature_cols  = [["question1"],
                     ["question2"]]
    id_names = ["qid1","qid2"]
    lsh_blocking_col_ids = 1

    print("Blocking Train Set of Quora using LSH only.") 
    candidates = lsh_blocking(lhs_table, rhs_table, lsh_blocking_col_ids, 2, ["qid1","qid2"], char_ngram = lsh_args["char_ngram"], seeds = lsh_args["seeds"], bands = lsh_args["bands"])
    print(f"Generated Candidate size has {candidates.shape[0]} rows")
    
    
    return NotImplementedError
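A minimal call sketch for run_quora_blocking above. The lsh_args keys (char_ngram, seeds, bands) are exactly the ones the function reads; the concrete values and the availability of the lsh_blocking helper are assumptions for illustration.

# Illustrative values only; tune char_ngram/seeds/bands for your data.
lsh_args = {"char_ngram": 4, "seeds": 100, "bands": 5}
candidates = run_quora_blocking(sampler="iterative", lsh_args=lsh_args)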
Code Example #2
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'music', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'music', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'Sno')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'Sno')
         l_output_attrs = [
             'Album_Name', 'Artist_Name', 'CopyRight', 'Released',
             'Song_Name', 'Time'
         ]
         r_output_attrs = [
             'Album_Name', 'Artist_Name', 'Copyright', 'Released',
             'Song_Name', 'Time'
         ]
         C = ob.block_tables(A,
                             B,
                             'Album_Name',
                             'Album_Name',
                             rem_stop_words=True,
                             l_output_attrs=l_output_attrs,
                             r_output_attrs=r_output_attrs)
         self.D = ob.block_candset(C,
                                   'Artist_Name',
                                   'Artist_Name',
                                   rem_stop_words=True)
     except AssertionError:
         print("Dataset \'music\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #3
 def setup(self):
     p = mg.get_install_path()
     path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
     l_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     r_output_attrs = [
         'bike_name', 'city_posted', 'km_driven', 'price', 'color',
         'model_year'
     ]
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'id')
         C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                             l_output_attrs, r_output_attrs)
         self.D = ab.block_candset(C, 'model_year', 'model_year')
         bb.set_black_box_function(_bikes_function)
     except AssertionError:
         print("Dataset \'bikes\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #4
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B, validate_inferred_attr_types=False)
     self.rb = em.RuleBasedBlocker()
Code Example #5
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'id')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'id')
            l_output_attrs = [
                'bike_name', 'city_posted', 'km_driven', 'price', 'color',
                'model_year'
            ]
            r_output_attrs = [
                'bike_name', 'city_posted', 'km_driven', 'price', 'color',
                'model_year'
            ]
            self.C = ab.block_tables(A, B, 'city_posted', 'city_posted',
                                     l_output_attrs, r_output_attrs)
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'model_year'
        self.r_block_attr = 'model_year'
Code Example #6
 def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A,
                              B,
                              l_overlap_attr_1,
                              r_overlap_attr_1,
                              allow_missing=True)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ob.block_candset(C,
                               l_overlap_attr_2,
                               r_overlap_attr_2,
                               rem_stop_words=True,
                               overlap_size=4)
     validate_metadata_two_candsets(C, D)
     validate_data(D, expected_ids_2)
Code Example #7
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B)
     self.rb = em.RuleBasedBlocker()
Code Example #8
 def test_ab_block_tuples_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     assert_equal(
         self.ab.block_tuples(A.loc[0], B.loc[0], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[1], B.loc[2], l_block_attr_1,
                              r_block_attr_1), False)
     assert_equal(
         self.ab.block_tuples(A.loc[2], B.loc[1], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[0], B.loc[1], l_block_attr_1,
                              r_block_attr_1), True)
     assert_equal(
         self.ab.block_tuples(A.loc[2], B.loc[2], l_block_attr_1,
                              r_block_attr_1), True)
Code Example #9
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.brm = em.BooleanRuleMatcher()
Code Example #10
def read_tables(params):
    global table_A, table_B
    
    basedir = params['basedir']
    apath = os.path.join(basedir, params['apath'])
    bpath = os.path.join(basedir, params['bpath'])
    
    table_A = em.read_csv_metadata(apath, key='id')
    table_B = em.read_csv_metadata(bpath, key='id')
Code Example #11
def load_known_dataset(dataset_name):
    dataset_dtls = configs.er_dataset_details[dataset_name]
    l_file_name = dataset_dtls["dataset_folder_path"] + dataset_dtls["ltable_file_name"]
    r_file_name = dataset_dtls["dataset_folder_path"] + dataset_dtls["rtable_file_name"]

    #Assumption: key name is always "id"
    A = em.read_csv_metadata(l_file_name , key="id", encoding='utf-8')
    B = em.read_csv_metadata(r_file_name , key="id", encoding='utf-8')

    return A, B
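A sketch of what one configs.er_dataset_details entry could look like, inferred only from the keys load_known_dataset reads; the dataset name and paths below are hypothetical.

# Hypothetical entry in configs.er_dataset_details:
# "example_dataset": {
#     "dataset_folder_path": "../data/example_dataset/",
#     "ltable_file_name": "tableA.csv",
#     "rtable_file_name": "tableB.csv",
# }
A, B = load_known_dataset("example_dataset")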
Code Example #12
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(self.A, self.B, validate_inferred_attr_types=False)
     self.C['neg_trig_labels'] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     self.C['pos_trig_labels'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
     self.mt = em.MatchTrigger()
Code Example #13
File: pipeline.py, Project: is4ac/cs839-data-science
def main():
    A = em.read_csv_metadata('../Data/A_imdb.csv', key='id')
    B = em.read_csv_metadata('../Data/B_tmdb.csv', key='id')
    ab = em.AttrEquivalenceBlocker()
    shared_attributes = ['title', 'directors', 'release_year', 'languages']
    C = ab.block_tables(A,
                        B,
                        'directors',
                        'directors',
                        l_output_attrs=shared_attributes,
                        r_output_attrs=shared_attributes)
    # Take a sample of 100 candidate pairs
    S = em.sample_table(C, 100)
    print(S)
    G = em.label_table(S, label_column_name='gold_labels')
    train_test = em.split_train_test(G, train_proportion=0.5)
    train, test = train_test['train'], train_test['test']
    # Get feature for matching
    match_f = em.get_features_for_matching(A, B)
    H = em.extract_feature_vecs(train,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    H.fillna(value=0, inplace=True)
    print(H)
    # Specifying Matchers and Performing Matching.
    dt = em.DTMatcher(max_depth=5)  # A decision tree matcher.
    # Train the matcher
    dt.fit(table=H,
           exclude_attrs=[
               '_id', 'ltable_id', 'rtable_id', 'ltable_title', 'rtable_title',
               'gold_labels'
           ],
           target_attr='gold_labels')
    # Predict
    F = em.extract_feature_vecs(test,
                                attrs_before=['ltable_title', 'rtable_title'],
                                feature_table=match_f,
                                attrs_after=['gold_labels'])
    F.fillna(value=0, inplace=True)
    print(F)
    pred_table = dt.predict(table=F,
                            exclude_attrs=[
                                '_id', 'ltable_id', 'rtable_id',
                                'ltable_title', 'rtable_title', 'gold_labels'
                            ],
                            target_attr='predicted_labels',
                            return_probs=True,
                            probs_attr='proba',
                            append=True,
                            inplace=True)
    print(pred_table)
    eval_summary = em.eval_matches(pred_table, 'gold_labels',
                                   'predicted_labels')
    em.print_eval_summary(eval_summary)
Code Example #14
 def setup(self):
     path_for_a = os.sep.join([DATASET_PATH, 'books', 'A.csv'])
     path_for_b = os.sep.join([DATASET_PATH, 'books', 'B.csv'])
     try:
         self.A = em.read_csv_metadata(path_for_a)
         self.B = em.read_csv_metadata(path_for_b)
         self.size = 2000
         self.y_param = 2
     except AssertionError:
         print("Dataset \'books\' not found. Please visit the project website to download the dataset.")
         raise SystemExit
Code Example #15
 def test_ob_block_tables_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_1)
Code Example #16
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'citations', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'citations', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #17
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'beer', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'beer', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'Label')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'Label')
         bb.set_black_box_function(_beer_function)
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #18
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.feature_table = mg.get_features_for_blocking(self.A, self.B)
     except AssertionError:
         print("Dataset \'electronics\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #19
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.feature_table = mg.get_features_for_blocking(self.A, self.B)
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #20
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
     self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'restaurants\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #21
 def test_ob_block_tuples_wi_missing_vals_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     assert_equal(self.ob.block_tuples(A.loc[1], B.loc[3], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
     assert_equal(self.ob.block_tuples(A.loc[3], B.loc[2], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
     assert_equal(self.ob.block_tuples(A.loc[3], B.loc[3], l_overlap_attr_1,
                                       r_overlap_attr_1), True)
Code Example #22
 def test_ab_block_tables_wi_missing_values_allow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1,
                              l_output_attrs, r_output_attrs,
                              l_output_prefix, r_output_prefix, True)
     validate_metadata(C, l_output_attrs, r_output_attrs,
                       l_output_prefix, r_output_prefix)
     validate_data(C, expected_ids_3)
Code Example #23
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.C = em.read_csv_metadata(path_for_C, ltable=self.A, rtable=self.B)
     self.feature_table = em.get_features_for_matching(
         self.A, self.B, validate_inferred_attr_types=False)
     self.C['neg_trig_labels'] = [
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
     ]
     self.C['pos_trig_labels'] = [
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     ]
     self.mt = em.MatchTrigger()
Code Example #24
 def test_ab_block_candset_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ab.block_candset(C, l_block_attr_2, r_block_attr_2)
     validate_metadata_two_candsets(C, D)
     validate_data(D, [('a5','b5')])
Code Example #25
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
     self.l_block_attr = 'PHONENUMBER'
     self.r_block_attr = 'PHONENUMBER'
     self.l_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     self.r_output_attrs = ['NAME', 'PHONENUMBER', 'ADDRESS']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'restaurants\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #26
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
     self.l_block_attr = 'Year'
     self.r_block_attr = 'Year'
     self.l_output_attrs = ['Title', 'Year', 'Episodes']
     self.r_output_attrs = ['Title', 'Year', 'Episodes']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #27
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
     self.l_block_attr = 'Year'
     self.r_block_attr = 'Year'
     self.l_output_attrs = ['Title', 'Year', 'Episodes']
     self.r_output_attrs = ['Title', 'Year', 'Episodes']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'anime\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #28
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     self.l_block_attr = 'Brand'
     self.r_block_attr = 'Brand'
     self.l_output_attrs = ['Brand', 'Amazon_Price']
     self.r_output_attrs = ['Brand', 'Price']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'electronics\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #29
 def test_ob_block_tables_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A, B, l_overlap_attr_1, r_overlap_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_1)
Code Example #30
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     self.l_block_attr = 'Brand'
     self.r_block_attr = 'Brand'
     self.l_output_attrs = ['Brand', 'Amazon_Price']
     self.r_output_attrs = ['Brand', 'Price']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'electronics\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #31
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
        path_for_C = os.sep.join([datasets_path, 'books', 'C.csv'])

        try:
            self.A = em.read_csv_metadata(path_for_A)
            em.set_key(self.A, 'ID')
            self.B = em.read_csv_metadata(path_for_B)
            em.set_key(self.B, 'ID')
            self.C = em.read_csv_metadata(path_for_C)
            cm.set_candset_properties(self.C, '_id', 'ltable_ID', 'rtable_ID',
                                      self.A, self.B)
        except AssertionError:
            print("Dataset \'books\' not found. Please visit the project"
                  " website to download the dataset.")
Code Example #32
 def test_ob_block_candset_wi_missing_vals_disallow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ob.block_tables(A, B, l_overlap_attr_1,
                              r_overlap_attr_1, allow_missing=True)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ob.block_candset(C, l_overlap_attr_2, r_overlap_attr_2,
                               rem_stop_words=True, overlap_size=4)
     validate_metadata_two_candsets(C, D)
     validate_data(D, expected_ids_2)
Code Example #33
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            self.C = snb.block_tables(A, B, 'Author', 'Author',
                                     ['Title', 'Author', 'ISBN13', 'Publisher'],
                                     ['Title', 'Author', 'ISBN13', 'Publisher'])
        except AssertionError:
            print("Dataset \'books\' not found. Please visit the project"
                  " website to download the dataset.")        
            raise SystemExit

        self.l_block_attr = 'ISBN13'
        self.r_block_attr = 'ISBN13'
Code Example #34
File: data_loader.py, Project: chu-data-lab/zeroer
def load_data(left_file_name,
              right_file_name,
              label_file_name,
              blocking_fn,
              include_self_join=False):
    A = em.read_csv_metadata(left_file_name, key="id", encoding='iso-8859-1')
    B = em.read_csv_metadata(right_file_name, key="id", encoding='iso-8859-1')
    try:
        G = pd.read_csv(label_file_name)
    except:
        G = None
    C = blocking_fn(A, B)
    if include_self_join:
        C_A = blocking_fn(A, A)
        C_B = blocking_fn(B, B)
        return A, B, G, C, C_A, C_B
    else:
        return A, B, G, C
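A possible invocation of load_data. The file names and the 'title'-based OverlapBlocker below are illustrative assumptions; any callable that takes the two tables and returns a candidate set would work as blocking_fn.

import py_entitymatching as em

def overlap_block(A, B):
    # Keep pairs whose titles share at least two non-stop-word tokens
    ob = em.OverlapBlocker()
    return ob.block_tables(A, B, 'title', 'title',
                           overlap_size=2, rem_stop_words=True)

A, B, G, C = load_data("tableA.csv", "tableB.csv", "matches.csv", overlap_block)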
Code Example #35
 def setup(self):
     p = mg.get_install_path()
     path_for_A = os.sep.join([datasets_path, 'electronics', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'electronics', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'ID')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'ID')
         self.C = ab.block_tables(A, B, 'Brand', 'Brand',
                                  ['Brand', 'Amazon_Price'],
                                  ['Brand', 'Price'])
         bb.set_black_box_function(_electronics_function)
     except AssertionError:
         print(
             "Dataset \'electronics\' not found. Please visit the project "
             "website to download the dataset.")
         raise SystemExit
Code Example #36
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'anime', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'anime', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            self.C = ab.block_tables(A, B, 'Year', 'Year',
                                     ['Title', 'Year', 'Episodes'],
                                     ['Title', 'Year', 'Episodes'])
        except AssertionError:
            print("Dataset \'anime\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'Episodes'
        self.r_block_attr = 'Episodes'
Code Example #37
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
     self.l_block_attr = 'Author'
     self.r_block_attr = 'Author'
     self.l_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                            'Publication_Date']
     self.r_output_attrs = ['Title', 'Author', 'ISBN13', 'Publisher',
                            'Publication_Date']
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'books\' not found. Please visit the project"
               " website to download the dataset.")        
         raise SystemExit
Code Example #38
def read_files():
    # Read in data files
    A = em.read_csv_metadata(FOLDER + 'A.csv', key='id')  # imdb data
    B = em.read_csv_metadata(FOLDER + 'B.csv', key='id')  # tmdb data
    C = em.read_csv_metadata(
        FOLDER + 'C.csv',
        key='_id',
        ltable=A,
        rtable=B,
        fk_ltable='l_id',
        fk_rtable='r_id')  # candidates that survive blocking step
    G = em.read_csv_metadata(FOLDER + 'G.csv',
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='l_id',
                             fk_rtable='r_id')  # labeled data

    return A, B, C, G
Code Example #39
 def test_ab_block_tables_wi_missing_values_allow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1,
                              l_output_attrs, r_output_attrs,
                              l_output_prefix, r_output_prefix, True)
     validate_metadata(C, l_output_attrs, r_output_attrs, l_output_prefix,
                       r_output_prefix)
     validate_data(C, expected_ids_3)
Code Example #40
 def test_ab_block_candset_wi_missing_values_disallow_missing(self):
     path_a = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_A_wi_missing_vals.csv'
     ])
     path_b = os.sep.join([
         p, 'tests', 'test_datasets', 'blocker',
         'table_B_wi_missing_vals.csv'
     ])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     C = self.ab.block_tables(A, B, l_block_attr_1, r_block_attr_1)
     validate_metadata(C)
     validate_data(C, expected_ids_4)
     D = self.ab.block_candset(C, l_block_attr_2, r_block_attr_2)
     validate_metadata_two_candsets(C, D)
     validate_data(D, [('a5', 'b5')])
Code Example #41
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            self.C = snb.block_tables(
                A, B, 'Author', 'Author',
                ['Title', 'Author', 'ISBN13', 'Publisher'],
                ['Title', 'Author', 'ISBN13', 'Publisher'])
        except AssertionError:
            print("Dataset \'books\' not found. Please visit the project"
                  " website to download the dataset.")
            raise SystemExit

        self.l_block_attr = 'ISBN13'
        self.r_block_attr = 'ISBN13'
Code Example #42
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            ob = mg.OverlapBlocker()
            self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                     l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                     r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
            feature_table = mg.get_features_for_blocking(A, B)
            self.rb = mg.RuleBasedBlocker()
            self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
                             feature_table)
        except AssertionError:
            print("Dataset \'beer\' not found. Please visit the project "
                  "website to download the dataset.")
            raise SystemExit
Code Example #43
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
     self.l_block_attr = 'Author'
     self.r_block_attr = 'Author'
     self.l_output_attrs = [
         'Title', 'Author', 'ISBN13', 'Publisher', 'Publication_Date'
     ]
     self.r_output_attrs = [
         'Title', 'Author', 'ISBN13', 'Publisher', 'Publication_Date'
     ]
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
     except AssertionError:
         print("Dataset \'books\' not found. Please visit the project"
               " website to download the dataset.")
         raise SystemExit
Code Example #44
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'record_id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'record_id')
         ob = mg.OverlapBlocker()
         self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                  rem_stop_words=True,
                                  l_output_attrs=['title', 'author', 'publisher', 'date'],
                                  r_output_attrs=['title', 'author', 'publisher', 'date'])
         feature_table = mg.get_features_for_blocking(A,B)
         self.rb = mg.RuleBasedBlocker()
         self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Code Example #45
    def setup(self):
        p = mg.get_install_path()
        path_for_A = os.sep.join([datasets_path, 'bikes', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'bikes', 'B.csv'])
        try:
            self.A = mg.read_csv_metadata(path_for_A)
            mg.set_key(self.A, 'id')
            self.B = mg.read_csv_metadata(path_for_B)
            mg.set_key(self.B, 'id')
        except AssertionError:
            print("Dataset \'bikes\' not found. Please visit the project"
                  " website to download the dataset.")        
            raise SystemExit

        self.l_block_attr = 'city_posted'
        self.r_block_attr = 'city_posted'
        self.l_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']
        self.r_output_attrs = ['bike_name', 'city_posted', 'km_driven', 'price',
                               'color', 'model_year']
Code Example #46
 def test_ab_block_tuples_wi_missing_values_allow_missing(self):
     path_a = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_A_wi_missing_vals.csv'])
     path_b = os.sep.join([p, 'tests', 'test_datasets', 'blocker',
                           'table_B_wi_missing_vals.csv'])
     A = em.read_csv_metadata(path_a)
     em.set_key(A, 'ID')
     B = em.read_csv_metadata(path_b)
     em.set_key(B, 'ID')
     assert_equal(self.ab.block_tuples(A.loc[0], B.loc[0], l_block_attr_1,
                                       r_block_attr_1, allow_missing=True),
                  False)
     assert_equal(self.ab.block_tuples(A.loc[1], B.loc[2], l_block_attr_1,
                                       r_block_attr_1, allow_missing=True),
                  False)
     assert_equal(self.ab.block_tuples(A.loc[2], B.loc[1], l_block_attr_1,
                                       r_block_attr_1, allow_missing=True),
                  False)
     assert_equal(self.ab.block_tuples(A.loc[0], B.loc[1], l_block_attr_1,
                                       r_block_attr_1, allow_missing=True),
                  False)
     assert_equal(self.ab.block_tuples(A.loc[2], B.loc[2], l_block_attr_1,
                                       r_block_attr_1, allow_missing=True),
                  True)
Code Example #47
 def setUp(self):
     self.A = em.read_csv_metadata(path_a)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_b)
     em.set_key(self.B, 'ID')
     self.ob = em.OverlapBlocker()
Code Example #48
 def setUp(self):
     self.A = em.read_csv_metadata(path_a)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_b)
     em.set_key(self.B, 'ID')
     self.ab = em.AttrEquivalenceBlocker()
Code Example #49
 def setUp(self):
     self.A = em.read_csv_metadata(path_a)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_b)
     em.set_key(self.B, 'ID')
     self.bb = em.BlackBoxBlocker()