Example #1
 def time_block_tables(self):
     rb = mg.RuleBasedBlocker()
     rb.add_rule(['movie_name_movie_name_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.6'],
                 self.feature_table)
     rb.block_tables(self.A, self.B,
                     ['movie_name','year','directors','actors','critic_rating','genre','pg_rating','duration'],
                     ['movie_name','year','directors','actors','movie_rating','genre','duration'])
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B, validate_inferred_attr_types=False)
     self.rb = em.RuleBasedBlocker()
 def setUp(self):
     self.A = em.read_csv_metadata(path_for_A)
     em.set_key(self.A, 'ID')
     self.B = em.read_csv_metadata(path_for_B)
     em.set_key(self.B, 'ID')
     self.feature_table = em.get_features_for_blocking(self.A, self.B)
     self.rb = em.RuleBasedBlocker()
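
The rule strings used throughout these snippets refer to features generated by get_features_for_blocking; the names follow the pattern <l_attr>_<r_attr>_<simfunction>_<tokenizers>. To see which names are available for use in rules, the feature table can be inspected directly (a minimal sketch, assuming A and B are loaded as in the setUp above):

feature_table = em.get_features_for_blocking(A, B,
                                             validate_inferred_attr_types=False)
# The 'feature_name' column holds the identifiers that may appear in rule
# strings, e.g. 'movie_name_movie_name_jac_qgm_3_qgm_3'
print(feature_table['feature_name'].tolist())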
 def test_rb_block_candset_empty_output_njobs_2(self):
     rb = em.RuleBasedBlocker()
     rb.add_rule(rule_1, self.feature_table)
     C = rb.block_tables(self.A, self.B)
     validate_metadata(C)
     validate_data(C, expected_ids_1)
     self.rb.add_rule(rule_5, self.feature_table)
     D = self.rb.block_candset(C, n_jobs=2)
     validate_metadata_two_candsets(C, D)
     validate_data(D)
 def test_rb_block_candset_empty_input(self):
     rb = em.RuleBasedBlocker()
     rb.add_rule(rule_5, self.feature_table)
     C = rb.block_tables(self.A, self.B)
     validate_metadata(C)
     validate_data(C)
     self.rb.add_rule(rule_1, self.feature_table)
     D = self.rb.block_candset(C)
     validate_metadata_two_candsets(C, D)
     validate_data(D)
 def test_rb_block_candset_njobs_2(self):
     rb = em.RuleBasedBlocker()
     rb.add_rule(rule_1, self.feature_table)
     C = rb.block_tables(self.A, self.B, l_output_attrs,
                         r_output_attrs, l_output_prefix, r_output_prefix)
     validate_metadata(C, l_output_attrs, r_output_attrs,
                            l_output_prefix, r_output_prefix)
     validate_data(C, expected_ids_1)
     self.rb.add_rule(rule_2, self.feature_table)
     D = self.rb.block_candset(C, n_jobs=2)
     validate_metadata_two_candsets(C, D)
     validate_data(D, expected_ids_1_and_2)
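
add_rule returns the name assigned to the rule, which the other rule-management methods on RuleBasedBlocker accept; a short sketch (the rule string is taken from the examples above):

rb = em.RuleBasedBlocker()
rule_name = rb.add_rule(['movie_name_movie_name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.6'],
                        feature_table)
print(rb.get_rule_names())  # names of all rules added so far
rb.view_rule(rule_name)     # prints the source of the generated rule function
rb.delete_rule(rule_name)   # removes the rule from the blocker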
Example #7
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'books', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'books', 'B.csv'])
     try:
         self.A = mg.read_csv_metadata(path_for_A)
         mg.set_key(self.A, 'ID')
         self.B = mg.read_csv_metadata(path_for_B)
         mg.set_key(self.B, 'ID')
         self.feature_table = mg.get_features_for_blocking(self.A, self.B)
         self.rb = mg.RuleBasedBlocker()
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Example #8
 def setup(self):
     path_for_A = os.sep.join([datasets_path, 'ebooks', 'A.csv'])
     path_for_B = os.sep.join([datasets_path, 'ebooks', 'B.csv'])
     try:
         A = mg.read_csv_metadata(path_for_A)
         mg.set_key(A, 'record_id')
         B = mg.read_csv_metadata(path_for_B)
         mg.set_key(B, 'record_id')
         ob = mg.OverlapBlocker()
         self.C = ob.block_tables(A, B, 'title', 'title', overlap_size=2,
                                  rem_stop_words=True,
                                  l_output_attrs=['title', 'author', 'publisher', 'date'],
                                  r_output_attrs=['title', 'author', 'publisher', 'date'])
         feature_table = mg.get_features_for_blocking(A,B)
         self.rb = mg.RuleBasedBlocker()
         self.rb.add_rule(['date_date_lev_sim(ltuple, rtuple) < 0.6'], feature_table)
     except AssertionError:
         print("Dataset \'beer\' not found. Please visit the project "
               "website to download the dataset.")
         raise SystemExit
Example #9
    def setup(self):
        path_for_A = os.sep.join([datasets_path, 'restaurants', 'A.csv'])
        path_for_B = os.sep.join([datasets_path, 'restaurants', 'B.csv'])
        try:
            A = mg.read_csv_metadata(path_for_A)
            mg.set_key(A, 'ID')
            B = mg.read_csv_metadata(path_for_B)
            mg.set_key(B, 'ID')
            ob = mg.OverlapBlocker()
            self.C = ob.block_tables(A, B, 'ADDRESS', 'ADDRESS', overlap_size=4,
                                     l_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'],
                                     r_output_attrs=['NAME', 'PHONENUMBER', 'ADDRESS'])
            feature_table = mg.get_features_for_blocking(A,B)
            self.rb = mg.RuleBasedBlocker()
            self.rb.add_rule(['ADDRESS_ADDRESS_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.44'],
                             feature_table)
        except AssertionError:
            print("Dataset \'beer\' not found. Please visit the project "
                  "website to download the dataset.")
            raise SystemExit
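
A configured RuleBasedBlocker can also be applied to a single tuple pair with block_tuples, which is handy for sanity-checking a rule before running it over whole tables; a sketch, assuming A, B, and a blocker rb set up as in the example above:

# block_tuples returns True when the pair is blocked (i.e. dropped)
print(rb.block_tuples(A.loc[0], B.loc[0]))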
# In[41]:

A

# In[79]:

block_f = em.get_features_for_blocking(A, B)

# In[80]:

block_f

# In[81]:

rb1 = em.RuleBasedBlocker()
rule1 = 'name_name_lev_sim(ltuple,rtuple) < 0.8'
rb1.add_rule(rule1, block_f)

rb2 = em.RuleBasedBlocker()
rule2 = 'address_address_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.9'
rb2.add_rule(rule2, block_f)

# In[63]:

C1 = rb1.block_tables(A,
                      B,
                      l_output_attrs=[
                          'ID', 'name', 'address', 'ratingValue',
                          'price_range', 'number_of_reviews'
                      ],
                      r_output_attrs=[
                          'ID', 'name', 'address', 'ratingValue',
                          'price_range', 'number_of_reviews'
                      ])

 def test_rb_block_candset_no_rules(self):
     rb = em.RuleBasedBlocker()
     rb.add_rule(rule_1, self.feature_table)
     C = rb.block_tables(self.A, self.B, show_progress=False)
     # self.rb has had no rules added, so block_candset is expected to fail
     self.rb.block_candset(C)

# (truncated snippet: C1, C2 and C3 below come from blockers run over the movie
# tables with output attrs 'Title', 'Genre', 'Score', 'Release Date', 'Rating',
# 'Directed By', 'Written By', 'Studio')

# Combine the outputs from the attribute equivalence blocker and the overlap blocker.
# Take the union: even if a pair disagrees on release date, movies that share
# their names should still survive as candidates.
D = em.combine_blocker_outputs_via_union([C1, C2, C3])

# Rule based blocker after D
block_f = em.get_features_for_blocking(A,
                                       B,
                                       validate_inferred_attr_types=False)
rb = em.RuleBasedBlocker()
# print(block_f)
rb.add_rule(['Title_Title_lev_sim(ltuple, rtuple) < 0.4'], block_f)
C = rb.block_candset(D, show_progress=False)
print('Candidate Match set C Size: ', len(C))
print('Finish Blocking stage')
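
Before moving to the matcher portion, it can be worth checking whether blocking dropped likely matches; debug_blocker returns pairs that are not in the candidate set but look similar. A minimal sketch (the output_size value is an assumption):

# Pairs absent from C that nevertheless look similar: potential false drops
dbg = em.debug_blocker(C, A, B, output_size=200)
print(dbg.head())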

################################## Matcher Portion ##################################
# Open up our labeled data from the last project.
path_G = '../data/G.csv'
G = em.read_csv_metadata(path_G,
                         key='_id',
                         ltable=A,
                         rtable=B,
                         fk_ltable='ltable_ID',
                         fk_rtable='rtable_ID')
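
For reference, a labeled table like G is typically produced by sampling the candidate set and labeling the sample by hand; a sketch of that flow (the sample size here is arbitrary):

S = em.sample_table(C, 450)    # random sample from the candidate set
G = em.label_table(S, 'gold')  # opens a GUI to label each pair as 0/1
em.to_csv_metadata(G, path_G)  # persist the labels and metadata for reuse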
def workflow(path_A, path_B, path_labeled):

    # Load csv files as dataframes and set the key attribute in the dataframe
    A = em.read_csv_metadata(path_A, key='ID')
    B = em.read_csv_metadata(path_B, key='ID')

    # Run attribute equivalence blocker on brand
    ab = em.AttrEquivalenceBlocker()
    C1 = ab.block_tables(A,
                         B,
                         'Brand',
                         'Brand',
                         l_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ],
                         r_output_attrs=[
                             'Name', 'Price', 'Brand', 'Screen Size', 'RAM',
                             'Hard Drive Capacity', 'Processor Type',
                             'Processor Speed', 'Operating System',
                             'Clean Name'
                         ])

    # Get features for rule based blocking
    block_f = em.get_features_for_blocking(A,
                                           B,
                                           validate_inferred_attr_types=False)

    # Run rule based blocker with rule for jaccard score on Clean Name column
    rb = em.RuleBasedBlocker()
    rb.add_rule(
        ['Clean_Name_Clean_Name_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.2'],
        block_f)
    C2 = rb.block_candset(C1)

    # Run black box blocker to compare screen size, ram, and hard drive capacity
    bb_screen = em.BlackBoxBlocker()
    bb_screen.set_black_box_function(screen_ram_hd_equal)
    C = bb_screen.block_candset(C2)

    # Load the labeled data
    L = em.read_csv_metadata(path_labeled,
                             key='_id',
                             ltable=A,
                             rtable=B,
                             fk_ltable='ltable_ID',
                             fk_rtable='rtable_ID')

    # Generate features
    feature_table = em.get_features_for_matching(
        A, B, validate_inferred_attr_types=False)
    feature_subset = feature_table.iloc[np.r_[4:10, 40:len(feature_table)], :]
    em.add_blackbox_feature(feature_subset, 'refurbished', refurbished)

    # Extract feature vectors
    feature_vectors_dev = em.extract_feature_vecs(L,
                                                  feature_table=feature_subset,
                                                  attrs_after='gold')

    # Impute feature vectors with the mean of the column values.
    feature_vectors_dev = em.impute_table(
        feature_vectors_dev,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
        strategy='mean')

    # Train using feature vectors from the labeled data
    matcher = em.RFMatcher(name='RF')
    matcher.fit(table=feature_vectors_dev,
                exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                target_attr='gold')

    # Extract feature vectors for the rest of the data
    feature_vectors = em.extract_feature_vecs(C, feature_table=feature_subset)

    # Impute feature vectors with the mean of the column values.
    feature_vectors = em.impute_table(
        feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        strategy='mean')

    # Make predictions for the whole data set
    predictions = matcher.predict(
        table=feature_vectors,
        exclude_attrs=['_id', 'ltable_ID', 'rtable_ID'],
        append=True,
        target_attr='predicted',
        inplace=False)
    predictions = predictions.loc[:, [
        '_id', 'ltable_ID', 'rtable_ID', 'predicted'
    ]]

    return predictions[predictions['predicted'] == 1]
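
A call to the function might look like this (the paths are hypothetical):

matches = workflow('../data/A.csv', '../data/B.csv', '../data/G.csv')
print('Predicted matches:', len(matches))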
Example #14
 def time_block_tables(self):
     rb = mg.RuleBasedBlocker()
     rb.add_rule(['Title_Title_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.4'],
                 self.feature_table)
     rb.block_tables(self.A, self.B, ['Title'], ['Title'])
Example #15
 def time_block_tables_features_jac(self):
     rb = mg.RuleBasedBlocker()
     rb.add_rule(['Features_Features_jac_qgm_3_qgm_3(ltuple,rtuple) < 0.6'],
                 self.feature_table)
     rb.block_tables(self.A, self.B, ['Features'], ['Features'])
Example #16
 def time_block_tables_name_cos(self):
     rb = mg.RuleBasedBlocker()
     rb.add_rule(['Name_Name_cos_dlm_dc0_dlm_dc0(ltuple,rtuple) < 0.3'],
                 self.feature_table)
     rb.block_tables(self.A, self.B, ['Name'], ['Name'])
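
Each of these benchmarks times block_tables with a single rule. One way to gauge how aggressive such a rule is would be to compare the candidate-set size against the full cross product (a sketch, assuming A, B, and feature_table are set up as in the earlier examples):

rb = mg.RuleBasedBlocker()
rb.add_rule(['Name_Name_cos_dlm_dc0_dlm_dc0(ltuple,rtuple) < 0.3'],
            feature_table)
C = rb.block_tables(A, B, ['Name'], ['Name'])
# fraction of the |A| x |B| cross product that survives blocking
print(len(C) / (len(A) * len(B)))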