def test_save_load_feature_table():
    """Round-trip a blocking feature table through save_object/load_object.

    The reloaded table must have the same type, length and column set as
    the original, and each reloaded feature function must return the same
    value as its original counterpart on one sample tuple pair (NaN must
    map to NaN).
    """
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table0 = mg.get_features_for_blocking(A, B)
    try:
        mg.save_object(feature_table0, filename)
        feature_table1 = mg.load_object(filename)
    finally:
        # Remove the scratch pickle even when saving/loading raised, so a
        # failure here does not leave files behind for other tests.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(feature_table0), type(feature_table1))
    assert_equal(len(feature_table0), len(feature_table1))
    assert_equal(sorted(feature_table0.columns),
                 sorted(feature_table1.columns))

    ft0_functions = list(feature_table0['function'])
    ft1_functions = list(feature_table1['function'])
    ltuple = A.ix[1]
    rtuple = B.ix[2]
    for f0, f1 in zip(ft0_functions, ft1_functions):
        a = f0(ltuple, rtuple)
        b = f1(ltuple, rtuple)
        if math.isnan(a) or math.isnan(b):
            # NaN never compares equal to itself, so compare "NaN-ness".
            # This also catches the case where only the reloaded function
            # returns NaN, which the previous check let through silently.
            assert_equal(math.isnan(a), math.isnan(b))
        else:
            assert_equal(a, b)
def test_rb_block_tables():
    """Check RuleBasedBlocker.block_tables against a brute-force expectation.

    Two rules are registered; the resulting candidate set must expose the
    expected columns and metadata, and its (ltable.ID, rtable.ID) pairs must
    equal those selected by filtering feature vectors computed over all
    pairs of A and B.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    rb = mg.RuleBasedBlocker()
    feature_table = mg.get_features_for_blocking(A, B)
    rb.add_rule(['name_name_mel(ltuple, rtuple) < 0.4',
                 'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'],
                feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'], feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')

    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID',
                 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Join on a constant column so that D pairs every row of A with every
    # row of B; the feature vectors over D are the reference data.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # Expected pairs: the negation of each rule above, combined with AND.
    # Each comparison is parenthesised/named explicitly because `&` binds
    # tighter than `==`; the previous unparenthesised form evaluated
    # `(mask & exm) == 1` instead of `mask & (exm == 1)`.
    not_rule_1 = ((fv.name_name_mel >= 0.4) |
                  (fv.birth_year_birth_year_lev >= 0.5))
    not_rule_2 = (fv.zipcode_zipcode_exm == 1)
    expected_ids = fv.ix[not_rule_1 & not_rule_2, ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality instead of cmp(): works on Python 2 and 3 and shows
    # the differing pairs when the assertion fails.
    assert_equal(ids_exp, ids_act)
def test_ab_block_tables_skd():
    """Time and validate AttrEquivalenceBlocker.block_tables on 'pubYear'.

    Loads the 'bowker' and 'walmart' datasets (keyed on 'ID'), prints the
    elapsed time of each stage, and checks the output columns, metadata and
    that both sides of every surviving pair agree on 'pubYear'.

    NOTE(review): earlier variants of this test (kept as commented-out code)
    used the bikedekho_clean/bikewale_clean datasets on 'city_posted', an
    'isbn' attribute, and a `block_tables_skd` method.
    """
    t_start = time.time()
    table_a = mg.load_dataset('bowker', 'ID')
    t_a_loaded = time.time()
    print("Loading table A --- %s seconds ---" % (t_a_loaded - t_start))

    table_b = mg.load_dataset('walmart', 'ID')
    t_b_loaded = time.time()
    print("Loading table B --- %s seconds ---" % (t_b_loaded - t_a_loaded))

    blocker = mg.AttrEquivalenceBlocker()
    t_blocker = time.time()
    print("Created an AE blocker --- %s seconds ---"
          % (t_blocker - t_b_loaded))

    candset = blocker.block_tables(table_a, table_b,
                                   'pubYear', 'pubYear',
                                   'pubYear', 'pubYear')
    print("Size of candset C: %d" % (len(candset)))
    t_blocked = time.time()
    print("Block tables --- %s seconds ---" % (t_blocked - t_blocker))

    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID',
                               'ltable.pubYear', 'rtable.pubYear'])
    assert_equal(expected_columns, sorted(candset.columns))
    assert_equal(candset.get_key(), '_id')
    assert_equal(candset.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(candset.get_property('foreign_key_rtable'), 'rtable.ID')

    # Every surviving pair must carry the same pubYear on both sides.
    left_values = np.array(candset[['ltable.pubYear']])
    right_values = np.array(candset[['rtable.pubYear']])
    assert_equal(all(left_values == right_values), True)
def test_bb_block_tables():
    """Check BlackBoxBlocker.block_tables against a brute-force expectation.

    The candidate set produced with `block_fn` must expose the expected
    columns and metadata, and its (ltable.ID, rtable.ID) pairs must equal
    the pairs whose `name_name_mel` feature is >= 0.4 over all pairs of
    A and B.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')

    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID',
                 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # Join on a constant column so that D pairs every row of A with every
    # row of B; the feature vectors over D are the reference data.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality instead of cmp(): works on Python 2 and 3 and shows
    # the differing pairs when the assertion fails.
    assert_equal(ids_exp, ids_act)
def test_ab_block_candset_skd():
    """Validate candidate-set blocking on the bike datasets.

    Blocks 'bikedekho_clean' against 'bikewale_clean' on 'city_posted' via
    `block_tables_opt`, then refines the result on 'model_year' via
    `block_candset_joblib`. The refined set must keep all requested output
    attributes and metadata, and both sides of every surviving pair must
    agree on 'model_year'.
    """
    A = mg.load_dataset('bikedekho_clean', 'ID')
    B = mg.load_dataset('bikewale_clean', 'ID')
    ab = mg.AttrEquivalenceBlocker()

    # Same output attributes are requested from both tables.
    out_attrs = ['bike_name', 'city_posted', 'km_driven',
                 'price', 'color', 'model_year']
    C = ab.block_tables_opt(A, B, 'city_posted', 'city_posted',
                            list(out_attrs), list(out_attrs))
    # %-formatted single-argument print() behaves the same on Python 2
    # and 3, matching the style used by test_ab_block_tables_skd.
    print("Size of C: %d" % len(C))

    D = ab.block_candset_joblib(C, 'model_year', 'model_year')
    print("Size of D: %d" % len(D))

    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID'] +
                ['ltable.' + attr for attr in out_attrs] +
                ['rtable.' + attr for attr in out_attrs])
    assert_equal(s1, sorted(D.columns))
    assert_equal(D.get_key(), '_id')
    assert_equal(D.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(D.get_property('foreign_key_rtable'), 'rtable.ID')

    # Every surviving pair must carry the same model_year on both sides.
    k1 = np.array(D[['ltable.model_year']])
    k2 = np.array(D[['rtable.model_year']])
    assert_equal(all(k1 == k2), True)
def test_save_load_rb_blocker():
    """Round-trip a RuleBasedBlocker through save_object/load_object.

    The reloaded blocker must have the same type, rule count and rule
    sources as the original, and blocking A and B with it must give a
    candidate set of the same size and with the same columns.
    """
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table = mg.get_features_for_blocking(A, B)
    rb0 = mg.RuleBasedBlocker()
    rb0.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) == 1'], feature_table)
    rb0.add_rule(['birth_year_birth_year_anm(ltuple, rtuple) > 0.95',
                  'name_name_mel(ltuple, rtuple)> 0.4'], feature_table)
    C0 = rb0.block_tables(A, B)

    try:
        mg.save_object(rb0, filename)
        rb1 = mg.load_object(filename)
    finally:
        # Remove the scratch pickle even when saving/loading raised.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(rb0), type(rb1))
    assert_equal(len(rb0.rules), len(rb1.rules))
    assert_equal(len(rb0.rule_source), len(rb1.rule_source))
    assert_equal(rb0.rule_cnt, rb1.rule_cnt)

    C1 = rb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # Compare against C1: the previous check compared C0 with itself and
    # therefore could never fail.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
def test_save_load_rb_blocker():
    """Verify that a pickled RuleBasedBlocker behaves like the original.

    A blocker with two rules is saved and reloaded; type, rule bookkeeping
    (`rules`, `rule_source`, `rule_cnt`) and the size/columns of the
    candidate set it produces on A and B must all match the original.
    """
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table = mg.get_features_for_blocking(A, B)

    rb0 = mg.RuleBasedBlocker()
    rb0.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) == 1'], feature_table)
    rb0.add_rule([
        'birth_year_birth_year_anm(ltuple, rtuple) > 0.95',
        'name_name_mel(ltuple, rtuple)> 0.4'
    ], feature_table)
    C0 = rb0.block_tables(A, B)

    try:
        mg.save_object(rb0, filename)
        rb1 = mg.load_object(filename)
    finally:
        # Best-effort cleanup that also runs when save/load fails.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(rb0), type(rb1))
    assert_equal(len(rb0.rules), len(rb1.rules))
    assert_equal(len(rb0.rule_source), len(rb1.rule_source))
    assert_equal(rb0.rule_cnt, rb1.rule_cnt)

    C1 = rb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # The original assertion compared C0.columns with itself (always true);
    # the reloaded blocker's output is what needs checking.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
def test_save_load_bb_blocker():
    """Round-trip a BlackBoxBlocker through save_object/load_object.

    The blocker wraps a locally defined function based on the Jaccard
    similarity of 3-gram tokenised addresses. After reloading, the blocker
    must have the same type and produce a candidate set of the same size
    and with the same columns on A and B.
    """
    from magellan.feature.simfunctions import jaccard
    from magellan.feature.tokenizers import tok_qgram

    def block_fn_1(ltuple, rtuple):
        # Returns True when the address similarity is below 0.4.
        val = jaccard(tok_qgram(ltuple['address'], 3),
                      tok_qgram(rtuple['address'], 3))
        if val < 0.4:
            return True
        return False

    bb0 = mg.BlackBoxBlocker()
    bb0.set_black_box_function(block_fn_1)
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    C0 = bb0.block_tables(A, B)

    try:
        mg.save_object(bb0, filename)
        bb1 = mg.load_object(filename)
    finally:
        # Remove the scratch pickle even when saving/loading raised.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(bb0), type(bb1))
    C1 = bb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # Compare against C1: the previous check compared C0 with itself and
    # therefore could never fail.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
def test_ab_block_tuples():
    """Check AttrEquivalenceBlocker.block_tuples on two sample pairs.

    On 'zipcode', the pair (A[1], B[2]) is expected to yield False and the
    pair (A[2], B[2]) is expected to yield True.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()

    # (row label in A, row label in B, expected block_tuples result)
    cases = ((1, 2, False), (2, 2, True))
    for l_label, r_label, expected in cases:
        outcome = blocker.block_tuples(table_a.ix[l_label],
                                       table_b.ix[r_label],
                                       'zipcode', 'zipcode')
        assert_equal(outcome, expected)
def test_rb_block_tables():
    """Compare RuleBasedBlocker.block_tables with a feature-vector filter.

    Registers two blocking rules, blocks A against B, checks the output
    columns and metadata, then recomputes the expected surviving
    (ltable.ID, rtable.ID) pairs from feature vectors over all pairs and
    requires them to match the blocker's output.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    rb = mg.RuleBasedBlocker()
    feature_table = mg.get_features_for_blocking(A, B)
    rb.add_rule([
        'name_name_mel(ltuple, rtuple) < 0.4',
        'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'
    ], feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'], feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')

    s1 = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # A constant join column makes D contain every (A row, B row) pair.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # `&` has higher precedence than `==`, so the zipcode comparison must
    # be parenthesised; without it the mask was computed as
    # `(name_or_year & exm) == 1` rather than `name_or_year & (exm == 1)`.
    name_or_year_ok = ((fv.name_name_mel >= 0.4) |
                       (fv.birth_year_birth_year_lev >= 0.5))
    zipcode_ok = (fv.zipcode_zipcode_exm == 1)
    expected_ids = fv.ix[name_or_year_ok & zipcode_ok,
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # cmp() no longer exists on Python 3; plain equality is equivalent and
    # reports the mismatching pairs on failure.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tuples():
    """Check BlackBoxBlocker.block_tuples on two sample pairs.

    With `block_fn` installed, (A[0], B[0]) is expected to yield True and
    (A[2], B[1]) is expected to yield False.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(block_fn)

    # (row label in A, row label in B, expected block_tuples result)
    cases = ((0, 0, True), (2, 1, False))
    for l_label, r_label, expected in cases:
        outcome = blocker.block_tuples(table_a.ix[l_label],
                                       table_b.ix[r_label])
        assert_equal(outcome, expected)
def test_ab_block_tables_wi_no_tuples():
    """Check the shape of an empty AttrEquivalenceBlocker result.

    Blocking A and B on 'name' is expected to leave no pairs; the empty
    candidate set must still carry the id columns and key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()
    candset = blocker.block_tables(table_a, table_b, 'name', 'name')

    assert_equal(len(candset), 0)
    id_columns = ['_id', 'ltable.ID', 'rtable.ID']
    assert_equal(sorted(candset.columns), sorted(id_columns))
    assert_equal(candset.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(candset.get_property(prop_name), column)
def test_bb_block_tables_wi_no_tuples():
    """Check the shape of an empty BlackBoxBlocker.block_tables result.

    With `evil_block_fn` installed (defined elsewhere; presumably it blocks
    every pair), no pairs are expected to survive; the empty candidate set
    must still carry the id columns and key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(evil_block_fn)
    candset = blocker.block_tables(table_a, table_b)

    assert_equal(len(candset), 0)
    id_columns = ['_id', 'ltable.ID', 'rtable.ID']
    assert_equal(sorted(candset.columns), sorted(id_columns))
    assert_equal(candset.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(candset.get_property(prop_name), column)
def test_rb_block_tables_wi_no_tuples():
    """Check the shape of an empty RuleBasedBlocker.block_tables result.

    A single rule, `zipcode_zipcode_exm(...) >= 0`, is registered and the
    test expects no pair to survive it; the empty candidate set must still
    carry the id columns and key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)
    blocker.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) >= 0'], features)
    candset = blocker.block_tables(table_a, table_b)

    assert_equal(len(candset), 0)
    id_columns = ['_id', 'ltable.ID', 'rtable.ID']
    assert_equal(sorted(candset.columns), sorted(id_columns))
    assert_equal(candset.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(candset.get_property(prop_name), column)
def test_rb_block_tuples():
    """Check RuleBasedBlocker.block_tuples on two sample pairs.

    With the name/birth-year rule and the zipcode rule registered,
    (A[0], B[0]) is expected to yield False and (A[1], B[1]) True.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)

    name_and_year_rule = ['name_name_mel(ltuple, rtuple) < 0.4',
                          'birth_year_birth_year_lev(ltuple, rtuple) < 0.5']
    zipcode_rule = ['zipcode_zipcode_exm(ltuple, rtuple) != 1']
    blocker.add_rule(name_and_year_rule, features)
    blocker.add_rule(zipcode_rule, features)

    # (row label in A, row label in B, expected block_tuples result)
    cases = ((0, 0, False), (1, 1, True))
    for l_label, r_label, expected in cases:
        outcome = blocker.block_tuples(table_a.ix[l_label],
                                       table_b.ix[r_label])
        assert_equal(outcome, expected)
def test_bb_block_candset_wi_no_tuples():
    """Check the shape of an empty BlackBoxBlocker.block_candset result.

    A candidate set built on 'birth_year' is passed through a black-box
    blocker using `evil_block_fn` (defined elsewhere; presumably it blocks
    every pair). No pairs are expected to survive, and the empty result
    must still carry the id columns and key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    attr_blocker = mg.AttrEquivalenceBlocker()
    initial = attr_blocker.block_tables(table_a, table_b,
                                        'birth_year', 'birth_year')

    bb_blocker = mg.BlackBoxBlocker()
    bb_blocker.set_black_box_function(evil_block_fn)
    refined = bb_blocker.block_candset(initial)

    assert_equal(len(refined), 0)
    id_columns = ['_id', 'ltable.ID', 'rtable.ID']
    assert_equal(sorted(refined.columns), sorted(id_columns))
    assert_equal(refined.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(refined.get_property(prop_name), column)
def test_ab_block_tables():
    """Validate AttrEquivalenceBlocker.block_tables on 'zipcode'.

    The candidate set must contain the id and zipcode columns, the expected
    key metadata, and identical zipcodes on both sides of every pair.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()
    candset = blocker.block_tables(table_a, table_b,
                                   'zipcode', 'zipcode',
                                   'zipcode', 'zipcode')

    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID',
                               'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(expected_columns, sorted(candset.columns))
    assert_equal(candset.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(candset.get_property(prop_name), column)

    # Every surviving pair must carry the same zipcode on both sides.
    left_values = np.array(candset[['ltable.zipcode']])
    right_values = np.array(candset[['rtable.zipcode']])
    assert_equal(all(left_values == right_values), True)
def test_rb_block_tuples():
    """Exercise RuleBasedBlocker.block_tuples with two registered rules.

    Expects False for the pair (A[0], B[0]) and True for (A[1], B[1]).
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    rule_blocker = mg.RuleBasedBlocker()
    blocking_features = mg.get_features_for_blocking(left_table, right_table)

    rules = (
        ['name_name_mel(ltuple, rtuple) < 0.4',
         'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'],
        ['zipcode_zipcode_exm(ltuple, rtuple) != 1'],
    )
    for conjuncts in rules:
        rule_blocker.add_rule(conjuncts, blocking_features)

    first = rule_blocker.block_tuples(left_table.ix[0], right_table.ix[0])
    assert_equal(first, False)
    second = rule_blocker.block_tuples(left_table.ix[1], right_table.ix[1])
    assert_equal(second, True)
def test_ab_block_tables():
    """Block A and B on equal 'zipcode' and verify the candidate set.

    Checks the output column set, the key and foreign-key metadata, and
    that the left and right zipcode columns agree row by row.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()
    pairs = ae_blocker.block_tables(left_table, right_table,
                                    'zipcode', 'zipcode',
                                    'zipcode', 'zipcode')

    wanted = ['_id', 'ltable.ID', 'rtable.ID',
              'ltable.zipcode', 'rtable.zipcode']
    wanted = sorted(wanted)
    assert_equal(wanted, sorted(pairs.columns))

    assert_equal(pairs.get_key(), '_id')
    assert_equal(pairs.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(pairs.get_property('foreign_key_rtable'), 'rtable.ID')

    l_zip = np.array(pairs[['ltable.zipcode']])
    r_zip = np.array(pairs[['rtable.zipcode']])
    zip_matches = l_zip == r_zip
    assert_equal(all(zip_matches), True)
def test_rb_block_candset_wi_no_tuples():
    """Check the shape of an empty RuleBasedBlocker.block_candset result.

    A candidate set built on 'birth_year' is refined with the single rule
    `zipcode_zipcode_exm(...) >= 0`; the test expects no pair to survive,
    and the empty result must still carry the id columns and key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    attr_blocker = mg.AttrEquivalenceBlocker()
    initial = attr_blocker.block_tables(table_a, table_b,
                                        'birth_year', 'birth_year')

    rule_blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)
    rule_blocker.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) >= 0'],
                          features)
    refined = rule_blocker.block_candset(initial)

    assert_equal(len(refined), 0)
    id_columns = ['_id', 'ltable.ID', 'rtable.ID']
    assert_equal(sorted(refined.columns), sorted(id_columns))
    assert_equal(refined.get_key(), '_id')
    foreign_keys = (('foreign_key_ltable', 'ltable.ID'),
                    ('foreign_key_rtable', 'rtable.ID'))
    for prop_name, column in foreign_keys:
        assert_equal(refined.get_property(prop_name), column)
def test_bb_block_candset():
    """Check BlackBoxBlocker.block_candset against a feature-vector filter.

    A candidate set E built on equal 'zipcode' is refined with `block_fn`.
    The surviving (ltable.ID, rtable.ID) pairs must equal the pairs of E
    whose `name_name_mel` feature is >= 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)

    feature_table = mg.get_features_for_blocking(A, B)
    # Compute the expectation from the blocker's INPUT (E). Deriving it
    # from the output C, as before, only proved that survivors satisfy the
    # threshold and could not detect pairs that were wrongly dropped.
    fv = mg.extract_feature_vecs(E, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality instead of cmp(): works on Python 2 and 3 and shows
    # the differing pairs when the assertion fails.
    assert_equal(ids_exp, ids_act)
def test_bb_block_candset():
    """Refine a zipcode candidate set with `block_fn` and verify survivors.

    The pairs kept by BlackBoxBlocker.block_candset must be exactly the
    pairs of the input candidate set whose `name_name_mel` feature is
    >= 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)

    feature_table = mg.get_features_for_blocking(A, B)
    # Feature vectors are taken over the input candset E, not the output
    # C: filtering the output by the same predicate the blocker applies
    # cannot reveal pairs that were dropped by mistake.
    fv = mg.extract_feature_vecs(E, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # cmp() no longer exists on Python 3; plain equality is equivalent and
    # reports the mismatching pairs on failure.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tables():
    """Compare BlackBoxBlocker.block_tables with a feature-vector filter.

    Blocks A against B with `block_fn`, checks the output columns and
    metadata, then requires the surviving (ltable.ID, rtable.ID) pairs to
    equal those with `name_name_mel` >= 0.4 among all pairs of A and B.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')

    s1 = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # A constant join column makes D contain every (A row, B row) pair.
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # cmp() no longer exists on Python 3; plain equality is equivalent and
    # reports the mismatching pairs on failure.
    assert_equal(ids_exp, ids_act)