Example #1
0
def test_save_load_feature_table():
    """Round-trip a blocking feature table through save_object/load_object.

    Checks that the reloaded table has the same type, length and column
    names as the original, and that each reloaded feature function returns
    the same value as its original counterpart on one sample tuple pair
    (NaN results must be NaN on both sides).
    """
    #mg.init_jvm()

    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table0 = mg.get_features_for_blocking(A, B)
    try:
        mg.save_object(feature_table0, filename)
        feature_table1 = mg.load_object(filename)
    finally:
        # Remove the scratch file even when save/load raises; a file that
        # was never created is not an error.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(feature_table0), type(feature_table1))
    assert_equal(len(feature_table0), len(feature_table1))
    assert_equal(sorted(feature_table0.columns), sorted(feature_table1.columns))

    ft0_functions = list(feature_table0['function'])
    ft1_functions = list(feature_table1['function'])
    # The sample tuples do not change between iterations, so fetch them once.
    ltuple = A.ix[1]
    rtuple = B.ix[2]
    for f0, f1 in zip(ft0_functions, ft1_functions):
        a = f0(ltuple, rtuple)
        b = f1(ltuple, rtuple)
        if math.isnan(a):
            # NaN never compares equal to itself, so check it explicitly.
            assert_equal(math.isnan(b), True)
        else:
            # Also fails when only the reloaded function returns NaN, a case
            # the previous pair of independent `if`s silently accepted.
            assert_equal(a, b)
def test_rb_block_tables():
    """Check the schema and surviving pairs of RuleBasedBlocker.block_tables.

    The candidate set must expose the key/foreign-key metadata and the
    requested 'zipcode' output attributes, and its (ltable.ID, rtable.ID)
    pairs must match the pairs selected directly from the feature vectors.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    rb = mg.RuleBasedBlocker()
    feature_table = mg.get_features_for_blocking(A, B)
    rb.add_rule(['name_name_mel(ltuple, rtuple) < 0.4',
                 'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'],
                feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'],
                feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')
    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Block on a constant column to obtain the reference pair set
    # (presumably every A x B pair -- confirm against AttrEquivalenceBlocker).
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # Expected survivors: the negation of the two rules added above.  Every
    # comparison is parenthesised because `&` binds tighter than `==`; the
    # unparenthesised form evaluated `(... & exm) == 1` instead.
    survives = (((fv.name_name_mel >= 0.4) |
                 (fv.birth_year_birth_year_lev >= 0.5)) &
                (fv.zipcode_zipcode_exm == 1))
    expected_ids = fv.ix[survives, ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality replaces `cmp(...) == 0`: `cmp` does not exist on
    # Python 3 and assert_equal can report the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_ab_block_tables_skd():
    """Time attribute-equivalence blocking on 'pubYear' and check the result.

    Prints the wall-clock time of each stage (loading both tables, creating
    the blocker, blocking), then checks the candidate set's columns, key
    metadata and that both 'pubYear' columns agree row by row.
    """
    # Other combinations tried during development: the 'bikedekho_clean' /
    # 'bikewale_clean' datasets on 'city_posted', and the 'isbn' attribute,
    # both through block_tables_skd.
    t_start = time.time()
    A = mg.load_dataset('bowker', 'ID')
    t_a_loaded = time.time()
    print("Loading table A --- %s seconds ---" % (t_a_loaded - t_start))

    B = mg.load_dataset('walmart', 'ID')
    t_b_loaded = time.time()
    print("Loading table B --- %s seconds ---" % (t_b_loaded - t_a_loaded))

    blocker = mg.AttrEquivalenceBlocker()
    t_blocker = time.time()
    print("Created an AE blocker --- %s seconds ---" % (t_blocker - t_b_loaded))

    block_attr = 'pubYear'
    C = blocker.block_tables(A, B, block_attr, block_attr, block_attr, block_attr)
    print("Size of candset C: %d" % (len(C)))
    t_blocked = time.time()
    print("Block tables --- %s seconds ---" % (t_blocked - t_blocker))

    expected_columns = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.pubYear', 'rtable.pubYear'])
    assert_equal(expected_columns, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Every retained pair must carry the same value on both sides.
    left_values = np.array(C[['ltable.pubYear']])
    right_values = np.array(C[['rtable.pubYear']])
    assert_equal(all(left_values == right_values), True)
def test_bb_block_tables():
    """Check the schema and surviving pairs of BlackBoxBlocker.block_tables.

    Uses the module-level `block_fn` as the black-box function.  The
    candidate set must expose the key/foreign-key metadata and the 'zipcode'
    output attributes, and its ID pairs must equal the pairs whose
    `name_name_mel` feature value is at least 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')
    s1 = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # Block on a constant column to obtain the reference pair set
    # (presumably every A x B pair -- confirm against AttrEquivalenceBlocker).
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4), ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality replaces `cmp(...) == 0`: `cmp` does not exist on
    # Python 3 and assert_equal can report the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_ab_block_candset_skd():
    """Check block_candset_joblib on a candidate set from block_tables_opt.

    Blocks the two bike datasets on 'city_posted', then re-blocks the
    resulting candidate set on 'model_year'.  The final candidate set must
    keep all requested output attributes and key metadata, and both
    'model_year' columns must agree row by row.
    """
    # An earlier variant of this test used table_A/table_B with the
    # 'zipcode' and 'birth_year' attributes and block_candset_skd.
    A = mg.load_dataset('bikedekho_clean', 'ID')
    B = mg.load_dataset('bikewale_clean', 'ID')
    ab = mg.AttrEquivalenceBlocker()
    out_attrs = ['bike_name', 'city_posted', 'km_driven', 'price', 'color',
                 'model_year']
    # Pass independent lists for the left and right output attributes, as
    # the original call did.
    C = ab.block_tables_opt(A, B, 'city_posted', 'city_posted',
                            list(out_attrs), list(out_attrs))
    # print() call form works on both Python 2 and 3; the two spaces after
    # the colon reproduce the output of the former print statement.
    print("Size of C:  %d" % len(C))
    D = ab.block_candset_joblib(C, 'model_year', 'model_year')
    print("Size of D:  %d" % len(D))

    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID'] +
                ['ltable.' + attr for attr in out_attrs] +
                ['rtable.' + attr for attr in out_attrs])
    assert_equal(s1, sorted(D.columns))
    assert_equal(D.get_key(), '_id')
    assert_equal(D.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(D.get_property('foreign_key_rtable'), 'rtable.ID')

    # Every retained pair must carry the same model year on both sides.
    k1 = np.array(D[['ltable.model_year']])
    k2 = np.array(D[['rtable.model_year']])
    assert_equal(all(k1 == k2), True)
Example #6
0
def test_save_load_rb_blocker():
    """Round-trip a RuleBasedBlocker through save_object/load_object.

    The reloaded blocker must have the same type, rule count and rule
    sources as the original, and blocking table_A/table_B with it must give
    a candidate set of the same size and with the same columns.
    """
    #mg.init_jvm()
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table = mg.get_features_for_blocking(A, B)
    rb0 = mg.RuleBasedBlocker()
    rb0.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) == 1'], feature_table)
    rb0.add_rule(['birth_year_birth_year_anm(ltuple, rtuple) > 0.95',
                  'name_name_mel(ltuple, rtuple)> 0.4'],
                 feature_table)
    C0 = rb0.block_tables(A, B)
    try:
        mg.save_object(rb0, filename)
        rb1 = mg.load_object(filename)
    finally:
        # Remove the scratch file even when save/load raises; a file that
        # was never created is not an error.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(rb0), type(rb1))
    assert_equal(len(rb0.rules), len(rb1.rules))
    assert_equal(len(rb0.rule_source), len(rb1.rule_source))
    assert_equal(rb0.rule_cnt, rb1.rule_cnt)

    C1 = rb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # Compare against the reloaded blocker's output; the previous assertion
    # compared C0's columns with themselves and could never fail.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
Example #7
0
def test_save_load_rb_blocker():
    """Verify that a saved and reloaded RuleBasedBlocker behaves the same.

    Compares type, rule bookkeeping (`rules`, `rule_source`, `rule_cnt`) and
    the size and columns of the candidate sets produced before and after the
    save/load round trip.
    """
    #mg.init_jvm()
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table = mg.get_features_for_blocking(A, B)
    rb0 = mg.RuleBasedBlocker()
    rb0.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) == 1'], feature_table)
    rb0.add_rule([
        'birth_year_birth_year_anm(ltuple, rtuple) > 0.95',
        'name_name_mel(ltuple, rtuple)> 0.4'
    ], feature_table)
    C0 = rb0.block_tables(A, B)
    try:
        mg.save_object(rb0, filename)
        rb1 = mg.load_object(filename)
    finally:
        # Best-effort cleanup that also runs when save/load raises.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(rb0), type(rb1))
    assert_equal(len(rb0.rules), len(rb1.rules))
    assert_equal(len(rb0.rule_source), len(rb1.rule_source))
    assert_equal(rb0.rule_cnt, rb1.rule_cnt)

    C1 = rb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # The right-hand side must be C1: comparing C0 with itself is a
    # tautology and never exercised the reloaded blocker's columns.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
Example #8
0
def test_save_load_bb_blocker():
    """Round-trip a BlackBoxBlocker through save_object/load_object.

    The blocker uses a locally defined black-box function based on the
    3-gram Jaccard score of the 'address' attribute.  The reloaded blocker
    must have the same type and produce a candidate set of the same size and
    with the same columns as the original.
    """
    #mg.init_jvm('/Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
    from magellan.feature.simfunctions import jaccard
    from magellan.feature.tokenizers import tok_qgram

    def block_fn_1(ltuple, rtuple):
        """Return True when the address 3-gram Jaccard score is below 0.4."""
        val = jaccard(tok_qgram(ltuple['address'], 3),
                      tok_qgram(rtuple['address'], 3))
        return val < 0.4

    bb0 = mg.BlackBoxBlocker()
    bb0.set_black_box_function(block_fn_1)
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')

    C0 = bb0.block_tables(A, B)
    try:
        mg.save_object(bb0, filename)
        bb1 = mg.load_object(filename)
    finally:
        # Remove the scratch file even when save/load raises; a file that
        # was never created is not an error.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(bb0), type(bb1))
    C1 = bb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # Compare against the reloaded blocker's output; the previous assertion
    # compared C0's columns with themselves and could never fail.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
Example #9
0
def test_ab_block_tuples():
    """Check AttrEquivalenceBlocker.block_tuples on two 'zipcode' pairs."""
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()

    # ((row label in A, row label in B), expected block_tuples result)
    cases = (
        ((1, 2), False),
        ((2, 2), True),
    )
    for (a_row, b_row), expected in cases:
        outcome = blocker.block_tuples(table_a.ix[a_row], table_b.ix[b_row],
                                       'zipcode', 'zipcode')
        assert_equal(outcome, expected)
def test_rb_block_tables():
    """Verify RuleBasedBlocker.block_tables against a feature-vector filter.

    Checks the candidate set's columns and key/foreign-key metadata, then
    compares its (ltable.ID, rtable.ID) pairs with the pairs selected from
    the feature vectors by the negation of the blocking rules.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    rb = mg.RuleBasedBlocker()
    feature_table = mg.get_features_for_blocking(A, B)
    rb.add_rule([
        'name_name_mel(ltuple, rtuple) < 0.4',
        'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'
    ], feature_table)
    rb.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) != 1'], feature_table)
    C = rb.block_tables(A, B, 'zipcode', 'zipcode')
    s1 = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    # Block on a constant column to obtain the reference pair set
    # (presumably every A x B pair -- confirm against AttrEquivalenceBlocker).
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)

    # `&` binds tighter than `==`, so the zipcode comparison needs its own
    # parentheses; without them the mask was `(... & exm) == 1`.
    name_or_year_ok = ((fv.name_name_mel >= 0.4) |
                       (fv.birth_year_birth_year_lev >= 0.5))
    zipcode_ok = (fv.zipcode_zipcode_exm == 1)
    expected_ids = fv.ix[name_or_year_ok & zipcode_ok, [
        'ltable.ID',
        'rtable.ID',
    ]]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality replaces `cmp(...) == 0`: `cmp` does not exist on
    # Python 3 and assert_equal can report the differing pairs.
    assert_equal(ids_exp, ids_act)
Example #11
0
def test_save_load_feature_table():
    """Verify that a feature table survives a save_object/load_object cycle.

    The reloaded table must match the original in type, length and column
    names, and every reloaded feature function must agree with the original
    one on a sample tuple pair (a NaN result must be NaN on both sides).
    """
    #mg.init_jvm()

    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    feature_table0 = mg.get_features_for_blocking(A, B)
    try:
        mg.save_object(feature_table0, filename)
        feature_table1 = mg.load_object(filename)
    finally:
        # Best-effort cleanup that also runs when save/load raises.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(feature_table0), type(feature_table1))
    assert_equal(len(feature_table0), len(feature_table1))
    assert_equal(sorted(feature_table0.columns), sorted(feature_table1.columns))

    ft0_functions = list(feature_table0['function'])
    ft1_functions = list(feature_table1['function'])
    # Loop-invariant sample tuples, fetched once.
    ltuple = A.ix[1]
    rtuple = B.ix[2]
    for f0, f1 in zip(ft0_functions, ft1_functions):
        a = f0(ltuple, rtuple)
        b = f1(ltuple, rtuple)
        if math.isnan(a):
            # NaN != NaN, so equality cannot be used for this case.
            assert_equal(math.isnan(b), True)
        else:
            # Covers "a is a number, b is NaN" too, which the previous
            # two-`if` form let through without any assertion.
            assert_equal(a, b)
def test_bb_block_tuples():
    """Check BlackBoxBlocker.block_tuples with `block_fn` on two tuple pairs."""
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(block_fn)

    # ((row label in A, row label in B), expected block_tuples result)
    cases = (
        ((0, 0), True),
        ((2, 1), False),
    )
    for (a_row, b_row), expected in cases:
        assert_equal(blocker.block_tuples(table_a.ix[a_row], table_b.ix[b_row]),
                     expected)
Example #13
0
def test_save_load_bb_blocker():
    """Verify that a saved and reloaded BlackBoxBlocker behaves the same.

    A locally defined black-box function (3-gram Jaccard on 'address') is
    installed, the blocker is saved and reloaded, and the candidate sets
    produced before and after are compared by size and columns.
    """
    #mg.init_jvm('/Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
    from magellan.feature.simfunctions import jaccard
    from magellan.feature.tokenizers import tok_qgram

    def block_fn_1(ltuple, rtuple):
        """Return True when the address 3-gram Jaccard score is below 0.4."""
        val = jaccard(tok_qgram(ltuple['address'], 3), tok_qgram(rtuple['address'], 3))
        return val < 0.4

    bb0 = mg.BlackBoxBlocker()
    bb0.set_black_box_function(block_fn_1)
    filename = '__mg_obj__.pkl'
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')

    C0 = bb0.block_tables(A, B)
    try:
        mg.save_object(bb0, filename)
        bb1 = mg.load_object(filename)
    finally:
        # Best-effort cleanup that also runs when save/load raises.
        try:
            os.remove(filename)
        except OSError:
            pass

    assert_equal(type(bb0), type(bb1))
    C1 = bb1.block_tables(A, B)
    assert_equal(len(C0), len(C1))
    # The right-hand side must be C1: comparing C0 with itself is a
    # tautology and never exercised the reloaded blocker's columns.
    assert_equal(sorted(C0.columns), sorted(C1.columns))
def test_bb_block_tuples():
    """Run `block_fn` through BlackBoxBlocker.block_tuples on two pairs."""
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    bb_blocker = mg.BlackBoxBlocker()
    bb_blocker.set_black_box_function(block_fn)

    first_result = bb_blocker.block_tuples(left_table.ix[0], right_table.ix[0])
    assert_equal(first_result, True)

    second_result = bb_blocker.block_tuples(left_table.ix[2], right_table.ix[1])
    assert_equal(second_result, False)
Example #15
0
def test_ab_block_tables_wi_no_tuples():
    """Check that blocking on 'name' gives an empty, well-formed candset.

    The result must have no rows, only the '_id' and foreign-key columns,
    and the usual key/foreign-key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()
    candset = blocker.block_tables(table_a, table_b, 'name', 'name')

    assert_equal(len(candset), 0)
    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)
def test_ab_block_tables_wi_no_tuples():
    """Attribute-equivalence blocking on 'name' must return an empty candset.

    Besides having zero rows, the candidate set must still carry the '_id'
    key, both foreign-key columns and the matching metadata.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()
    result = ae_blocker.block_tables(left_table, right_table, 'name', 'name')

    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns),
                 sorted(['_id', 'ltable.ID', 'rtable.ID']))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')
def test_bb_block_tables_wi_no_tuples():
    """Check that `evil_block_fn` leaves an empty, well-formed candset.

    Blocks table_A against table_B with the module-level `evil_block_fn`
    and expects zero rows, only the '_id' and foreign-key columns, and the
    usual key/foreign-key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(evil_block_fn)
    candset = blocker.block_tables(table_a, table_b)

    assert_equal(len(candset), 0)
    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)
def test_bb_block_tables_wi_no_tuples():
    """Black-box blocking with `evil_block_fn` must return an empty candset.

    The empty candidate set must still carry the '_id' key, both foreign-key
    columns and the matching metadata.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    bb_blocker = mg.BlackBoxBlocker()
    bb_blocker.set_black_box_function(evil_block_fn)
    result = bb_blocker.block_tables(left_table, right_table)

    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns),
                 sorted(['_id', 'ltable.ID', 'rtable.ID']))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')
def test_rb_block_tables_wi_no_tuples():
    """Check that the rule `zipcode_zipcode_exm >= 0` leaves an empty candset.

    The result must have zero rows, only the '_id' and foreign-key columns,
    and the usual key/foreign-key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)
    rule = ['zipcode_zipcode_exm(ltuple, rtuple) >= 0']
    blocker.add_rule(rule, features)
    candset = blocker.block_tables(table_a, table_b)

    assert_equal(len(candset), 0)
    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)
def test_rb_block_tuples():
    """Check RuleBasedBlocker.block_tuples on two sample tuple pairs.

    Two rules are registered (one on name/birth-year features, one on the
    zipcode exact-match feature) before the pairs are evaluated.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)

    rules = (
        ['name_name_mel(ltuple, rtuple) < 0.4',
         'birth_year_birth_year_lev(ltuple, rtuple) < 0.5'],
        ['zipcode_zipcode_exm(ltuple, rtuple) != 1'],
    )
    for predicates in rules:
        blocker.add_rule(predicates, features)

    # (row label used in both tables, expected block_tuples result)
    for row, expected in ((0, False), (1, True)):
        assert_equal(blocker.block_tuples(table_a.ix[row], table_b.ix[row]),
                     expected)
def test_bb_block_candset_wi_no_tuples():
    """Check that `evil_block_fn` empties a 'birth_year' candidate set.

    A candidate set built by attribute-equivalence blocking on 'birth_year'
    is passed through BlackBoxBlocker.block_candset; the result must have
    zero rows, only the '_id' and foreign-key columns, and the usual
    key/foreign-key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    initial = mg.AttrEquivalenceBlocker().block_tables(
        table_a, table_b, 'birth_year', 'birth_year')

    blocker = mg.BlackBoxBlocker()
    blocker.set_black_box_function(evil_block_fn)
    candset = blocker.block_candset(initial)

    assert_equal(len(candset), 0)
    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)
def test_bb_block_candset_wi_no_tuples():
    """Black-box blocking of a candset with `evil_block_fn` must empty it.

    The input candidate set comes from attribute-equivalence blocking on
    'birth_year'.  The empty result must still carry the '_id' key, both
    foreign-key columns and the matching metadata.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()
    by_birth_year = ae_blocker.block_tables(left_table, right_table,
                                            'birth_year', 'birth_year')
    bb_blocker = mg.BlackBoxBlocker()
    bb_blocker.set_black_box_function(evil_block_fn)
    result = bb_blocker.block_candset(by_birth_year)

    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns),
                 sorted(['_id', 'ltable.ID', 'rtable.ID']))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')
def test_ab_block_tables():
    """Check AttrEquivalenceBlocker.block_tables when blocking on 'zipcode'.

    The candidate set must contain the key, foreign-key and 'zipcode' output
    columns, carry the key/foreign-key metadata, and have equal zipcodes on
    both sides of every row.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    blocker = mg.AttrEquivalenceBlocker()
    attr = 'zipcode'
    candset = blocker.block_tables(table_a, table_b, attr, attr, attr, attr)

    expected_columns = sorted(
        ['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(expected_columns, sorted(candset.columns))
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)

    left_zip = np.array(candset[['ltable.zipcode']])
    right_zip = np.array(candset[['rtable.zipcode']])
    assert_equal(all(left_zip == right_zip), True)
def test_rb_block_tuples():
    """Evaluate a two-rule RuleBasedBlocker on two individual tuple pairs."""
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    rb_blocker = mg.RuleBasedBlocker()
    blocking_features = mg.get_features_for_blocking(left_table, right_table)

    name_and_year_rule = [
        'name_name_mel(ltuple, rtuple) < 0.4',
        'birth_year_birth_year_lev(ltuple, rtuple) < 0.5',
    ]
    zipcode_rule = ['zipcode_zipcode_exm(ltuple, rtuple) != 1']
    rb_blocker.add_rule(name_and_year_rule, blocking_features)
    rb_blocker.add_rule(zipcode_rule, blocking_features)

    first_result = rb_blocker.block_tuples(left_table.ix[0], right_table.ix[0])
    assert_equal(first_result, False)

    second_result = rb_blocker.block_tuples(left_table.ix[1], right_table.ix[1])
    assert_equal(second_result, True)
def test_rb_block_tables_wi_no_tuples():
    """Rule-based blocking with `zipcode_zipcode_exm >= 0` must leave no pairs.

    The empty candidate set must still carry the '_id' key, both foreign-key
    columns and the matching metadata.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    rb_blocker = mg.RuleBasedBlocker()
    blocking_features = mg.get_features_for_blocking(left_table, right_table)
    rb_blocker.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) >= 0'],
                        blocking_features)
    result = rb_blocker.block_tables(left_table, right_table)

    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns),
                 sorted(['_id', 'ltable.ID', 'rtable.ID']))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')
Example #26
0
def test_ab_block_tables():
    """Block table_A and table_B on 'zipcode' and validate the candidate set.

    Checks the output columns, the key/foreign-key metadata, and that the
    left and right 'zipcode' values agree on every row.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()
    result = ae_blocker.block_tables(left_table, right_table,
                                     'zipcode', 'zipcode',
                                     'zipcode', 'zipcode')

    wanted_columns = ['_id', 'ltable.ID', 'rtable.ID',
                      'ltable.zipcode', 'rtable.zipcode']
    wanted_columns.sort()
    assert_equal(wanted_columns, sorted(result.columns))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')

    ltable_zip = np.array(result[['ltable.zipcode']])
    rtable_zip = np.array(result[['rtable.zipcode']])
    zipcodes_agree = all(ltable_zip == rtable_zip)
    assert_equal(zipcodes_agree, True)
def test_rb_block_candset_wi_no_tuples():
    """Check that the rule `zipcode_zipcode_exm >= 0` empties a candset.

    A candidate set built by attribute-equivalence blocking on 'birth_year'
    is passed through RuleBasedBlocker.block_candset; the result must have
    zero rows, only the '_id' and foreign-key columns, and the usual
    key/foreign-key metadata.
    """
    table_a = mg.load_dataset('table_A')
    table_b = mg.load_dataset('table_B')
    initial = mg.AttrEquivalenceBlocker().block_tables(
        table_a, table_b, 'birth_year', 'birth_year')

    blocker = mg.RuleBasedBlocker()
    features = mg.get_features_for_blocking(table_a, table_b)
    rule = ['zipcode_zipcode_exm(ltuple, rtuple) >= 0']
    blocker.add_rule(rule, features)
    candset = blocker.block_candset(initial)

    assert_equal(len(candset), 0)
    expected_columns = sorted(['_id', 'ltable.ID', 'rtable.ID'])
    assert_equal(sorted(candset.columns), expected_columns)
    assert_equal(candset.get_key(), '_id')
    for prop, column in (('foreign_key_ltable', 'ltable.ID'),
                         ('foreign_key_rtable', 'rtable.ID')):
        assert_equal(candset.get_property(prop), column)
def test_rb_block_candset_wi_no_tuples():
    """Rule-based blocking of a candset with `exm >= 0` must leave no pairs.

    The input candidate set comes from attribute-equivalence blocking on
    'birth_year'.  The empty result must still carry the '_id' key, both
    foreign-key columns and the matching metadata.
    """
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()
    by_birth_year = ae_blocker.block_tables(left_table, right_table,
                                            'birth_year', 'birth_year')
    rb_blocker = mg.RuleBasedBlocker()
    blocking_features = mg.get_features_for_blocking(left_table, right_table)
    rb_blocker.add_rule(['zipcode_zipcode_exm(ltuple, rtuple) >= 0'],
                        blocking_features)
    result = rb_blocker.block_candset(by_birth_year)

    assert_equal(len(result), 0)
    assert_equal(sorted(result.columns),
                 sorted(['_id', 'ltable.ID', 'rtable.ID']))

    assert_equal(result.get_key(), '_id')
    left_fk = result.get_property('foreign_key_ltable')
    assert_equal(left_fk, 'ltable.ID')
    right_fk = result.get_property('foreign_key_rtable')
    assert_equal(right_fk, 'rtable.ID')
def test_bb_block_candset():
    """Check BlackBoxBlocker.block_candset with `block_fn` on a zipcode candset.

    The candidate set from attribute-equivalence blocking on 'zipcode' is
    filtered with the module-level `block_fn`.  The surviving ID pairs must
    equal the pairs of that same result whose `name_name_mel` feature value
    is at least 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)
    feature_table = mg.get_features_for_blocking(A, B)
    fv = mg.extract_feature_vecs(C, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Direct equality replaces `cmp(...) == 0`: `cmp` does not exist on
    # Python 3 and assert_equal can report the differing pairs.
    assert_equal(ids_exp, ids_act)
def test_bb_block_candset():
    """Verify the pairs kept by BlackBoxBlocker.block_candset with `block_fn`.

    Starts from a candidate set blocked on 'zipcode', applies the black-box
    blocker, and compares the resulting ID pairs with those rows of the
    result whose `name_name_mel` feature value is at least 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    ab = mg.AttrEquivalenceBlocker()
    E = ab.block_tables(A, B, 'zipcode', 'zipcode')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_candset(E)
    feature_table = mg.get_features_for_blocking(A, B)
    fv = mg.extract_feature_vecs(C, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4), ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(
        expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(
        actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Plain equality works on Python 2 and 3 (`cmp` was removed in 3) and
    # lets assert_equal show which pairs differ.
    assert_equal(ids_exp, ids_act)
def test_bb_block_tables():
    """Verify BlackBoxBlocker.block_tables with `block_fn` on table_A/table_B.

    Checks the candidate set's columns and key/foreign-key metadata, then
    compares its ID pairs with the pairs whose `name_name_mel` feature value
    is at least 0.4.
    """
    A = mg.load_dataset('table_A')
    B = mg.load_dataset('table_B')
    bb = mg.BlackBoxBlocker()
    bb.set_black_box_function(block_fn)
    C = bb.block_tables(A, B, 'zipcode', 'zipcode')
    s1 = sorted(['_id', 'ltable.ID', 'rtable.ID', 'ltable.zipcode', 'rtable.zipcode'])
    assert_equal(s1, sorted(C.columns))
    assert_equal(C.get_key(), '_id')
    assert_equal(C.get_property('foreign_key_ltable'), 'ltable.ID')
    assert_equal(C.get_property('foreign_key_rtable'), 'rtable.ID')

    feature_table = mg.get_features_for_blocking(A, B)
    # Block on a constant column to obtain the reference pair set
    # (presumably every A x B pair -- confirm against AttrEquivalenceBlocker).
    A['dummy'] = 1
    B['dummy'] = 1
    ab = mg.AttrEquivalenceBlocker()
    D = ab.block_tables(A, B, 'dummy', 'dummy')
    fv = mg.extract_feature_vecs(D, feature_table=feature_table)
    expected_ids = fv.ix[(fv.name_name_mel >= 0.4),
                         ['ltable.ID', 'rtable.ID']]
    actual_ids = C[['ltable.ID', 'rtable.ID']]
    ids_exp = list(expected_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    ids_act = list(actual_ids.set_index(['ltable.ID', 'rtable.ID']).index.values)
    # Plain equality works on Python 2 and 3 (`cmp` was removed in 3) and
    # lets assert_equal show which pairs differ.
    assert_equal(ids_exp, ids_act)
def test_ab_block_tuples():
    """Evaluate AttrEquivalenceBlocker.block_tuples on two 'zipcode' pairs."""
    left_table = mg.load_dataset('table_A')
    right_table = mg.load_dataset('table_B')
    ae_blocker = mg.AttrEquivalenceBlocker()

    first_result = ae_blocker.block_tuples(left_table.ix[1], right_table.ix[2],
                                           'zipcode', 'zipcode')
    assert_equal(first_result, False)

    second_result = ae_blocker.block_tuples(left_table.ix[2], right_table.ix[2],
                                            'zipcode', 'zipcode')
    assert_equal(second_result, True)