def test_add_feature_invalid_df_columns(self):
        # Passes an empty DataFrame to add_feature as the feature table.
        # NOTE(review): no assertion is visible in this body; presumably a
        # failure is expected and checked outside this view -- confirm.
        left_table = read_csv_metadata(path_a)
        right_table = read_csv_metadata(path_b, key='ID')

        tokenizers = get_tokenizers_for_matching()
        sim_funcs = get_sim_funs_for_matching()
        feature_fn = get_feature_fn(
            "exact_match(ltuple['zipcode'], rtuple['zipcode'])",
            tokenizers, sim_funcs)
        empty_table = pd.DataFrame()
        add_feature(empty_table, 'test', feature_fn)
 def test_parse_feat_str_parse_valid_2(self):
     # Both tokenizer calls in the feature string reference ltuple.
     # NOTE(review): looks intentional for this variant -- confirm.
     tokenizers = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     parsed = parse_feat_str(
         "jaccard(qgm_3(ltuple['zipcode']), qgm_3(ltuple['zipcode']))",
         tokenizers, sim_funcs)
     expected_pairs = (
         ('left_attr_tokenizer', 'qgm_3'),
         ('right_attr_tokenizer', 'qgm_3'),
         ('simfunction', 'jaccard'),
     )
     for key, expected in expected_pairs:
         self.assertEqual(parsed[key], expected)
 def test_parse_feat_str_parse_valid_1(self):
     # Parses a jaccard feature string and checks the tokenizer, similarity
     # function and attribute entries of the resulting dictionary.
     tokenizers = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     parsed = parse_feat_str(
         "jaccard(qgm_3(ltuple['zipcode']), qgm_3(rtuple['zipcode']))",
         tokenizers, sim_funcs)
     expected_pairs = (
         ('left_attr_tokenizer', 'qgm_3'),
         ('right_attr_tokenizer', 'qgm_3'),
         ('simfunction', 'jaccard'),
         ('left_attribute', 'zipcode'),
         ('right_attribute', 'zipcode'),
     )
     for key, expected in expected_pairs:
         self.assertEqual(parsed[key], expected)
    def test_add_feature_invalid_df_columns(self):
        # The feature table handed to add_feature is a freshly constructed,
        # empty DataFrame.
        # NOTE(review): the body makes no assertion; an expected-failure
        # check may live outside this view -- confirm.
        table_a = read_csv_metadata(path_a)
        table_b = read_csv_metadata(path_b, key='ID')

        expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        tok_funcs = get_tokenizers_for_matching()
        sim_funcs = get_sim_funs_for_matching()
        feature = get_feature_fn(expression, tok_funcs, sim_funcs)
        add_feature(pd.DataFrame(), 'test', feature)
 def test_get_features_invalid_ltable_rtable_switch(self):
     # The attribute correspondences are computed with the tables swapped
     # (B, A), while get_features itself receives them as (A, B).
     # NOTE(review): no assertion is visible here; presumably the call is
     # expected to fail -- confirm.
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     types_a = afg.get_attr_types(table_a)
     types_b = afg.get_attr_types(table_b)
     swapped_corres = afg.get_attr_corres(table_b, table_a)
     tokenizers = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     feat_table = afg.get_features(
         table_a, table_b, types_a, types_b, swapped_corres,
         tokenizers, sim_funcs)
 def test_add_feature_name_already_present(self):
     # Adds a feature under the name 'test' twice to the same table.
     # NOTE(review): no assertion is visible; presumably the second add is
     # expected to be rejected -- confirm.
     table_a = read_csv_metadata(path_a)
     table_b = read_csv_metadata(path_b, key='ID')
     features = create_feature_table()
     rows_before = len(features)  # captured but not used by this test
     tokenizers = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     feature_fn = get_feature_fn(
         "exact_match(ltuple['zipcode'], rtuple['zipcode'])",
         tokenizers, sim_funcs)
     for _ in range(2):
         add_feature(features, 'test', feature_fn)
 def test_add_feature_name_already_present(self):
     # Registers the same feature name ('test') two times in a row on one
     # feature table.
     # NOTE(review): the body asserts nothing; an expected-failure check
     # may exist outside this view -- confirm.
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     table = create_feature_table()
     initial_len = len(table)  # not referenced again in this test
     expression = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     tok_funcs = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     feature = get_feature_fn(expression, tok_funcs, sim_funcs)
     add_feature(table, 'test', feature)
     add_feature(table, 'test', feature)
 def test_get_features_invalid_ltable_rtable_switch(self):
     # get_attr_corres is called with (B, A) but get_features with (A, B),
     # so the correspondences are built for the opposite table order.
     # NOTE(review): nothing is asserted in this body -- confirm how the
     # expected outcome is checked.
     ltable = read_csv_metadata(path_a)
     rtable = read_csv_metadata(path_b, key='ID')
     ltable_types = afg.get_attr_types(ltable)
     rtable_types = afg.get_attr_types(rtable)
     corres = afg.get_attr_corres(rtable, ltable)
     tok_funcs = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     feat_table = afg.get_features(ltable, rtable,
                                   ltable_types, rtable_types,
                                   corres, tok_funcs, sim_funcs)
 def test_add_features_valid_1(self):
     # Adding one feature must grow the feature table by exactly one row,
     # and the function stored in the new last row must return 1.0 for the
     # chosen pair of tuples.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # The `.ix` indexer was removed in pandas 1.0; `.loc` is its
     # label-based replacement. This assumes the tables carry integer
     # labelled indexes (where `.ix` was label-based too) -- TODO confirm.
     added_function = feature_table.loc[len2 - 1, 'function']
     self.assertEqual(added_function(A.loc[1], B.loc[2]), 1.0)
 def test_add_features_valid_1(self):
     # After add_feature the table must be one row longer, and the newly
     # stored function must evaluate to 1.0 on the selected tuple pair.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # `.ix` no longer exists in pandas >= 1.0, so `.loc` is used instead.
     # Equivalent as long as the indexes are integer labelled, which is
     # assumed here -- TODO confirm.
     last_function = feature_table.loc[len2 - 1, 'function']
     l_row = A.loc[1]
     r_row = B.loc[2]
     self.assertEqual(last_function(l_row, r_row), 1.0)
 def test_get_features_valid(self):
     # get_features must return a DataFrame whose 'function' column holds
     # callables that yield a non-negative value for a pair of tuples.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = afg.get_attr_types(A)
     r_attr_types = afg.get_attr_types(B)
     attr_corres = afg.get_attr_corres(A, B)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                   attr_corres, tok, sim)
     self.assertIsInstance(feat_table, pd.DataFrame)
     for f in feat_table['function']:
         # `.ix` was removed in pandas 1.0; `.loc` is the label-based
         # replacement (assumes integer labelled indexes -- TODO confirm).
         x = f(A.loc[1], B.loc[2])
         self.assertGreaterEqual(x, 0)
 def test_get_features_valid(self):
     # Checks the type of the generated feature table and that every
     # function stored in it returns a value >= 0 for one tuple pair.
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = afg.get_attr_types(A)
     r_attr_types = afg.get_attr_types(B)
     attr_corres = afg.get_attr_corres(A, B)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                   attr_corres, tok, sim)
     self.assertIsInstance(feat_table, pd.DataFrame)
     functions = feat_table['function']
     for f in functions:
         # pandas 1.0 dropped the `.ix` indexer, hence `.loc`. The two agree
         # for integer labelled indexes, which is assumed -- TODO confirm.
         score = f(A.loc[1], B.loc[2])
         self.assertGreaterEqual(score, 0)
Example #13
0
def get_features_for_matching(A, B):
    """Build a feature table for the two input tables.

    The similarity functions, tokenizers, attribute types and attribute
    correspondences computed on the way are also stored as attributes of
    the ``mg`` module once the feature table has been produced.

    Args:
        A: first input table; must be a pandas DataFrame.
        B: second input table; must be a pandas DataFrame.

    Returns:
        The value returned by ``get_features`` for the computed inputs.

    Raises:
        AssertionError: if ``A`` or ``B`` is not a pandas DataFrame.
    """
    if not isinstance(A, pd.DataFrame):
        message = 'Input table A is not of type pandas dataframe'
        logger.error(message)
        raise AssertionError(message)

    if not isinstance(B, pd.DataFrame):
        message = 'Input table B is not of type pandas dataframe'
        logger.error(message)
        raise AssertionError(message)

    similarity_functions = get_sim_funs_for_matching()
    tokenizers = get_tokenizers_for_matching()
    types_a = get_attr_types(A)
    types_b = get_attr_types(B)
    correspondences = get_attr_corres(A, B)
    feature_table = get_features(
        A, B, types_a, types_b, correspondences, tokenizers,
        similarity_functions)

    # Publish the intermediate results on the mg module.
    mg._match_t = tokenizers
    mg._match_s = similarity_functions
    mg._atypes1 = types_a
    mg._atypes2 = types_b
    mg._match_c = correspondences
    return feature_table
Example #14
0
def get_features_for_matching(A, B):
    """Generate the feature table used for matching tables A and B.

    Validates both inputs, derives similarity functions, tokenizers,
    attribute types and attribute correspondences, passes them to
    ``get_features`` and finally records those intermediate values on the
    ``mg`` module.

    Args:
        A: first input table; must be a pandas DataFrame.
        B: second input table; must be a pandas DataFrame.

    Returns:
        The result of the ``get_features`` call.

    Raises:
        AssertionError: if either input is not a pandas DataFrame.
    """
    # A is validated before B, so a bad A is reported first.
    checks = (
        (A, 'Input table A is not of type pandas dataframe'),
        (B, 'Input table B is not of type pandas dataframe'),
    )
    for table, error_text in checks:
        if not isinstance(table, pd.DataFrame):
            logger.error(error_text)
            raise AssertionError(error_text)

    sim_lookup = get_sim_funs_for_matching()
    tok_lookup = get_tokenizers_for_matching()
    a_types = get_attr_types(A)
    b_types = get_attr_types(B)
    corres = get_attr_corres(A, B)
    result = get_features(A, B, a_types, b_types, corres, tok_lookup,
                          sim_lookup)

    # Export the important variables to the global name space (mg module).
    mg._match_t = tok_lookup
    mg._match_s = sim_lookup
    mg._atypes1 = a_types
    mg._atypes2 = b_types
    mg._match_c = corres
    return result
Example #15
0
 def test_get_sim_funs_for_blocking(self):
     # The keys of the returned mapping must be exactly sim.sim_fn_names
     # (same count, same names regardless of order).
     # NOTE(review): the test name says "blocking" but the call below is
     # get_sim_funs_for_matching -- confirm which one is intended.
     returned_names = list(sim.get_sim_funs_for_matching().keys())
     expected_names = sim.sim_fn_names
     self.assertEqual(len(returned_names), len(expected_names))
     self.assertEqual(sorted(returned_names), sorted(expected_names))
 def test_parse_feat_str_parse_exp(self):
     # The feature string contains a stray '~' and an unbalanced '[[';
     # every value of the parsed dictionary is expected to be 'PARSE_EXP'.
     tokenizers = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     parsed = parse_feat_str(
         "jaccard~(qgm_3(ltuple[['zipcode']), qgm_3(rtuple['zipcode']))",
         tokenizers, sim_funcs)
     for value in six.itervalues(parsed):
         self.assertEqual(value, 'PARSE_EXP')
 def test_parse_feat_str_parse_exp(self):
     # Parsing the string below (note the '~' and the unbalanced '[[') must
     # yield a dictionary in which each value equals 'PARSE_EXP'.
     bad_expression = "jaccard~(qgm_3(ltuple[['zipcode']), qgm_3(rtuple['zipcode']))"
     tok_funcs = get_tokenizers_for_matching()
     sim_funcs = get_sim_funs_for_matching()
     result = parse_feat_str(bad_expression, tok_funcs, sim_funcs)
     for _key, entry in six.iteritems(result):
         self.assertEqual(entry, 'PARSE_EXP')
 def test_get_sim_funs_for_blocking(self):
     # Compares the key set of the returned mapping with sim.sim_fn_names,
     # first by size and then by sorted content.
     # NOTE(review): named for "blocking" yet it exercises
     # get_sim_funs_for_matching -- confirm the intended function.
     sim_functions = sim.get_sim_funs_for_matching()
     names = list(sim_functions.keys())
     known_names = sim.sim_fn_names
     self.assertEqual(len(names), len(known_names))
     self.assertEqual(sorted(names), sorted(known_names))