def test_parse_feat_str_parse_valid_2(self):
    """A feature string tokenizing the left tuple's zipcode on both sides
    parses into the expected tokenizer/sim-function fields."""
    feature_string = "jaccard(qgm_3(ltuple['zipcode']), qgm_3(ltuple['zipcode']))"
    parsed = parse_feat_str(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    expected = {'left_attr_tokenizer': 'qgm_3',
                'right_attr_tokenizer': 'qgm_3',
                'simfunction': 'jaccard'}
    for field, value in expected.items():
        self.assertEqual(parsed[field], value)
def test_add_feature_invalid_df_columns(self):
    """Add a feature to an empty dataframe lacking the feature-table
    columns (invalid target table); NOTE(review): presumably expects an
    error via a decorator not visible here — behavior unchanged."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    feature_fn = get_feature_fn(feature_string,
                                get_tokenizers_for_matching(),
                                get_sim_funs_for_matching())
    add_feature(pd.DataFrame(), 'test', feature_fn)
def test_parse_feat_str_parse_valid_1(self):
    """A well-formed jaccard/qgm_3 feature string parses into the expected
    tokenizer, sim-function and attribute fields."""
    feature_string = "jaccard(qgm_3(ltuple['zipcode']), qgm_3(rtuple['zipcode']))"
    parsed = parse_feat_str(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    expected = {'left_attr_tokenizer': 'qgm_3',
                'right_attr_tokenizer': 'qgm_3',
                'simfunction': 'jaccard',
                'left_attribute': 'zipcode',
                'right_attribute': 'zipcode'}
    for field, value in expected.items():
        self.assertEqual(parsed[field], value)
def test_get_features_invalid_ltable_rtable_switch(self):
    """Call get_features with attribute correspondences computed in the
    swapped (B, A) order; NOTE(review): presumably expects an error via a
    decorator not visible here — behavior unchanged."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    left_types = afg.get_attr_types(A)
    right_types = afg.get_attr_types(B)
    swapped_corres = afg.get_attr_corres(B, A)
    tokenizers = get_tokenizers_for_matching()
    sim_functions = get_sim_funs_for_matching()
    feat_table = afg.get_features(A, B, left_types, right_types,
                                  swapped_corres, tokenizers, sim_functions)
def test_add_feature_name_already_present(self):
    """Add the same feature name twice; NOTE(review): presumably expects an
    error on the second add via a decorator not visible here — behavior
    unchanged."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = create_feature_table()
    initial_len = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    feature_fn = get_feature_fn(feature_string,
                                get_tokenizers_for_matching(),
                                get_sim_funs_for_matching())
    add_feature(feature_table, 'test', feature_fn)
    add_feature(feature_table, 'test', feature_fn)
def test_add_features_valid_1(self):
    """add_feature appends exactly one row, and the added exact_match
    feature evaluates to 1.0 for a tuple pair with equal zipcodes."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)
    len_before = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    self.assertEqual(len_before + 1, len(feature_table))
    # .ix was deprecated in pandas 0.20 and removed in 1.0; use positional
    # .iloc for the last (just-added) row and for the sample tuples.
    new_feature_fn = feature_table.iloc[len(feature_table) - 1]['function']
    self.assertEqual(new_feature_fn(A.iloc[1], B.iloc[2]), 1.0)
def test_add_features_valid_2(self):
    """add_feature appends exactly one row, and the added exact_match
    feature evaluates to 1.0 for a tuple pair with equal zipcodes.

    Renamed from test_add_features_valid_1: the original name duplicated an
    earlier method, so one of the two tests was silently shadowed and never
    ran under unittest discovery.
    """
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    feature_table = get_features_for_matching(A, B)
    len_before = len(feature_table)
    feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
    f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    add_feature(feature_table, 'test', f_dict)
    self.assertEqual(len_before + 1, len(feature_table))
    # .ix was deprecated in pandas 0.20 and removed in 1.0; use positional
    # .iloc for the last (just-added) row and for the sample tuples.
    new_feature_fn = feature_table.iloc[len(feature_table) - 1]['function']
    self.assertEqual(new_feature_fn(A.iloc[1], B.iloc[2]), 1.0)
def test_get_features_valid(self):
    """get_features returns a DataFrame and every generated feature
    function yields a non-negative score for a sample tuple pair."""
    A = read_csv_metadata(path_a)
    B = read_csv_metadata(path_b, key='ID')
    l_attr_types = afg.get_attr_types(A)
    r_attr_types = afg.get_attr_types(B)
    attr_corres = afg.get_attr_corres(A, B)
    tok = get_tokenizers_for_matching()
    sim = get_sim_funs_for_matching()
    feat_table = afg.get_features(A, B, l_attr_types, r_attr_types,
                                  attr_corres, tok, sim)
    self.assertTrue(isinstance(feat_table, pd.DataFrame))
    # .ix was deprecated in pandas 0.20 and removed in 1.0; use positional
    # .iloc and hoist the loop-invariant rows out of the loop.
    left_row = A.iloc[1]
    right_row = B.iloc[2]
    for feature_fn in feat_table['function']:
        self.assertGreaterEqual(feature_fn(left_row, right_row), 0)
def get_features_for_matching(A, B):
    """Build and return the feature table for matching tables A and B.

    Validates that both inputs are pandas DataFrames, derives tokenizers,
    similarity functions, attribute types and attribute correspondences,
    builds the feature table, and exports the intermediate artifacts to
    the module-level namespace (mg) for later reuse.

    Raises:
        AssertionError: if A or B is not a pandas DataFrame.
    """
    for table, label in ((A, 'A'), (B, 'B')):
        if not isinstance(table, pd.DataFrame):
            message = 'Input table %s is not of type pandas dataframe' % label
            logger.error(message)
            raise AssertionError(message)
    sim_funcs = get_sim_funs_for_matching()
    tok_funcs = get_tokenizers_for_matching()
    types_a = get_attr_types(A)
    types_b = get_attr_types(B)
    corres = get_attr_corres(A, B)
    feature_table = get_features(A, B, types_a, types_b, corres,
                                 tok_funcs, sim_funcs)
    # Export important variables to the global name space.
    mg._match_t = tok_funcs
    mg._match_s = sim_funcs
    mg._atypes1 = types_a
    mg._atypes2 = types_b
    mg._match_c = corres
    return feature_table
def test_get_tokenizers_for_matching(self):
    """get_tokenizers_for_matching returns a dict whose values are
    callables mapping a string to a list of tokens."""
    tokenizers = tok.get_tokenizers_for_matching()
    self.assertTrue(isinstance(tokenizers, dict))
    # Renamed from 'input' so the local no longer shadows the builtin.
    sample_text = 'data science'
    for name, tokenize in six.iteritems(tokenizers):
        self.assertTrue(isinstance(tokenize(sample_text), list))
def test_parse_feat_str_parse_exp(self):
    """A syntactically invalid feature string yields 'PARSE_EXP' for every
    parsed field."""
    feature_string = "jaccard~(qgm_3(ltuple[['zipcode']), qgm_3(rtuple['zipcode']))"
    parsed = parse_feat_str(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    for parsed_value in parsed.values():
        self.assertEqual(parsed_value, 'PARSE_EXP')
def test_parse_feat_str_parse_exp_2(self):
    """A syntactically invalid feature string yields 'PARSE_EXP' for every
    parsed field.

    Renamed from test_parse_feat_str_parse_exp: the original name duplicated
    an earlier method, so one of the two tests was silently shadowed and
    never ran under unittest discovery.
    """
    feature_string = "jaccard~(qgm_3(ltuple[['zipcode']), qgm_3(rtuple['zipcode']))"
    p_dict = parse_feat_str(feature_string,
                            get_tokenizers_for_matching(),
                            get_sim_funs_for_matching())
    for k, v in six.iteritems(p_dict):
        self.assertEqual(v, 'PARSE_EXP')
def test_get_tokenizers_for_matching_invalid(self):
    """Call get_tokenizers_for_matching with invalid (None) arguments;
    NOTE(review): presumably expects an error via a decorator not visible
    here — behavior unchanged."""
    result = tok.get_tokenizers_for_matching(None, None)