Example #1
0
def get_features_for_matching(A, B):
    """
    Build a feature table for matching from just the two input tables.

    The similarity functions, single-argument tokenizers, attribute types
    of each table and the attribute correspondences between the tables are
    all obtained through ``mg`` and then handed to ``get_features``.

    Parameters
    ----------
    A, B : MTable,
        Input tables

    Returns
    -------
    feature_table : pandas DataFrame
        Consists of following columns
        * feature_name  - string, feature name
        * left_attribute - string, attribute name
        * right_attribute - string, attribute name
        * left_attr_tokenizer - string, tokenizer name
        * right_attr_tokenizer - string, tokenizer name
        * simfunction - string, similarity function name
        * function - function object
        * function_source - string, containing source code

    Notes
    -----
    Once the feature table has been built, the intermediate values are
    stored as attributes of ``mg`` (``_match_t``, ``_match_s``,
    ``_atypes1``, ``_atypes2``, ``_match_c``) so that a user can examine
    them afterwards.
    """
    sim_funs = mg.get_sim_funs()
    tokenizers = mg.get_single_arg_tokenizers()
    attr_types_A = mg.get_attr_types(A)
    attr_types_B = mg.get_attr_types(B)
    corres = mg.get_attr_corres(A, B)

    feature_table = get_features(
        A, B, attr_types_A, attr_types_B, corres, tokenizers, sim_funs
    )

    # Publish the intermediates on ``mg`` only after feature generation
    # succeeded; the order matches the individual assignments it replaces.
    exported = (
        ('_match_t', tokenizers),
        ('_match_s', sim_funs),
        ('_atypes1', attr_types_A),
        ('_atypes2', attr_types_B),
        ('_match_c', corres),
    )
    for attr_name, value in exported:
        setattr(mg, attr_name, value)

    return feature_table
Example #2
0
def get_features_for_matching(A, B):
    """
    Build a feature table for matching from just the two input tables.

    The similarity functions, single-argument tokenizers, attribute types
    of each table and the attribute correspondences between the tables are
    all obtained through ``mg`` and then handed to ``get_features``.

    Parameters
    ----------
    A, B : MTable,
        Input tables

    Returns
    -------
    feature_table : pandas DataFrame
        Consists of following columns
        * feature_name  - string, feature name
        * left_attribute - string, attribute name
        * right_attribute - string, attribute name
        * left_attr_tokenizer - string, tokenizer name
        * right_attr_tokenizer - string, tokenizer name
        * simfunction - string, similarity function name
        * function - function object
        * function_source - string, containing source code

    Notes
    -----
    Once the feature table has been built, the intermediate values are
    stored as attributes of ``mg`` (``_match_t``, ``_match_s``,
    ``_atypes1``, ``_atypes2``, ``_match_c``) so that a user can examine
    them afterwards.
    """
    sim_funs = mg.get_sim_funs()
    tokenizers = mg.get_single_arg_tokenizers()
    attr_types_A = mg.get_attr_types(A)
    attr_types_B = mg.get_attr_types(B)
    corres = mg.get_attr_corres(A, B)

    feature_table = get_features(
        A, B, attr_types_A, attr_types_B, corres, tokenizers, sim_funs
    )

    # Publish the intermediates on ``mg`` only after feature generation
    # succeeded; the order matches the individual assignments it replaces.
    exported = (
        ('_match_t', tokenizers),
        ('_match_s', sim_funs),
        ('_atypes1', attr_types_A),
        ('_atypes2', attr_types_B),
        ('_match_c', corres),
    )
    for attr_name, value in exported:
        setattr(mg, attr_name, value)

    return feature_table
Example #3
0
# Demo script: read the two toy tables, block them on zipcode, combine and
# sample the blocker output, then print the tokenizers, similarity functions,
# attribute correspondences and attribute types that magellan reports.
import sys

# Raw string: same value as before, but the Windows backslashes can never be
# interpreted as escape sequences.
sys.path.append(r'C:\Pradap\Research\Python-work\Saranam\magellan')

# NOTE(review): `mg` was used below without ever being bound in this script
# (NameError). The package name is inferred from the paths used here and the
# conventional `mg` alias -- confirm against the project layout.
import magellan as mg

A = mg.read_csv('../magellan/data/toy/table_A.csv', key='ID')
B = mg.read_csv('../magellan/data/toy/table_B.csv', key='ID')

# Same output attributes are requested from both tables.
out_attrs = ['name', 'address', 'hourly_wage']

blocker = mg.AttrEquivalenceBlocker()
C = blocker.block_tables(A, B, 'zipcode', 'zipcode', out_attrs, list(out_attrs))


D = mg.block_union_combine([C, C])

S = mg.sample_one_table(D, 10)

#L = mg.label(S, 'gold_label')

#print mg._m_global_tokenizers
#print mg._m_global_sim_fns

# Single-argument print(...) produces the same output on Python 2 and is
# valid syntax on Python 3.
t = mg.get_single_arg_tokenizers()
print(t)
s = mg.get_sim_funs()
print(s)
corres = mg.get_attr_corres(A, B)
print(corres['corres'])
t_1 = mg.get_attr_types(A)
print(t_1)
t_2 = mg.get_attr_types(B)
print(t_2)