Example #1
0
# Exported notebook cells: label a candidate set, generate blocking features
# for tables A and B, and inspect what was generated.
# NOTE: `mg`, `S`, `A` and `B` are not defined in this fragment; they are
# expected to come from earlier cells that are not shown here.

# In[22]:

# label candidate set S and name the label column 'gold_label';
# the labeled result is kept in L
L = mg.label_table(S, 'gold_label')


# In[ ]:

# (empty cell in the original notebook)



# In[24]:

# get features automatically for the table pair (A, B)
# (internally it computes types, attr_corres, sim functions, tokenizers)
feat_table = mg.get_features_for_blocking(A, B)


# In[25]:

# display feature table
# (a bare expression: shown as cell output in a notebook, no visible effect
# when this file is run as a plain script)
feat_table


# In[26]:

# see what tokenizers were used to generate features
# (bare expression, notebook display only; `_current_tokenizers` is a
# private-by-convention attribute of mg)
mg._current_tokenizers


# In[27]:
Example #2
0
def extract_feature_vecs(s, attrs_before=None, feature_table=None, attrs_after=None):
    """
    Extract feature vectors

    For every row of ``s``, look up the referenced left and right tuples and
    apply the feature functions listed in ``feature_table`` to that pair.

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output. Its 'ltable',
        'rtable', 'foreign_key_ltable' and 'foreign_key_rtable' properties
        are read.
    attrs_before : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table before
        the feature vector, in the order given. The caller's list is not
        modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see: mg.get_features_for_blocking).
        Must contain a 'feature_name' column. When None, the table returned by
        mg.get_features_for_blocking(ltable, rtable) is used.
    attrs_after : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table after
        the feature vector. The caller's list is not modified.
        NOTE: these columns are appended in the reverse of the order given;
        that ordering is kept as-is for backward compatibility.

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in
        feature_table) and attributes as mentioned in the input. Columns are:
        foreign key to ltable, foreign key to rtable, attrs_before, the
        features in feature_table order, attrs_after. The row index is reset.
        The ``_metadata`` and ``properties`` objects of ``s`` are assigned to
        the result (the same objects, not copies).

    Raises
    ------
    AssertionError
        If the JVM is not started, or the 'ltable' / 'rtable' property of
        ``s`` is None.
    KeyError
        If a foreign-key value in ``s`` is not present in the key column of
        the corresponding base table.
    """
    # basic checks
    assert isJVMStarted(), 'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    logger = logging.getLogger(__name__)
    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')

    # Collect the (left id, right id) pairs column-wise. iterrows() builds a
    # Series per row and upcasts it to one dtype, which can turn integer ids
    # into floats; zipping the two columns keeps each column's own dtype.
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    # Index both base tables by their key (keeping the key column) so that a
    # tuple can be fetched by its id.
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # The ids are index labels, so use the label-based .loc indexer
    # (.ix is deprecated and absent from pandas >= 1.0).
    start = time.time()
    feat_vals = [apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
                 for l_id, r_id in id_list]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)
    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    if feat_vals:
        table = table[feat_names]
    else:
        # Empty candidate set: the frame has no columns yet, so selecting by
        # name would raise KeyError; create the (empty) feature columns.
        table = table.reindex(columns=feat_names)

    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Inserting at position 0 in reverse keeps the given order; reversed()
        # iterates backwards without mutating the caller's list.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # Appended in reverse order, as before, but without mutating the
        # caller's list (so repeated calls give the same column order).
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
Example #3
0
# Exported notebook cells: sample a candidate set, label the sample, generate
# blocking features for tables A and B, and inspect what was generated.
# NOTE: `mg`, `F`, `A` and `B` are not defined in this fragment; they are
# expected to come from earlier cells that are not shown here.

# In[21]:

# sample candidate set F into S
# (second argument is 13; presumably the sample size - confirm against
# mg.sample_table)
S = mg.sample_table(F, 13)

# In[22]:

# label the sampled candidate set and name the label column 'gold_label';
# the labeled result is kept in L
L = mg.label_table(S, 'gold_label')

# In[ ]:

# In[24]:

# get features automatically for the table pair (A, B)
# (internally it computes types, attr_corres, sim functions, tokenizers)
feat_table = mg.get_features_for_blocking(A, B)

# In[25]:

# display feature table
# (a bare expression: shown as cell output in a notebook, no visible effect
# when this file is run as a plain script)
feat_table

# In[26]:

# see what tokenizers were used to generate features
# (bare expression, notebook display only; private-by-convention attribute)
mg._current_tokenizers

# In[27]:

# see what simfunctions were used to generate features
# (bare expression, notebook display only; private-by-convention attribute)
mg._current_sim_funs
Example #4
0
# coding=utf-8
# Minimal script: load two tables, generate blocking features for the pair,
# and print the result.
import magellan as mg

# Load the two input tables; 'ID' is passed as the key of each table.
A = mg.load_dataset('table_A', key='ID')
B = mg.load_dataset('table_B', key='ID')
# Generate features for the table pair (A, B) and keep the result in F.
F = mg.get_features_for_blocking(A, B)
print(F)

Example #5
0
def extract_feature_vecs(s,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    For every row of ``s``, look up the referenced left and right tuples and
    apply the feature functions listed in ``feature_table`` to that pair.

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output. Its 'ltable',
        'rtable', 'foreign_key_ltable' and 'foreign_key_rtable' properties
        are read.
    attrs_before : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table before
        the feature vector, in the order given. The caller's list is not
        modified.
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see: mg.get_features_for_blocking).
        Must contain a 'feature_name' column. When None, the table returned by
        mg.get_features_for_blocking(ltable, rtable) is used.
    attrs_after : list (or a single attribute name), defaults to None
        List of attribute names from "s" to be included in output table after
        the feature vector. The caller's list is not modified.
        NOTE: these columns are appended in the reverse of the order given;
        that ordering is kept as-is for backward compatibility.

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in
        feature_table) and attributes as mentioned in the input. Columns are:
        foreign key to ltable, foreign key to rtable, attrs_before, the
        features in feature_table order, attrs_after. The row index is reset.
        The ``_metadata`` and ``properties`` objects of ``s`` are assigned to
        the result (the same objects, not copies).

    Raises
    ------
    AssertionError
        If the JVM is not started, or the 'ltable' / 'rtable' property of
        ``s`` is None.
    KeyError
        If a foreign-key value in ``s`` is not present in the key column of
        the corresponding base table.
    """
    # basic checks
    assert isJVMStarted(
    ), 'JVM should be started using init_jvm to compute features'
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    assert ltable is not None, 'Left table is not set'
    assert rtable is not None, 'Right table is not set'

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    logger = logging.getLogger(__name__)
    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')

    # Collect the (left id, right id) pairs column-wise. iterrows() builds a
    # Series per row and upcasts it to one dtype, which can turn integer ids
    # into floats; zipping the two columns keeps each column's own dtype.
    start = time.time()
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    # Index both base tables by their key (keeping the key column) so that a
    # tuple can be fetched by its id.
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    # The ids are index labels, so use the label-based .loc indexer
    # (.ix is deprecated and absent from pandas >= 1.0).
    start = time.time()
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)
    table = pd.DataFrame(feat_vals, index=s.index.values)

    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    if feat_vals:
        table = table[feat_names]
    else:
        # Empty candidate set: the frame has no columns yet, so selecting by
        # name would raise KeyError; create the (empty) feature columns.
        table = table.reindex(columns=feat_names)

    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Inserting at position 0 in reverse keeps the given order; reversed()
        # iterates backwards without mutating the caller's list.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # Appended in reverse order, as before, but without mutating the
        # caller's list (so repeated calls give the same column order).
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors