コード例 #1
0
ファイル: sampler.py プロジェクト: sanjibkd/enrique
def down_sample(ltable, rtable, size, y):
    """
    Down-sample a pair of tables.

    The tables are ordered with `_order_tables` into `s_table` and `b_table`
    (presumably the smaller and the bigger one -- confirm against
    `_order_tables`). Random row positions are drawn from `b_table` without
    replacement, and `_probe_index` is used to pick the row positions of
    `s_table` that go with them.

    Parameters
    ----------
    ltable : MTable, left input table
    rtable : MTable, right input table
    size : number, `floor(size / y)` rows (capped at `len(b_table)`) are
        drawn from `b_table`
    y : number, divisor for `size`; also forwarded to `_probe_index`
        (presumably the number of `s_table` rows picked per probed row --
        TODO confirm)

    Returns
    -------
    (l_sampled, r_sampled) : tuple of MTable, sampled left and right tables.
        The `properties` object of each input table is assigned to the
        corresponding sample.
    """
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)
    # math.floor returns a float on Python 2; np.random.choice needs an
    # integer sample size.
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = np.random.choice(len(b_table), b_sample_size,
                                     replace=False)
    # The drawn values are row positions (they are used with iloc below), so
    # the probe rows must be selected positionally as well. The label-based
    # `.ix` picks different rows whenever the index is not 0..n-1.
    s_tbl_indices = _probe_index(b_table.iloc[b_tbl_indices], y,
                                 len(s_table), s_inv_index)
    if is_swapped:
        # ltable played the role of b_table: hand each index set back to the
        # table it was computed for.
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices
    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)], ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)], rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
コード例 #2
0
ファイル: sampler.py プロジェクト: Yashg19/enrique
def sample_table(table, size, replace=False):
    """
    Draw a random sample of rows from an MTable.

    Parameters
    ----------
    table : MTable, table to draw rows from
    size : int, number of rows to draw
    replace : boolean, whether a row may be drawn more than once.
            Defaults to False.

    Returns
    -------
    sampled_table : MTable, the drawn rows in ascending positional order,
            built with the key of the input table; the input table's
            `properties` object is assigned to it

    Raises
    ------
    AttributeError
        If the table is empty, or has fewer rows than `size`.
    """
    n_rows = len(table)
    if n_rows == 0:
        raise AttributeError('size of table is 0')
    if n_rows < size:
        raise AttributeError('sample size is larger than input table size')

    # draw row positions, then order them so the sample follows table order
    positions = list(np.random.choice(n_rows, size, replace=replace))
    positions.sort()
    result = MTable(table.iloc[positions], key=table.get_key())
    result.properties = table.properties
    return result
コード例 #3
0
def sample_table(table, size, replace=False):
    """
    Randomly sample rows of an MTable.

    Parameters
    ----------
    table : MTable, input table to sample from
    size : int, how many rows to sample
    replace : boolean, sample with replacement when True.
            False by default.

    Returns
    -------
    sampled_table : MTable, sampled rows ordered by their position in the
            input; created with the input table's key and given the input
            table's `properties` object

    Raises
    ------
    AttributeError
        When the input table has no rows, or fewer rows than `size`.
    """
    total = len(table)
    if total == 0:
        raise AttributeError('size of table is 0')
    if total < size:
        raise AttributeError('sample size is larger than input table size')

    # random row positions, put in ascending order just to have an order
    chosen = sorted(np.random.choice(total, size, replace=replace))
    subset = table.iloc[chosen]
    sampled = MTable(subset, key=table.get_key())
    sampled.properties = table.properties
    return sampled
コード例 #4
0
def down_sample(ltable, rtable, size, y):
    """
    Down-sample a pair of tables.

    `_order_tables` splits the inputs into `s_table` and `b_table`
    (presumably smaller / bigger -- confirm against `_order_tables`).
    Row positions are drawn at random, without replacement, from `b_table`;
    `_probe_index` then selects the matching row positions of `s_table`.

    Parameters
    ----------
    ltable : MTable, left input table
    rtable : MTable, right input table
    size : number, `floor(size / y)` rows (at most `len(b_table)`) are drawn
        from `b_table`
    y : number, divisor for `size`; also passed through to `_probe_index`
        (its meaning there is not visible here -- TODO confirm)

    Returns
    -------
    (l_sampled, r_sampled) : tuple of MTable, the sampled left and right
        tables. Each one is assigned the `properties` object of its input
        table.
    """
    s_table, b_table, is_swapped = _order_tables(ltable, rtable)
    s_inv_index = _inv_index(s_table)
    # int(): math.floor yields a float on Python 2, which is not a valid
    # sample size for np.random.choice.
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))
    # These are row positions (see the iloc calls below); select the probe
    # rows positionally too instead of through the label-based `.ix`.
    s_tbl_indices = list(
        _probe_index(b_table.iloc[b_tbl_indices], y, len(s_table),
                     s_inv_index))
    if is_swapped:
        # ltable was used as b_table; give each table its own positions back.
        s_tbl_indices, b_tbl_indices = b_tbl_indices, s_tbl_indices
    l_sampled = MTable(ltable.iloc[list(s_tbl_indices)], key=ltable.get_key())
    l_sampled.properties = ltable.properties
    r_sampled = MTable(rtable.iloc[list(b_tbl_indices)], key=rtable.get_key())
    r_sampled.properties = rtable.properties
    return l_sampled, r_sampled
コード例 #5
0
ファイル: sampler.py プロジェクト: Yashg19/enrique
def down_sample(s_table, b_table, size, y):
    """
    Down-sample a pair of tables.

    Row positions are drawn at random, without replacement, from `b_table`;
    `_probe_index` then selects the matching row positions of `s_table`
    using the index built by `_inv_index`. A warning and the elapsed time of
    both index steps (whole seconds) are printed to stdout.

    Parameters
    ----------
    s_table : MTable, table that is indexed and probed
    b_table : MTable, table that rows are drawn from at random
    size : number, clamped to `len(b_table)`; `floor(size / y)` rows are then
        drawn from `b_table`
    y : number, divisor for `size`; also passed through to `_probe_index`
        (its meaning there is not visible here -- TODO confirm)

    Returns
    -------
    (l_sampled, r_sampled) : tuple of MTable, samples of `s_table` and
        `b_table` respectively. Each one is assigned the `properties` object
        of its input table.
    """
    if len(b_table) < size:
        print('Warning!! size of table B is less than b_size parameter - using entire table B')
        size = len(b_table)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    t1 = time.time()
    s_inv_index = _inv_index(s_table)
    print('Inverted Index Time: ')
    print(int(time.time() - t1))
    # int(): math.floor yields a float on Python 2, which is not a valid
    # sample size for np.random.choice.
    b_sample_size = min(int(math.floor(size / y)), len(b_table))
    b_tbl_indices = list(
        np.random.choice(len(b_table), b_sample_size, replace=False))
    t1 = time.time()
    # These are row positions (see the iloc calls below); select the probe
    # rows positionally too instead of through the label-based `.ix`.
    s_tbl_indices = list(
        _probe_index(b_table.iloc[b_tbl_indices], y, len(s_table),
                     s_inv_index))
    print('Probe Index Time: ')
    print(int(time.time() - t1))

    l_sampled = MTable(s_table.iloc[s_tbl_indices], key=s_table.get_key())
    l_sampled.properties = s_table.properties
    r_sampled = MTable(b_table.iloc[b_tbl_indices], key=b_table.get_key())
    r_sampled.properties = b_table.properties
    return l_sampled, r_sampled
コード例 #6
0
def label_table(tbl, col_name, replace=True):
    """
    Label training data

    A copy of `tbl` is opened with `mg.edit` so the label column can be
    filled in; the input table itself is not modified.

    Parameters
    ----------
    tbl : MTable, Table to be labeled
    col_name : String, Name of the label column
    replace : Boolean, specifies whether the column with the given 'col_name'
        must be overwritten (reset to 0), if it already exists.
        [This option is currently experimental].

    Returns
    -------
    result : MTable, Table with labels, built with the key of `tbl` and
        assigned the `properties` object of `tbl`

    Raises
    ------
    AssertionError
        If, after conversion to int, the label column holds a value other
        than 0 or 1.

    Notes
    -----
    The label value is expected to be only 0 or 1.
    """
    # NOTE(review): `edit` is imported but the call below goes through
    # `mg.edit`; the import is kept in case it is needed for side effects.
    from magellan.gui.mtable_gui import edit
    table = tbl.copy()

    if col_name not in table.columns:
        table[col_name] = 0
    elif replace:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ', col_name)
        table[col_name] = 0
    mg.edit(table)
    # NOTE(review): astype(int) truncates, so a fractional label such as 0.5
    # becomes 0 before the check below -- confirm this is intended.
    labels = table[col_name].astype(int)
    table[col_name] = labels
    # Raise explicitly rather than `assert`, so the check is not stripped
    # when Python runs with -O; the exception type is unchanged.
    if not ((labels == 1) | (labels == 0)).all():
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
コード例 #7
0
ファイル: labeler.py プロジェクト: Yashg19/enrique
def label_table(tbl, col_name, replace=True):
    """
    Label training data

    Works on a copy of `tbl`: the copy is opened with `mg.edit` so the label
    column can be filled in, then validated and wrapped in a new MTable.

    Parameters
    ----------
    tbl : MTable, Table to be labeled
    col_name : String, Name of the label column
    replace : Boolean, specifies whether the column with the given 'col_name'
        must be overwritten (reset to 0), if it already exists.
        [This option is currently experimental].

    Returns
    -------
    result : MTable, Table with labels, created with the key of `tbl` and
        given the `properties` object of `tbl`

    Raises
    ------
    AssertionError
        If the label column, once converted to int, contains anything other
        than 0 and 1.

    Notes
    -----
    The label value is expected to be only 0 or 1.
    """
    # NOTE(review): the imported `edit` is never used -- the editor is opened
    # via `mg.edit` below. Import retained in case it has side effects.
    from magellan.gui.mtable_gui import edit
    table = tbl.copy()

    column_exists = col_name in table.columns
    if column_exists and replace:
        logging.getLogger(__name__).warning(
            'Input table already contains column %s. ', col_name)
    if replace or not column_exists:
        # start from an all-zero label column
        table[col_name] = 0
    mg.edit(table)
    # NOTE(review): astype(int) truncates fractional labels (0.5 -> 0) before
    # they are validated -- confirm this is intended.
    table[col_name] = table[col_name].astype(int)
    # check that the table contains only 0s and 1s; an explicit raise keeps
    # the check alive under `python -O`, unlike `assert`
    is_binary = (table[col_name] == 1) | (table[col_name] == 0)
    if not is_binary.all():
        raise AssertionError(
            'The label column contains values other than 0 and 1')

    table = MTable(table, key=tbl.get_key())
    table.properties = tbl.properties
    return table
コード例 #8
0
ファイル: extractfeatures.py プロジェクト: Yashg19/enrique
def extract_feature_vecs(s, attrs_before=None, feature_table=None, attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table before the feature vector
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see: mg.get_features_for_blocking)
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table after the feature vector

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in feature_table) and attributes as
        mentioned in the input

    Raises
    ------
    AssertionError
        If the JVM is not started, or "s" has no 'ltable' / 'rtable' property.
    """
    # basic checks (explicit raises so they are not stripped under -O)
    if not isJVMStarted():
        raise AssertionError('JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    logger = logging.getLogger(__name__)
    l_key, r_key = s.get_property('foreign_key_ltable'), s.get_property('foreign_key_rtable')
    start = time.time()
    # Pair the two foreign-key columns directly: iterrows() builds a Series
    # per row, which is slow and can upcast the key values.
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    # The frames are indexed by key, so look the tuples up by label (.loc).
    feat_vals = [apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
                 for l_id, r_id in id_list]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)
    table = pd.DataFrame(feat_vals, index=s.index.values)
    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]
    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Inserting at position 0 in reverse keeps the caller's order;
        # reversed() avoids mutating the caller's list.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # NOTE(review): appending in reverse leaves these columns in the
        # opposite order to the caller's list; kept as-is so the output
        # layout does not change. reversed() avoids mutating the caller's list.
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors
コード例 #9
0
ファイル: extractfeatures.py プロジェクト: epaulson/enrique
def extract_feature_vecs(s,
                         attrs_before=None,
                         feature_table=None,
                         attrs_after=None):
    """
    Extract feature vectors

    Parameters
    ----------
    s : MTable,
        labeled virtual MTable or combined blocker output
    attrs_before : list, defaults to None
        List of attribute names from "s" to be included in output table before the feature vector
    feature_table : pandas DataFrame, defaults to None
        List of features to be applied (also see: mg.get_features_for_blocking)
    attrs_after : list, defaults to None
        List of attribute names from "s" to be included in output table after the feature vector

    Returns
    -------
    feature_vectors : MTable,
        Containing features values (obtained by applying feature fns in feature_table) and attributes as
        mentioned in the input

    Raises
    ------
    AssertionError
        If the JVM is not started, or "s" has no 'ltable' / 'rtable' property.
    """
    # basic checks; raised explicitly so they survive `python -O`
    if not isJVMStarted():
        raise AssertionError(
            'JVM should be started using init_jvm to compute features')
    ltable = s.get_property('ltable')
    rtable = s.get_property('rtable')
    if ltable is None:
        raise AssertionError('Left table is not set')
    if rtable is None:
        raise AssertionError('Right table is not set')

    if feature_table is None:
        feature_table = mg.get_features_for_blocking(ltable, rtable)

    logger = logging.getLogger(__name__)
    l_key = s.get_property('foreign_key_ltable')
    r_key = s.get_property('foreign_key_rtable')
    start = time.time()
    # Read the two foreign-key columns side by side instead of iterrows(),
    # which is slow and may upcast the key values row by row.
    id_list = list(zip(s[l_key], s[r_key]))
    end = time.time()
    logger.info('Iterating rows (%d) took %f secs', len(s), end - start)

    # compute feature values
    l_df = ltable.to_dataframe()
    r_df = rtable.to_dataframe()
    l_df.set_index(ltable.get_key(), inplace=True, drop=False)
    r_df.set_index(rtable.get_key(), inplace=True, drop=False)

    start = time.time()
    # Both frames are indexed by their key, so tuples are fetched by label.
    feat_vals = [
        apply_feat_fns(l_df.loc[l_id], r_df.loc[r_id], feature_table)
        for l_id, r_id in id_list
    ]
    end = time.time()
    logger.info('Applying feature functions took : %f secs', end - start)
    table = pd.DataFrame(feat_vals, index=s.index.values)
    # get the feature names and re-arrange columns in that order
    feat_names = list(feature_table['feature_name'])
    table = table[feat_names]
    # insert attrs_before
    if attrs_before:
        if not isinstance(attrs_before, list):
            attrs_before = [attrs_before]
        # Front-inserting in reverse preserves the caller's order; reversed()
        # leaves the caller's list untouched.
        for a in reversed(attrs_before):
            table.insert(0, a, s[a])
    table.insert(0, r_key, s[r_key])
    table.insert(0, l_key, s[l_key])

    # insert attrs after
    if attrs_after:
        if not isinstance(attrs_after, list):
            attrs_after = [attrs_after]
        # NOTE(review): these columns end up in the reverse of the caller's
        # order; that layout is kept unchanged here. reversed() leaves the
        # caller's list untouched.
        for a in reversed(attrs_after):
            table.insert(len(table.columns), a, s[a])
    # reset the table index
    table.reset_index(inplace=True, drop=True)

    feature_vectors = MTable(table)
    if s.get_key() not in feature_vectors.columns:
        feature_vectors.add_key(s.get_key())
    # metadata
    feature_vectors._metadata = s._metadata
    feature_vectors.properties = s.properties
    return feature_vectors