Exemple #1
0
def _gen_partition(weights, n):
    if isinstance(weights, int):
        weights = [1./weights]*weights
    elif isinstance(weights, (list, np.ndarray,)):
        if abs(sum(weights) - 1.) > 10E12:
            raise ValueError('weight should sum to 1.')
    else:
        msg = "{} is not valid type for weights".format(type(weights))
        raise ValueError(msg)

    k = len(weights)

    if n == 1:
        return [0], weights

    # there should be at least one instance of each of the components in
    # weights
    z = list(range(k))
    if n-k == 1:
        z += [pflip(weights, n=1)]
    elif n-k > 1:
        z += pflip(weights, n=n-k).tolist()

    assert min(z) == 0
    assert max(z) == k-1

    return z, weights
Exemple #2
0
def _sample_multi_col(model, col_idxs, given=None, n=1):
    """ Draw n joint samples for the columns in col_idxs

    Columns assigned to the same view (per ``model['col_assignment']``)
    share one component draw per sample; each column's value then comes
    from the ``DRAWFUNC`` entry for its dtype.

    Parameters
    ----------
    model : dict
        Must provide 'col_assignment' and 'dtypes', indexable by column.
    col_idxs : sequence
        The columns to sample; more than one is required.
    given : optional
        When not None, view weights come from ``_get_given_view_weights``
        instead of ``_get_view_weights``.
    n : int
        Number of samples.

    Returns
    -------
    numpy.ndarray
        Shape (n, len(col_idxs)) when ``n > 1``; otherwise the first row.
    """
    n_cols = len(col_idxs)
    assert n_cols > 1

    col_assignment = model['col_assignment']
    view_of = [col_assignment[col] for col in col_idxs]

    # output slot of each requested column
    slot = {col: pos for pos, col in enumerate(col_idxs)}

    # requested columns grouped by the view they belong to
    cols_by_view = {}
    for col, view in zip(col_idxs, view_of):
        cols_by_view.setdefault(view, []).append(col)

    out = np.zeros((n, n_cols))
    for row in range(n):
        for cols in cols_by_view.values():
            lead = cols[0]
            if given is not None:
                weights = _get_given_view_weights(model, lead, given)
            else:
                weights = _get_view_weights(model, lead)
            # one component per view, shared by all of its columns
            component_idx = pflip(weights)
            for col in cols:
                draw = DRAWFUNC[model['dtypes'][col]]
                out[row, slot[col]] = draw(model, col, component_idx)

    return out if n > 1 else out[0, :]
Exemple #3
0
def _sample_multi_col(model, col_idxs, given=None, n=1):
    """ Draw n joint samples for the columns in col_idxs

    Columns that share a view (per ``model['col_assignment']``) use the
    same component draw within a sample; each value is produced by the
    ``DRAWFUNC`` entry matching the column's dtype.

    Parameters
    ----------
    model : dict
        Must provide 'col_assignment' and 'dtypes', indexable by column.
    col_idxs : sequence
        The columns to sample; more than one is required.
    given : optional
        When not None, view weights come from ``_get_given_view_weights``
        instead of ``_get_view_weights``.
    n : int
        Number of samples.

    Returns
    -------
    numpy.ndarray
        Shape (n, len(col_idxs)) when ``n > 1``; otherwise the first row.
    """
    n_cols = len(col_idxs)
    assert n_cols > 1

    assignment = model['col_assignment']
    views = [assignment[c] for c in col_idxs]

    # where each requested column lands in the output row
    out_pos = {c: k for k, c in enumerate(col_idxs)}

    # view index -> requested columns in that view
    grouped = {}
    for c, v in zip(col_idxs, views):
        grouped.setdefault(v, []).append(c)

    result = np.zeros((n, n_cols))
    for r in range(n):
        for members in grouped.values():
            first = members[0]
            if given is not None:
                weights = _get_given_view_weights(model, first, given)
            else:
                weights = _get_view_weights(model, first)
            # a single component draw is shared across the view's columns
            component_idx = pflip(weights)
            for c in members:
                sampler = DRAWFUNC[model['dtypes'][c]]
                result[r, out_pos[c]] = sampler(model, c, component_idx)

    if n > 1:
        return result
    return result[0, :]
Exemple #4
0
def _sample_from_bivariate_discrete(p, n):
    """ Sample n (row, column) index pairs from the 2-D table p

    The table is flattened, ``pflip`` draws n flat indices from it, and
    each flat index is converted back to a (row, column) pair.

    Parameters
    ----------
    p : numpy.ndarray
        Two-dimensional array passed (flattened) to ``pflip`` as weights.
    n : int
        Number of pairs to draw.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, 2): row index, then column index.
    """
    _, width = p.shape
    flat_draws = pflip(p.flatten(), n=n)

    pairs = np.zeros((n, 2))
    for row, flat_idx in enumerate(flat_draws):
        # undo the row-major flattening
        pairs[row, 0] = int(flat_idx / width)
        pairs[row, 1] = int(flat_idx % width)

    return pairs
Exemple #5
0
def _sample_single_col(model, col_idx, given=None, n=1):
    """ Draw one value from the column at col_idx

    A component is chosen with ``pflip`` from the column's view weights
    (conditioned on `given` when it is not None), then the ``DRAWFUNC``
    entry for the column's dtype produces the value.

    Note: `n` is accepted but not used here; a single draw is returned.
    """
    if given is not None:
        weights = _get_given_view_weights(model, col_idx, given)
    else:
        weights = _get_view_weights(model, col_idx)

    component_idx = pflip(weights)
    draw = DRAWFUNC[model['dtypes'][col_idx]]
    return draw(model, col_idx, component_idx)
Exemple #6
0
def _sample_single_col(model, col_idx, given=None, n=1):
    """ Draw one value from the column at col_idx

    The view weights for the column (conditioned on `given` when it is
    not None) are passed to ``pflip`` to choose a component; the value is
    then produced by the ``DRAWFUNC`` entry for the column's dtype.

    Note: `n` is accepted but not used here; a single draw is returned.
    """
    weights = (
        _get_view_weights(model, col_idx)
        if given is None
        else _get_given_view_weights(model, col_idx, given)
    )
    component_idx = pflip(weights)
    sampler = DRAWFUNC[model['dtypes'][col_idx]]
    value = sampler(model, col_idx, component_idx)
    return value
Exemple #7
0
def gen_mixture_data(n, mprop=.1):
    """ Generate 2-feature mixture data

    Each of the n rows picks one of two components with ``pflip`` (weights
    0.3 and 0.7) and fills both features with normal draws centred on that
    component's mean (-1.0 or 3.0).

    Note: `mprop` is accepted but not used here.

    Returns
    -------
    pandas.DataFrame
        n rows with columns 'x_1' and 'x_2'.
    """
    weights = [0.3, 0.7]
    mu = [-1.0, 3.0]

    data = np.zeros((n, 2))
    for row in range(n):
        component = pflip(weights)
        data[row, :] = np.random.normal(mu[component], size=2)

    return pd.DataFrame(data, columns=['x_1', 'x_2'])
Exemple #8
0
def _sample_from_bivariate_discrete(p, n):
    """ Sample n (row, column) index pairs from the 2-D table p

    ``pflip`` draws n indices into the flattened table; each one is then
    split back into its row and column.

    Parameters
    ----------
    p : numpy.ndarray
        Two-dimensional array passed (flattened) to ``pflip`` as weights.
    n : int
        Number of pairs to draw.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n, 2): row index, then column index.
    """
    _, n_cols = p.shape
    draws = pflip(p.flatten(), n=n)

    out = np.zeros((n, 2))
    for k, flat in enumerate(draws):
        # undo the row-major flattening
        r = int(flat / n_cols)
        c = int(flat % n_cols)
        out[k, 0] = r
        out[k, 1] = c

    return out
Exemple #9
0
def gen_mixture_data(n, mprop=.1):
    """ Generate 2-feature mixture data

    For each of the n rows a component is chosen with ``pflip`` (weights
    0.3 and 0.7) and both features are drawn from a normal distribution
    centred on that component's mean (-1.0 or 3.0).

    Note: `mprop` is accepted but not used here.

    Returns
    -------
    pandas.DataFrame
        n rows with columns 'x_1' and 'x_2'.
    """
    weights = [0.3, 0.7]
    means = [-1.0, 3.0]

    values = np.zeros((n, 2))
    for idx in range(n):
        which = pflip(weights)
        values[idx, :] = np.random.normal(means[which], size=2)

    frame = pd.DataFrame(values, columns=['x_1', 'x_2'])
    return frame
Exemple #10
0
def _categorical_draw(params):
    """ Draw from a categorical distribution via pflip on ``params['p']`` """
    probs = params['p']
    return pflip(probs)
Exemple #11
0
    assert min(z) == 0
    assert max(z) == k-1

    return z, weights


# Dispatch tables keyed by column dtype name ('continuous' or 'categorical').

# Parameter function for each dtype (the functions are not shown in this
# section of the file).
PARAM_FUNCS = {
    'continuous': _continuous_params,
    'categorical': _categorical_params}

# Log-density function for each dtype. The categorical entry returns the log
# of alpha[xi] for every xi in x; the continuous entry is norm.logpdf
# (presumably scipy.stats.norm -- confirm against the file's imports).
LOGPDFS = {
    'categorical': lambda x, alpha: np.log([alpha[xi] for xi in x]),
    'continuous': norm.logpdf}

# Sampling function for each dtype. The categorical entry delegates to pflip
# with alpha as its only argument; the continuous entry is norm.rvs.
DRAW = {
    'categorical': lambda alpha: pflip(alpha),
    'continuous': norm.rvs}


# ---
class DataGenerator(object):
    """ Generate and store data and its generating distribution

    Attributes
    ----------
    dtypes : list(str)
        The datatype of each column, either 'continuous' or 'categorical'.
    df : pandas.DataFrame
        The generated data
    params : list(list(dict))
        The distribution parameters the generated the data in each column