def _gen_partition(weights, n): if isinstance(weights, int): weights = [1./weights]*weights elif isinstance(weights, (list, np.ndarray,)): if abs(sum(weights) - 1.) > 10E12: raise ValueError('weight should sum to 1.') else: msg = "{} is not valid type for weights".format(type(weights)) raise ValueError(msg) k = len(weights) if n == 1: return [0], weights # there should be at least one instance of each of the components in # weights z = list(range(k)) if n-k == 1: z += [pflip(weights, n=1)] elif n-k > 1: z += pflip(weights, n=n-k).tolist() assert min(z) == 0 assert max(z) == k-1 return z, weights
def _sample_multi_col(model, col_idxs, given=None, n=1):
    """Draw ``n`` joint samples for two or more columns.

    Columns are grouped by their entry in ``model['col_assignment']`` (their
    view). For every sample and every view a single component index is
    drawn with ``pflip`` and shared by all requested columns of that view.
    The weights come from the first requested column of the view, using
    ``_get_view_weights`` when ``given`` is None and
    ``_get_given_view_weights`` otherwise.

    Parameters
    ----------
    model : dict
        Must provide ``'col_assignment'`` and ``'dtypes'`` lookups by column
        index.
    col_idxs : sequence
        The columns to sample; must contain more than one entry.
    given : optional
        Passed through to ``_get_given_view_weights`` when not None.
    n : int
        Number of samples.

    Returns
    -------
    numpy.ndarray
        Shape ``(n, len(col_idxs))`` when ``n > 1``, otherwise the single
        row of length ``len(col_idxs)``. Column order follows ``col_idxs``.
    """
    n_cols = len(col_idxs)
    assert n_cols > 1

    assignments = [model['col_assignment'][c] for c in col_idxs]
    position_of = {c: pos for pos, c in enumerate(col_idxs)}

    # view index -> requested columns assigned to that view
    cols_by_view = {}
    for c, v in zip(col_idxs, assignments):
        cols_by_view.setdefault(v, []).append(c)

    samples = np.zeros((n, n_cols))
    for row in range(n):
        for cols in cols_by_view.values():
            lead = cols[0]
            if given is not None:
                weights = _get_given_view_weights(model, lead, given)
            else:
                weights = _get_view_weights(model, lead)

            # one component per view per sample, shared by its columns
            component_idx = pflip(weights)
            for c in cols:
                draw = DRAWFUNC[model['dtypes'][c]]
                samples[row, position_of[c]] = draw(model, c, component_idx)

    return samples if n > 1 else samples[0, :]
def _sample_multi_col(model, col_idxs, given=None, n=1):
    """Jointly sample several columns, ``n`` times.

    Requested columns that share a value in ``model['col_assignment']`` are
    sampled from the same component: for each sample, one component index
    per such group is chosen with ``pflip`` and then handed to the draw
    function selected from ``DRAWFUNC`` by each column's
    ``model['dtypes']`` entry.

    Parameters
    ----------
    model : dict
        Provides ``'col_assignment'`` and ``'dtypes'``, indexed by column.
    col_idxs : sequence
        Columns to sample (at least two).
    given : optional
        When not None, weights are obtained from
        ``_get_given_view_weights`` instead of ``_get_view_weights``.
    n : int
        Number of samples to draw.

    Returns
    -------
    numpy.ndarray
        An ``(n, len(col_idxs))`` array if ``n > 1``; otherwise just its
        first row.
    """
    n_cols = len(col_idxs)
    assert n_cols > 1

    views = [model['col_assignment'][idx] for idx in col_idxs]
    out_pos = dict(zip(col_idxs, range(n_cols)))

    # Group the requested columns by view, keeping first-seen order.
    grouped = {}
    for idx, view in zip(col_idxs, views):
        if view in grouped:
            grouped[view].append(idx)
        else:
            grouped[view] = [idx]
    groups = list(grouped.values())

    samples = np.zeros((n, n_cols))
    for s in range(n):
        for members in groups:
            first = members[0]
            if given is None:
                weights = _get_view_weights(model, first)
            else:
                weights = _get_given_view_weights(model, first, given)
            component_idx = pflip(weights)

            for idx in members:
                sampler = DRAWFUNC[model['dtypes'][idx]]
                value = sampler(model, idx, component_idx)
                samples[s, out_pos[idx]] = value

    if n > 1:
        return samples
    return samples[0, :]
def _sample_from_bivariate_discrete(p, n):
    """Sample ``n`` (row, column) index pairs from a 2-D weight table.

    The table is flattened, flat indices are drawn with ``pflip``, and each
    flat index is converted back to its row and column.

    Parameters
    ----------
    p : numpy.ndarray
        Two-dimensional array of weights passed (flattened) to ``pflip``.
    n : int
        Number of pairs to draw.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 2)``; column 0 holds the row index and
        column 1 the column index of each draw.
    """
    # Unpacking also enforces that p is two-dimensional.
    _, n_cols = p.shape

    # pflip is used elsewhere in this file as returning a bare index for
    # n=1, which cannot be iterated; normalise to a 1-D array so a single
    # draw is handled like any other.
    flat_idxs = np.atleast_1d(pflip(p.flatten(), n=n))

    # Row-major layout: flat = row * n_cols + col.
    rows, cols = np.divmod(flat_idxs, n_cols)

    x = np.zeros((n, 2))
    x[:, 0] = rows
    x[:, 1] = cols
    return x
def _sample_single_col(model, col_idx, given=None, n=1):
    """Draw a value from the column at ``col_idx``.

    A component index is chosen with ``pflip`` from the column's view
    weights (``_get_view_weights``, or ``_get_given_view_weights`` when
    ``given`` is supplied), then the draw function registered in
    ``DRAWFUNC`` for the column's ``model['dtypes']`` entry produces the
    value.

    Parameters
    ----------
    model : dict
        Provides ``'dtypes'`` indexed by column.
    col_idx
        The column to sample.
    given : optional
        Forwarded to ``_get_given_view_weights`` when not None.
    n : int
        Accepted for signature parity; not used by this function, which
        performs a single draw.

    Returns
    -------
    The value returned by the column's draw function.
    """
    if given is not None:
        weights = _get_given_view_weights(model, col_idx, given)
    else:
        weights = _get_view_weights(model, col_idx)

    component = pflip(weights)
    draw = DRAWFUNC[model['dtypes'][col_idx]]
    return draw(model, col_idx, component)
def gen_mixture_data(n, mprop=.1):
    """Generate 2-feature data from a two-component normal mixture.

    Each row picks a component with ``pflip`` (weights 0.3 and 0.7) and
    draws both features from ``np.random.normal`` centred on that
    component's mean (-1.0 or 3.0).

    Parameters
    ----------
    n : int
        Number of rows to generate.
    mprop : float
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``'x_1'`` and ``'x_2'``.
    """
    weights = [0.3, 0.7]
    mu = [-1.0, 3.0]

    x = np.zeros((n, 2))
    for i in range(n):
        # both features of a row share the same component mean
        component = pflip(weights)
        x[i, :] = np.random.normal(mu[component], size=2)

    return pd.DataFrame(x, columns=['x_1', 'x_2'])
def _sample_from_bivariate_discrete(p, n):
    """Draw ``n`` index pairs from a two-dimensional table of weights.

    ``p`` is flattened and handed to ``pflip``; every flat index that comes
    back is split into the row and column it addresses in ``p``.

    Parameters
    ----------
    p : numpy.ndarray
        Two-dimensional weight array.
    n : int
        Number of draws.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n, 2)`` with the row index in column 0 and
        the column index in column 1.
    """
    # Fails early if p is not 2-D; only the column count is needed.
    _, n_cols = p.shape

    # Other call sites in this file treat pflip(..., n=1) as a single
    # index rather than an array, so make the result 1-D before using it;
    # otherwise a single draw could not be iterated or sliced.
    samples = np.atleast_1d(pflip(p.flatten(), n=n))

    x = np.zeros((n, 2))
    # Integer floor-division/modulo recover (row, col) from a row-major
    # flat index without a float round trip.
    x[:, 0] = samples // n_cols
    x[:, 1] = samples % n_cols
    return x
def gen_mixture_data(n, mprop=.1):
    """Build a DataFrame of two-feature mixture data.

    For every row a component is selected with ``pflip`` using weights
    ``[0.3, 0.7]``; the row's two values are then drawn from
    ``np.random.normal`` with the selected mean (``-1.0`` or ``3.0``).

    Parameters
    ----------
    n : int
        Number of rows.
    mprop : float
        Present in the signature but not read by the function body.

    Returns
    -------
    pandas.DataFrame
        The generated data, columns named ``'x_1'`` and ``'x_2'``.
    """
    component_weights = [0.3, 0.7]
    component_means = [-1.0, 3.0]

    data = np.zeros((n, 2))
    for row in data:
        # row is a view into data, so the slice assignment fills it in place
        k = pflip(component_weights)
        row[:] = np.random.normal(component_means[k], size=2)

    df = pd.DataFrame(data)
    df.columns = ['x_1', 'x_2']
    return df
def _categorical_draw(params):
    """Return a ``pflip`` draw using the weights stored under ``params['p']``."""
    weights = params['p']
    return pflip(weights)
assert min(z) == 0 assert max(z) == k-1 return z, weights PARAM_FUNCS = { 'continuous': _continuous_params, 'categorical': _categorical_params} LOGPDFS = { 'categorical': lambda x, alpha: np.log([alpha[xi] for xi in x]), 'continuous': norm.logpdf} DRAW = { 'categorical': lambda alpha: pflip(alpha), 'continuous': norm.rvs} # --- class DataGenerator(object): """ Generate and store data and its generating distribution Attributes ---------- dtypes : list(str) The datatype of each column, either 'continuous' or 'categorical'. df : pandas.DataFrame The generated data params : list(list(dict)) The distribution parameters the generated the data in each column