def test_engine_with_mapper_stl(gendf):
    """Engine should work when given a plain, serial ``map``-style mapper."""

    def serial_map(func, args):
        # Eagerly apply ``func`` to each item, like ``list(map(func, args))``.
        return [func(arg) for arg in args]

    eng = Engine(gendf(), n_models=4, mapper=serial_map)
    eng.init_models()
    eng.run(2)

    n_models = len(eng.models)
    assert n_models == 4
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Generate an all-continuous dataset and fit an engine to it.

    Returns the ``DataGenerator`` together with an ``Engine`` built on its
    ``df`` that has been initialized with ``n_models`` models and run for
    ``n_iter`` iterations (multiprocessing disabled).
    """
    col_types = ['continuous']*n_cols
    generator = DataGenerator(n_rows, col_types, cat_weights=n_cats,
                              cat_sep=cat_sep, seed=1337)

    fitted = Engine(generator.df, use_mp=False)
    fitted.init_models(n_models)
    fitted.run(n_iter)

    return generator, fitted
def test_engine_run_smoke_multiple(gendf):
    """Smoke test: calling ``run`` repeatedly keeps all ten models."""
    eng = Engine(gendf(), n_models=10, use_mp=False)
    eng.init_models()

    # First with the default iteration count, then with an explicit one.
    eng.run()
    eng.run(10)

    n_models = len(eng.models)
    assert n_models == 10
def test_engine_with_mapper_ipyparallel(gendf):
    """Engine should work with an ipyparallel direct-view ``map`` as mapper.

    The client is closed in a ``finally`` block so its connections are
    released even when the engine raises or the assertion fails.
    """
    c = ipp.Client()
    try:
        v = c[:]  # view over all engines of the cluster

        engine = Engine(gendf(), n_models=4, mapper=v.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
    finally:
        c.close()
def test_engine_with_mapper_mp(gendf):
    """Engine should work with a multiprocessing ``Pool.map`` as mapper.

    Only one pool is created, and it is managed by the ``with`` statement so
    its worker processes are cleaned up when the block exits.
    """
    with Pool() as pool:
        engine = Engine(gendf(), n_models=4, mapper=pool.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
def gen_comp_engines(df, subsample_size):
    """Build two 8-model engines on ``df`` for comparison.

    The first is initialized with default settings, the second with the given
    ``subsample_size``. Both are run for 100 iterations.

    Returns:
        ``(full_engine, subsampled_engine)``
    """
    full = Engine(df, n_models=8, use_mp=False)
    full.init_models()
    full.run(100)

    subsampled = Engine(df, n_models=8, use_mp=False)
    subsampled.init_models(subsample_size=subsample_size)
    subsampled.run(100)

    return full, subsampled
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Create continuous test data and an engine already run on it.

    The data come from a ``DataGenerator`` with a fixed seed (1337). The
    engine is built without multiprocessing, initialized with ``n_models``
    models and run for ``n_iter`` iterations.

    Returns:
        ``(data_generator, engine)``
    """
    column_kinds = ['continuous'] * n_cols
    datagen = DataGenerator(n_rows, column_kinds, cat_weights=n_cats,
                            cat_sep=cat_sep, seed=1337)

    eng = Engine(datagen.df, use_mp=False)
    eng.init_models(n_models)
    eng.run(n_iter)

    return datagen, eng
def run(n_models=10, n_iter=200, iter_step=10, n_needles=2, n_distractors=8,
        n_rows=100, pairtype=None, pair_kws=None):
    """Plot dependence probability over inference time for needle pairs
    versus needle/distractor pairs.

    Needle pairs are the column index pairs ``(2*i, 2*i + 1)``. Up to 32
    (needle column, distractor column) pairs are sampled at random for
    comparison. The engine is advanced ``iter_step`` iterations at a time and
    the dependence probability of every tracked pair is recorded after each
    step. Finally the traces and a dependence-probability heatmap are shown.
    """
    n_needle_cols = n_needles*2
    needle_idxs = [(2*k, 2*k + 1) for k in range(n_needles)]
    needle_cols = list(range(n_needle_cols))
    distractor_cols = list(range(n_needle_cols, n_needle_cols + n_distractors))

    # All (needle, distractor) column combinations; keep at most 32 of them.
    candidates = list(it.product(needle_cols, distractor_cols))
    distractor_idxs = random.sample(candidates, min(len(candidates), 32))

    df = _gen_data(n_needles, n_distractors, n_rows, pairtype, pair_kws)

    engine = Engine(df, n_models=n_models)
    engine.init_models()

    # Transition list without the column_alpha transition.
    tlist = [b'row_assignment', b'column_assignment', b'row_alpha',
             b'column_hypers']

    n_steps = int(n_iter/iter_step)
    n_records = n_steps + 1
    needle_dps = np.zeros((n_needles, n_records))
    distractor_dps = np.zeros((len(distractor_idxs), n_records))

    for step in range(n_records):
        engine.run(iter_step, trans_kwargs={'transition_list': tlist})

        for row, (col_a, col_b) in enumerate(needle_idxs):
            needle_dps[row, step] = engine.dependence_probability(
                df.columns[col_a], df.columns[col_b])

        for row, (col_a, col_b) in enumerate(distractor_idxs):
            distractor_dps[row, step] = engine.dependence_probability(
                df.columns[col_a], df.columns[col_b])

    iter_count = np.cumsum([1] + [iter_step]*n_steps)

    # Distractor traces in the background, needle traces highlighted.
    for trace in distractor_dps:
        plt.plot(iter_count, trace, color='gray', alpha=.3)
    for trace in needle_dps:
        plt.plot(iter_count, trace, color='crimson')

    plt.ylim([-.05, 1.05])
    plt.xlim([1, iter_count[-1]])
    plt.show()

    engine.heatmap('dependence_probability')
    plt.show()
def test_view_alpha_should_change_if_transition(gendf):
    """With the default transitions, ``view_alphas`` should differ after a run."""
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']
    eng.run(10)
    after = eng._models[0]['view_alphas']

    assert before != after
def engine():
    """Return an engine fitted to a small 12-column random dataset.

    Columns ``c_0`` and ``c_1`` are the same normal draws (``c_1`` shifted by
    one); the other ten columns are independent uniform draws. The engine is
    initialized with 8 models and run for 20 iterations.
    """
    base = np.random.randn(30)
    columns = [pd.Series(base), pd.Series(base + 1.0)]
    for _ in range(10):
        columns.append(pd.Series(np.random.rand(30)))

    df = pd.concat(columns, axis=1)
    df.columns = ["c_%d" % idx for idx in range(12)]

    fitted = Engine(df)
    fitted.init_models(8)
    fitted.run(20)
    return fitted
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None):
    """Compare estimated and true mutual information over a range of rho values.

    For every rho, ``n_times`` two-column datasets of ``n`` rows are sampled, a
    single-model engine is run on each for ``n_iter`` iterations, and the
    (un-normed) mutual information between ``x_1`` and ``x_2`` is estimated.

    Args:
        n_times: Number of repetitions per rho.
        n_grid: Number of categorical values (categorical case only).
        n: Number of rows per dataset.
        n_iter: Inference iterations per engine.
        vartype: ``"continuous"`` or ``"categorical"``.
        ax: Optional axes to plot onto.

    Returns:
        ``(mis, true_mis)`` when ``ax`` is None, where ``mis`` has shape
        ``(n_times, n_rhos)``. When ``ax`` is given the results are plotted
        onto it and nothing is returned.

    Raises:
        ValueError: If ``vartype`` is not one of the supported values.
    """
    rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9]
    true_mis = np.zeros(len(rhos))
    mis = np.zeros((n_times, len(rhos)))

    for i, rho in enumerate(rhos):
        print("Rho: %1.1f" % (rho,))

        if vartype == "categorical":
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                "x_1": {"dtype": "categorical", "values": list(range(n_grid))},
                "x_2": {"dtype": "categorical", "values": list(range(n_grid))},
            }
        elif vartype == "continuous":
            true_mi = -0.5 * log(1.0 - rho ** 2.0)
            metadata = {}
        else:
            raise ValueError("invalid vartype")

        # The true value depends only on rho, so record it once per rho. This
        # also keeps it populated when n_times == 0.
        true_mis[i] = true_mi

        for t in range(n_times):
            # vartype was validated above; only two cases remain here.
            if vartype == "categorical":
                x = _sample_from_bivariate_discrete(p, n)
            else:
                sigma = np.array([[1, rho], [rho, 1]])
                mu = np.zeros(2)
                x = np.random.multivariate_normal(mu, sigma, size=n)

            df = pd.DataFrame(x, columns=["x_1", "x_2"])

            engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            mis[t, i] = engine.mutual_information("x_1", "x_2", n_samples=500,
                                                  normed=False)

    if ax is not None:
        ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0),
                    label="BaxCat")
        ax.plot(rhos, true_mis, label="True")

        ax.set_xlabel("rho")
        ax.set_ylabel("Mutual Information")
        ax.set_title(vartype)
        ax.legend(loc=0)
    else:
        return mis, true_mis
def test_logp_scaling(df):
    """The peak of p(t | x=1, y=2) should match a Normal(5, .5) pdf peak.

    The engine's conditional density is evaluated on a grid over [3, 7],
    plotted against the reference pdf, and the two maxima must agree to
    within .05.
    """
    eng = Engine(df)
    eng.init_models(8)
    eng.run(500)

    grid = np.linspace(3, 7, 200)
    p_true = norm.pdf(grid, loc=5., scale=.5)

    conditions = [('x', 1), ('y', 2)]
    lp_baxcat = eng.probability(grid[:, np.newaxis], ['t'], given=conditions)

    inftest_plot(grid, p_true, np.exp(lp_baxcat), 'p_t-xy', RESDIR)

    peak_gap = abs(max(p_true) - max(np.exp(lp_baxcat)))
    assert peak_gap < .05
def test_view_alpha_should_not_change_if_no_transition(gendf):
    """``view_alphas`` should be unchanged when the run is restricted to the
    ``row_assignment`` and ``column_alpha`` transitions."""
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']

    transitions = [b'row_assignment', b'column_alpha']
    eng.run(10, trans_kwargs={'transition_list': transitions})

    after = eng._models[0]['view_alphas']

    assert before == after
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Generate an all-categorical dataset and fit an engine to it.

    Every column is declared categorical with values 0 through 4 in the
    metadata passed to the engine. The engine is initialized with
    ``n_models`` models and run for ``n_iter`` iterations.

    Returns:
        ``(data_generator, engine)``
    """
    datagen = DataGenerator(n_rows, ['categorical'] * n_cols,
                            cat_weights=n_cats, cat_sep=cat_sep, seed=1337)

    # One metadata entry (the same dict object) is shared by every column.
    col_md = {'dtype': 'categorical', 'values': [0, 1, 2, 3, 4]}
    md = {col: col_md for col in range(n_cols)}

    eng = Engine(datagen.df, metadata=md, use_mp=False)
    eng.init_models(n_models)
    eng.run(n_iter)

    return datagen, eng
def engine():
    """Build and run an engine on a small random 12-column table.

    The first two columns share the same normal draws (the second shifted by
    one). Ten further columns hold independent uniform draws. The engine gets
    8 models and 20 iterations.
    """
    shared = np.random.randn(30)
    first = pd.Series(shared)
    second = pd.Series(shared + 1.0)
    noise = [pd.Series(np.random.rand(30)) for _ in range(10)]

    df = pd.concat([first, second] + noise, axis=1)
    df.columns = ['c_%d' % k for k in range(12)]

    eng = Engine(df)
    eng.init_models(8)
    eng.run(20)
    return eng
def onerun(shapefunc, n=250, n_iter=100, n_models=8, subsample_size=None):
    """Fit an engine to the 2-D data from ``shapefunc(n)`` and sample from it.

    Returns:
        ``(xo, yo, xe, ye)``: the original x/y data followed by the x/y
        columns of ``n`` samples drawn from the fitted engine.
    """
    xo, yo = shapefunc(n)

    df = pd.concat([pd.Series(xo), pd.Series(yo)], axis=1)
    df.columns = ['x', 'y']

    eng = Engine(df, n_models=n_models, use_mp=True)
    eng.init_models(subsample_size=subsample_size)
    eng.run(n_iter)

    samples = eng.sample(['x', 'y'], n=n)
    return xo, yo, samples[:, 0], samples[:, 1]
def test_run_with_checkpoint_valid_diagnostic_output(gendf):
    """Running 10 iterations with ``checkpoint=5`` should leave one diagnostic
    table per model, each with three entries carrying the expected keys."""
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()
    eng.run(10, checkpoint=5)

    tables = eng._diagnostic_tables
    assert len(tables) == 5

    expected_keys = ('log_score', 'iters', 'time')
    for table in tables:
        assert len(table) == 3
        for entry in table:
            for key in expected_keys:
                assert key in entry
def test_run_on_model_subset_should_only_run_those_models(gendf):
    """A second run limited to models 1 and 2 should extend only their
    diagnostic tables."""
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()

    eng.run(10, checkpoint=5)
    eng.run(10, checkpoint=5, model_idxs=[1, 2])

    tables = eng._diagnostic_tables
    assert len(tables) == 5

    # Models 1 and 2 were run twice, so their tables hold two extra entries.
    expected_lengths = [3, 5, 5, 3, 3]
    for idx, expected in enumerate(expected_lengths):
        assert len(tables[idx]) == expected
def test_dependence_probability():
    """Two columns built from the same draws should be judged more dependent
    on each other than either is on an unrelated column."""
    base = np.random.randn(30)
    related_a = pd.Series(base)
    related_b = pd.Series(base + 1.0)
    unrelated = pd.Series(np.random.rand(30))

    df = pd.concat([related_a, related_b, unrelated], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    eng = Engine(df, n_models=20, use_mp=False)
    eng.init_models()
    eng.run(10)

    dp_related = eng.dependence_probability('c0', 'c1')
    dp_first_unrelated = eng.dependence_probability('c0', 'c2')
    dp_second_unrelated = eng.dependence_probability('c1', 'c2')

    assert dp_related > dp_first_unrelated
    assert dp_related > dp_second_unrelated
def test_pairwise_dependence_probability():
    """The pairwise dependence-probability table should have a unit diagonal
    and be symmetric.

    Cells are read positionally with ``.iloc``; the ``.ix`` indexer used
    previously no longer exists in current pandas.
    """
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run(5)

    depprob = engine.pairwise_func('dependence_probability')

    # Every column is fully dependent on itself.
    assert depprob.iloc[0, 0] == 1.
    assert depprob.iloc[1, 1] == 1.
    assert depprob.iloc[2, 2] == 1.

    # Dependence probability does not depend on argument order.
    assert depprob.iloc[0, 1] == depprob.iloc[1, 0]
    assert depprob.iloc[0, 2] == depprob.iloc[2, 0]
    assert depprob.iloc[1, 2] == depprob.iloc[2, 1]
def gen_engine_half(df):
    """Return a 4-model engine on ``df`` initialized with
    ``subsample_size=0.5`` and run for 10 iterations."""
    half = Engine(df, n_models=4, use_mp=False)
    half.init_models(subsample_size=0.5)
    half.run(10)
    return half
fname = func.__name__ s = pd.Series([fname]*n) df = pd.concat([s, pd.Series(xo), pd.Series(yo)], axis=1) df.columns = ['func', 'x', 'y'] dfs.append(df) ax = axes[0, i] ax.scatter(xo, yo, color='crimson', alpha=.3) ax = axes[1, i] ax.scatter(xe, ye, color='gray', alpha=.3) ax.set_xlim(axes[0, i].get_xlim()) ax.set_ylim(axes[0, i].get_ylim()) df = pd.concat(dfs, ignore_index=True) engine = Engine(df, n_models=8) engine.init_models() engine.run(1000, checkpoint=20) dfs = [] for i, func in enumerate(funcs): func_name = func.__name__ x = engine.sample(['x', 'y'], given=[('func', func_name)], n=n) ax = axes[2, i] ax.scatter(x[:, 0], x[:, 1], color='navy', alpha=.3) ax.set_xlim(axes[0, i].get_xlim()) ax.set_ylim(axes[0, i].get_ylim()) plt.show()
def gen_engine(df):
    """Return a 2-model engine on ``df`` after 10 iterations (no
    multiprocessing)."""
    fitted = Engine(df, n_models=2, use_mp=False)
    fitted.init_models()
    fitted.run(10)
    return fitted
def gen_engine_full(df):
    """Return a 4-model engine on ``df`` with default initialization, run for
    10 iterations."""
    full = Engine(df, n_models=4, use_mp=False)
    full.init_models()
    full.run(10)
    return full
x = np.random.randn() * std + mu data.append([x, a, b]) return pd.DataFrame(data) n_rows = 100 n_cols = 32 da = gen_phenotype_data(n_rows) db = pd.DataFrame(np.random.randint(3, size=( n_rows, n_cols, ))) df = pd.concat([da, db], axis=1) df.columns = ['T', 'A', 'B'] + ['x_%d' % i for i in range(n_cols)] engine = Engine(df, n_models=32) engine.init_models() engine.run(100) for col in df.columns: if col != 'T': print("1/H(%s|T) = %f" % (col, 1 / engine.conditional_entropy(col, 'T'))) engine.heatmap('dependence_probability') plt.show()
fname = func.__name__ s = pd.Series([fname] * n) df = pd.concat([s, pd.Series(xo), pd.Series(yo)], axis=1) df.columns = ['func', 'x', 'y'] dfs.append(df) ax = axes[0, i] ax.scatter(xo, yo, color='crimson', alpha=.3) ax = axes[1, i] ax.scatter(xe, ye, color='gray', alpha=.3) ax.set_xlim(axes[0, i].get_xlim()) ax.set_ylim(axes[0, i].get_ylim()) df = pd.concat(dfs, ignore_index=True) engine = Engine(df, n_models=8) engine.init_models() engine.run(1000, checkpoint=20) dfs = [] for i, func in enumerate(funcs): func_name = func.__name__ x = engine.sample(['x', 'y'], given=[('func', func_name)], n=n) ax = axes[2, i] ax.scatter(x[:, 0], x[:, 1], color='navy', alpha=.3) ax.set_xlim(axes[0, i].get_xlim()) ax.set_ylim(axes[0, i].get_ylim()) plt.show()
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype='continuous', ax=None):
    """Estimate mutual information for several rho values and compare it with
    the true value.

    For each rho, ``n_times`` datasets with columns ``x_1`` and ``x_2`` and
    ``n`` rows are drawn. A single-model engine is run on each for ``n_iter``
    iterations and its un-normed mutual information estimate is stored.

    Args:
        n_times: Repetitions per rho.
        n_grid: Number of categorical values (used for ``'categorical'``).
        n: Rows per dataset.
        n_iter: Inference iterations per engine.
        vartype: ``'continuous'`` or ``'categorical'``.
        ax: Optional axes; when given, results are plotted instead of returned.

    Returns:
        ``(mis, true_mis)`` if ``ax`` is None, with ``mis`` of shape
        ``(n_times, n_rhos)``; otherwise nothing.

    Raises:
        ValueError: For an unsupported ``vartype``.
    """
    rhos = [.1, .25, .4, .5, .75, .9]
    true_mis = np.zeros(len(rhos))
    mis = np.zeros((n_times, len(rhos)))

    for i, rho in enumerate(rhos):
        print('Rho: %1.1f' % (rho, ))

        if vartype == 'categorical':
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                'x_1': {
                    'dtype': 'categorical',
                    'values': list(range(n_grid)),
                },
                'x_2': {
                    'dtype': 'categorical',
                    'values': list(range(n_grid)),
                },
            }
        elif vartype == 'continuous':
            true_mi = -.5 * log(1. - rho**2.)
            metadata = {}
        else:
            raise ValueError('invalid vartype')

        # Depends only on rho: store once per rho rather than once per trial,
        # so it is filled in even when n_times == 0.
        true_mis[i] = true_mi

        for t in range(n_times):
            # vartype has already been validated for this rho.
            if vartype == 'categorical':
                x = _sample_from_bivariate_discrete(p, n)
            else:
                sigma = np.array([[1, rho], [rho, 1]])
                mu = np.zeros(2)
                x = np.random.multivariate_normal(mu, sigma, size=n)

            df = pd.DataFrame(x, columns=['x_1', 'x_2'])

            engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            mis[t, i] = engine.mutual_information('x_1', 'x_2', n_samples=500,
                                                  normed=False)

    if ax is not None:
        ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0),
                    label='BaxCat')
        ax.plot(rhos, true_mis, label='True')

        ax.set_xlabel('rho')
        ax.set_ylabel('Mutual Information')
        ax.set_title(vartype)
        ax.legend(loc=0)
    else:
        return mis, true_mis
import seaborn as sns

from baxcat.engine import Engine

# Three-component mixture: two unit-variance clusters around -6 and +6 and a
# wider one (std 3) around 0.
x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100) * 3,
    np.random.randn(100) + 6,
))

s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

# Left panel: engine initialized with default settings.
engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)
y = engine.sample('x', n=300)

plt.subplot(1, 2, 1)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
plt.xlim([-10, 10])

# Right panel: engine initialized with a subsample size of .5. The value is
# passed by keyword so it cannot be mistaken for a different positional
# parameter of init_models (elsewhere the first positional argument is used
# for the number of models).
engine_sub = Engine(df, n_models=8)
engine_sub.init_models(subsample_size=.5)
engine_sub.run(100)
y = engine_sub.sample('x', n=300)

plt.subplot(1, 2, 2)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
s_a2 = pd.Series(np.random.randn(n) - 2.)

s_b1 = pd.Series(np.ones(n, dtype=int))
s_b2 = pd.Series(np.random.randn(n) + 2.)

# Stack the two (label, x) groups on top of each other.
df = pd.concat([pd.concat([s_a1, s_a2], axis=1),
                pd.concat([s_b1, s_b2], axis=1)], axis=0)
assert df.shape == (2*n, 2,)

df.columns = ['label', 'x']

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

# Evaluation grid as a column vector.
x = np.linspace(-6., 6., 200)[np.newaxis].T

# Marginal density of x, and the two label-conditional densities, each
# weighted by .5.
p_01 = np.exp(engine.probability(x, ['x']))
p_0 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 0,)]))
p_1 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 1,)]))

plt.figure(figsize=(4, 4,))
# `density=True` replaces the `normed=True` keyword, which current
# matplotlib no longer accepts.
plt.hist(df['x'], 31, histtype='stepfilled', color='#aaaaaa',
         edgecolor='None', density=True)
plt.plot(x.flatten(), p_0, label='p(x|label=0)')
plt.plot(x.flatten(), p_1, label='p(x|label=1)')
plt.plot(x.flatten(), p_01, ls='--', label='p(x)')
plt.xlabel('x')
plt.ylabel('PDF')
std = 1. for i in range(n_rows): a = np.random.randint(3) b = np.random.randint(3) mu = mus[a, b] x = np.random.randn()*std + mu data.append([x, a, b]) return pd.DataFrame(data) n_rows = 100 n_cols = 32 da = gen_phenotype_data(n_rows) db = pd.DataFrame(np.random.randint(3, size=(n_rows, n_cols,))) df = pd.concat([da, db], axis=1) df.columns = ['T', 'A', 'B'] + ['x_%d' % i for i in range(n_cols)] engine = Engine(df, n_models=32) engine.init_models() engine.run(100) for col in df.columns: if col != 'T': print("1/H(%s|T) = %f" % (col, 1/engine.conditional_entropy(col, 'T'))) engine.heatmap('dependence_probability') plt.show()
s_b1 = pd.Series(np.ones(n, dtype=int)) s_b2 = pd.Series(np.random.randn(n) + 2.) df = pd.concat( [pd.concat([s_a1, s_a2], axis=1), pd.concat([s_b1, s_b2], axis=1)], axis=0) assert df.shape == ( 2 * n, 2, ) df.columns = ['label', 'x'] engine = Engine(df, n_models=8) engine.init_models() engine.run(200) x = np.linspace(-6., 6., 200)[np.newaxis].T p_01 = np.exp(engine.probability(x, ['x'])) p_0 = .5 * np.exp(engine.probability(x, ['x'], given=[( 'label', 0, )])) p_1 = .5 * np.exp(engine.probability(x, ['x'], given=[( 'label', 1, )])) plt.figure(figsize=( 4,
# how to model each column.
engine = Engine(df, n_models=32)

# `col_info` shows how baxcat decided to model each column.
col_info = engine.col_info()
print(col_info)

# Inference has two steps: `init_models` initializes the cross-categorization
# states and `run` performs the inference. Each model is a draw from the
# posterior, and inferences about the data are made with respect to the
# posterior distribution of states, so several models are used to hedge
# those inferences.
print('Initializing 32 models...')
engine.init_models()

print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)

# Convergence check: plot each model's log score as a function of time and
# make sure all of them have leveled out.
engine.convergence_plot()
plt.show()

# Dependence between columns is shown as an n_cols by n_cols matrix in which
# each cell is the dependence probability between two columns. The
# dependence probability is only the probability that a dependence exists,
# not the strength of that dependence.
heatmap_size = (10, 10)
engine.heatmap('dependence_probability',
               plot_kwargs={'figsize': heatmap_size})
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

from baxcat.engine import Engine

# Data: 100 draws each from unit-variance normals shifted to -6 and +6, plus
# 100 draws from a zero-centered normal scaled by 3.
x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100)*3,
    np.random.randn(100) + 6,))

s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

# Default initialization.
engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)
y = engine.sample('x', n=300)

plt.subplot(1, 2, 1)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
plt.xlim([-10, 10])

# Initialization with a subsample size of .5. It is passed by keyword so the
# value is bound to `subsample_size` regardless of the positional order of
# init_models' parameters.
engine_sub = Engine(df, n_models=8)
engine_sub.init_models(subsample_size=.5)
engine_sub.run(100)
y = engine_sub.sample('x', n=300)

plt.subplot(1, 2, 2)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
# how to model each column.
engine = Engine(df, n_models=32)

# Check `col_info` to see how baxcat decided to model each column.
col_info = engine.col_info()
print(col_info)

# To do inference, first initialize some cross-categorization states with
# `init_models`, then `run` the inference. Many models are initialized to
# hedge the inferences: every model is a draw from the posterior, and
# inferences about the data are made given the posterior distribution of
# states.
print('Initializing 32 models...')
engine.init_models()

print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)

# To check whether inference has converged, plot the log score of each model
# as a function of time and make sure they have all leveled out.
engine.convergence_plot()
plt.show()

# Both heatmaps below use the same figure size.
heatmap_kwargs = {'figsize': (10, 10)}

# Column dependence: an n_cols by n_cols matrix where each cell is the
# dependence probability between two columns. This is the probability that a
# dependence exists, not the strength of the dependence.
engine.heatmap('dependence_probability', plot_kwargs=dict(heatmap_kwargs))
plt.show()

# The same kind of plot for the 'row_similarity' function.
engine.heatmap('row_similarity', plot_kwargs=dict(heatmap_kwargs))
plt.show()
def row_to_img(df, row_idx):
    """Return row ``row_idx`` of ``df`` as a 28x28 array.

    The first column is skipped; the remaining values of the row are
    reshaped to ``(28, 28)``.
    """
    pixels = df.iloc[row_idx, 1:].values.reshape((28, 28))
    return pixels


# This file is meant to be executed as a script, not imported.
assert __name__ == "__main__"

exdir = os.path.dirname(os.path.realpath(__file__))

df = pd.read_csv(os.path.join(exdir, "mnist.csv.gz"), compression="gzip")
df = df.sample(2000)

# Hold out the labels of the last 500 sampled rows as test data and blank
# them in the training frame. The held-out labels are copied first so that
# blanking the frame cannot alter them, and the frame is modified through a
# single positional indexer instead of a chained assignment, which pandas
# may apply to a temporary copy.
label_pos = df.columns.get_loc("label")
testdata = df["label"].iloc[1500:].copy()
# Cast explicitly so the column can hold NaN without an implicit upcast.
df["label"] = df["label"].astype(float)
df.iloc[1500:, label_pos] = float("NaN")

engine = Engine(df)
engine.init_models(4)
engine.run(1000, checkpoint=4, verbose=True)

engine.convergence_plot()
plt.show()

_, m = engine.eval(testdata, metric=Accuracy())
print("Accuracy = %f" % (m,))

# engine.heatmap('row_similarity')
# plt.show()

# engine.heatmap('dependence_probability')
# plt.show()
28, 28, )) return pixels assert __name__ == "__main__" exdir = os.path.dirname(os.path.realpath(__file__)) df = pd.read_csv(os.path.join(exdir, 'mnist.csv.gz'), compression='gzip') df = df.sample(2000) testdata = df['label'][1500:] df['label'][1500:] = float('NaN') engine = Engine(df) engine.init_models(4) engine.run(1000, checkpoint=4, verbose=True) engine.convergence_plot() plt.show() _, m = engine.eval(testdata, metric=Accuracy()) print('Acuracy = %f' % (m, )) # engine.heatmap('row_similarity') # plt.show() # engine.heatmap('dependence_probability') # plt.show()