def test_row_similarity_wrt():
    x = np.random.randn(30)
    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()

    # Force the same structure on every model: c0 and c1 share a view, c2
    # gets its own view.
    engine._models[0]['col_assignment'] = [0, 0, 1]
    engine._models[1]['col_assignment'] = [0, 0, 1]
    engine._models[2]['col_assignment'] = [0, 0, 1]
    engine._models[3]['col_assignment'] = [0, 0, 1]

    # In the first view, rows 0 and 1 share a category in two of the four
    # models; in the second view they share a category in all models.
    engine._models[0]['row_assignments'] = [[0] + [1]*29, [0]*30]
    engine._models[1]['row_assignments'] = [[0] + [1]*29, [0]*30]
    engine._models[2]['row_assignments'] = [[1]*29 + [0], [0]*30]
    engine._models[3]['row_assignments'] = [[1]*29 + [0], [0]*30]

    assert engine.row_similarity(0, 1, wrt=['c0']) == .5
    assert engine.row_similarity(0, 1, wrt=['c2']) == 1.
def test_engine_with_mapper_stl(gendf):
    engine = Engine(gendf(), n_models=4,
                    mapper=lambda f, args: list(map(f, args)))
    engine.init_models()
    engine.run(2)

    assert len(engine.models) == 4
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    dg = DataGenerator(n_rows, ['continuous']*n_cols, cat_weights=n_cats,
                       cat_sep=cat_sep, seed=1337)
    engine = Engine(dg.df, use_mp=False)
    engine.init_models(n_models)
    engine.run(n_iter)

    return dg, engine
def test_save_smoke(gendf):
    df = gendf()
    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()

    with tempfile.NamedTemporaryFile('wb') as tf:
        engine.save(tf.name)
def test_engine_run_smoke_multiple(gendf):
    df = gendf()
    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run()
    engine.run(10)

    assert len(engine.models) == 10
def test_engine_with_mapper_mp(gendf):
    # Create the pool once, inside the context manager, so it is closed
    # when the test finishes.
    with Pool() as pool:
        engine = Engine(gendf(), n_models=4, mapper=pool.map)
        engine.init_models()
        engine.run(2)

    assert len(engine.models) == 4
def test_engine_with_mapper_ipyparallel(gendf):
    c = ipp.Client()
    v = c[:]

    engine = Engine(gendf(), n_models=4, mapper=v.map)
    engine.init_models()
    engine.run(2)

    assert len(engine.models) == 4
def gen_comp_engines(df, subsample_size):
    engine_full = Engine(df, n_models=8, use_mp=False)
    engine_full.init_models()
    engine_full.run(100)

    engine_mod = Engine(df, n_models=8, use_mp=False)
    engine_mod.init_models(subsample_size=subsample_size)
    engine_mod.run(100)

    return engine_full, engine_mod
def run(n_models=10, n_iter=200, iter_step=10, n_needles=2, n_distractors=8,
        n_rows=100, pairtype=None, pair_kws=None):
    needle_idxs = [(2*i, 2*i+1,) for i in range(n_needles)]
    needle_cols = list(range(n_needles*2))
    distractor_cols = list(range(n_needles*2, n_needles*2+n_distractors))

    combs = list(it.product(needle_cols, distractor_cols))
    distractor_idxs = random.sample(combs, min(len(combs), 32))

    df = _gen_data(n_needles, n_distractors, n_rows, pairtype, pair_kws)

    engine = Engine(df, n_models=n_models)
    engine.init_models()

    # for model in engine._models:
    #     # XXX: emulates the log grid expected alpha
    #     # e.g. mean(exp(linspace(log(1/n_rows), log(rows))))
    #     # model['state_alpha'] = .5*(n_needles*2. + n_distractors)
    #     model['state_alpha'] = 100.

    # no column_alpha transition
    tlist = [b'row_assignment', b'column_assignment', b'row_alpha',
             b'column_hypers']

    n_steps = int(n_iter/iter_step)
    needle_dps = np.zeros((n_needles, n_steps+1,))
    distractor_dps = np.zeros((len(distractor_idxs), n_steps+1,))
    for i in range(n_steps+1):
        engine.run(iter_step, trans_kwargs={'transition_list': tlist})
        # engine.run(iter_step)
        for nidx, (a, b) in enumerate(needle_idxs):
            a = df.columns[a]
            b = df.columns[b]
            needle_dps[nidx, i] = engine.dependence_probability(a, b)

        for didx, (a, b) in enumerate(distractor_idxs):
            a = df.columns[a]
            b = df.columns[b]
            distractor_dps[didx, i] = engine.dependence_probability(a, b)

    iter_count = np.cumsum([1] + [iter_step]*n_steps)
    for y in distractor_dps:
        plt.plot(iter_count, y, color='gray', alpha=.3)
    for y in needle_dps:
        plt.plot(iter_count, y, color='crimson')

    # plt.gca().set_xscale('log')
    plt.ylim([-.05, 1.05])
    plt.xlim([1, iter_count[-1]])
    plt.show()

    engine.heatmap('dependence_probability')
    plt.show()
def test_engine_init_structureless(gendf):
    df = gendf()
    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models(structureless=True)

    assert len(engine._models) == 4
    assert all([max(m['col_assignment']) == 0 for m in engine._models])
    assert all([len(m['row_assignments']) == 1 for m in engine._models])
    for m in engine._models:
        assert all([max(z) == 0 for z in m['row_assignments']])
def test_engine_init_smoke_metadata(gendf):
    df = gendf()

    metadata = dict()
    metadata['x_2'] = {
        'dtype': 'categorical',
        'values': [-1, 0, 1, 99]}
    metadata['x_3'] = {
        'dtype': 'categorical',
        'values': ['zero', 'one', 'two', 'three', 'four']}

    engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
    engine.init_models()
def test_view_alpha_should_change_if_transition(gendf):
    df = gendf()
    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()

    view_alpha_start = engine._models[0]['view_alphas']
    engine.run(10)
    view_alpha_end = engine._models[0]['view_alphas']

    assert view_alpha_start != view_alpha_end
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None):
    rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9]

    true_mis = np.zeros(len(rhos))
    mis = np.zeros((n_times, len(rhos)))
    for i, rho in enumerate(rhos):
        print("Rho: %1.1f" % (rho,))
        if vartype == "categorical":
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                "x_1": {"dtype": "categorical",
                        "values": [i for i in range(n_grid)]},
                "x_2": {"dtype": "categorical",
                        "values": [i for i in range(n_grid)]},
            }
        elif vartype == "continuous":
            true_mi = -0.5 * log(1.0 - rho ** 2.0)
            metadata = {}
        else:
            raise ValueError("invalid vartype")

        for t in range(n_times):
            if vartype == "categorical":
                x = _sample_from_bivariate_discrete(p, n)
            elif vartype == "continuous":
                sigma = np.array([[1, rho], [rho, 1]])
                mu = np.zeros(2)
                x = np.random.multivariate_normal(mu, sigma, size=n)
            else:
                raise ValueError("invalid vartype")

            df = pd.DataFrame(x, columns=["x_1", "x_2"])

            engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            true_mis[i] = true_mi
            mis[t, i] = engine.mutual_information("x_1", "x_2", n_samples=500,
                                                  normed=False)

    if ax is not None:
        ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0),
                    label="BaxCat")
        ax.plot(rhos, true_mis, label="True")
        ax.set_xlabel("rho")
        ax.set_ylabel("Mutual Information")
        ax.set_title(vartype)
        ax.legend(loc=0)
    else:
        return mis, true_mis
def test_view_alpha_should_not_change_if_no_transition(gendf):
    df = gendf()
    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()

    view_alpha_start = engine._models[0]['view_alphas']

    t_list = [b'row_assignment', b'column_alpha']
    engine.run(10, trans_kwargs={'transition_list': t_list})
    view_alpha_end = engine._models[0]['view_alphas']

    assert view_alpha_start == view_alpha_end
def test_logp_scaling(df):
    engine = Engine(df)
    engine.init_models(8)
    engine.run(500)

    x = np.linspace(3, 7, 200)
    p_true = norm.pdf(x, loc=5., scale=.5)
    lp_baxcat = engine.probability(x[:, np.newaxis], ['t'],
                                   given=[('x', 1), ('y', 2)])

    inftest_plot(x, p_true, np.exp(lp_baxcat), 'p_t-xy', RESDIR)

    assert abs(max(p_true) - max(np.exp(lp_baxcat))) < .05
def engine():
    x = np.random.randn(30)
    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)

    df = pd.concat([s1, s2] + [pd.Series(np.random.rand(30))
                               for _ in range(10)], axis=1)
    df.columns = ['c_%d' % i for i in range(12)]

    engine = Engine(df)
    engine.init_models(8)
    engine.run(20)

    return engine
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    dg = DataGenerator(n_rows, ['categorical']*n_cols, cat_weights=n_cats,
                       cat_sep=cat_sep, seed=1337)

    col_md = {'dtype': 'categorical', 'values': [0, 1, 2, 3, 4]}
    md = dict((col, col_md,) for col in range(n_cols))

    engine = Engine(dg.df, metadata=md, use_mp=False)
    engine.init_models(n_models)
    engine.run(n_iter)

    return dg, engine
def test_save_and_load_equivalence(gendf):
    df = gendf()
    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()

    with tempfile.NamedTemporaryFile('wb') as tf:
        engine.save(tf.name)
        new_engine = Engine.load(tf.name)

        assert engine._models == new_engine._models
        assert engine._dtypes == new_engine._dtypes
        assert engine._metadata == new_engine._metadata
        assert engine._converters == new_engine._converters
        assert engine._diagnostic_tables == new_engine._diagnostic_tables
        assert all(engine._row_names == new_engine._row_names)
        assert all(engine._col_names == new_engine._col_names)
def test_run_with_checkpoint_valid_diagnostic_output(gendf):
    df = gendf()
    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()
    engine.run(10, checkpoint=5)

    tables = engine._diagnostic_tables

    assert len(tables) == 5
    for table in tables:
        assert len(table) == 3
        for entry in table:
            assert 'log_score' in entry
            assert 'iters' in entry
            assert 'time' in entry
def onerun(shapefunc, n=250, n_iter=100, n_models=8, subsample_size=None):
    xo, yo = shapefunc(n)

    s1 = pd.Series(xo)
    s2 = pd.Series(yo)
    df = pd.concat([s1, s2], axis=1)
    df.columns = ['x', 'y']

    engine = Engine(df, n_models=n_models, use_mp=True)
    engine.init_models(subsample_size=subsample_size)
    engine.run(n_iter)

    xy = engine.sample(['x', 'y'], n=n)
    xe = xy[:, 0]
    ye = xy[:, 1]

    return xo, yo, xe, ye
def test_run_on_model_subset_should_only_run_those_models(gendf):
    df = gendf()
    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()
    engine.run(10, checkpoint=5)
    engine.run(10, checkpoint=5, model_idxs=[1, 2])

    tables = engine._diagnostic_tables

    assert len(tables) == 5
    assert len(tables[0]) == 3
    assert len(tables[1]) == 5
    assert len(tables[2]) == 5
    assert len(tables[3]) == 3
    assert len(tables[4]) == 3
def test_dependence_probability():
    x = np.random.randn(30)
    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=20, use_mp=False)
    engine.init_models()
    engine.run(10)

    depprob_01 = engine.dependence_probability('c0', 'c1')
    depprob_02 = engine.dependence_probability('c0', 'c2')
    depprob_12 = engine.dependence_probability('c1', 'c2')

    assert depprob_01 > depprob_02
    assert depprob_01 > depprob_12
def test_pairwise_dependence_probability():
    x = np.random.randn(30)
    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run(5)

    depprob = engine.pairwise_func('dependence_probability')

    assert depprob.ix[0, 0] == 1.
    assert depprob.ix[1, 1] == 1.
    assert depprob.ix[2, 2] == 1.
    assert depprob.ix[0, 1] == depprob.ix[1, 0]
    assert depprob.ix[0, 2] == depprob.ix[2, 0]
    assert depprob.ix[1, 2] == depprob.ix[2, 1]
def gen_engine_full(df):
    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()
    engine.run(10)
    return engine
def gen_engine_half(df):
    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models(subsample_size=0.5)
    engine.run(10)
    return engine
def gen_engine(df):
    engine = Engine(df, n_models=2, use_mp=False)
    engine.init_models()
    engine.run(10)
    # print(engine.col_info())
    return engine
s_a1 = pd.Series(np.zeros(n, dtype=int))
s_a2 = pd.Series(np.random.randn(n) - 2.)
s_b1 = pd.Series(np.ones(n, dtype=int))
s_b2 = pd.Series(np.random.randn(n) + 2.)

df = pd.concat([pd.concat([s_a1, s_a2], axis=1),
                pd.concat([s_b1, s_b2], axis=1)], axis=0)

assert df.shape == (2*n, 2,)

df.columns = ['label', 'x']

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

x = np.linspace(-6., 6., 200)[np.newaxis].T

p_01 = np.exp(engine.probability(x, ['x']))
p_0 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 0,)]))
p_1 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 1,)]))

plt.figure(figsize=(4, 4,))
plt.hist(df['x'], 31, histtype='stepfilled', color='#aaaaaa',
         edgecolor='None', normed=True)
plt.plot(x.flatten(), p_0, label='p(x|label=0)')
plt.plot(x.flatten(), p_1, label='p(x|label=1)')
plt.plot(x.flatten(), p_01, ls='--', label='p(x)')
plt.xlabel('x')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from baxcat.engine import Engine

x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100)*3,
    np.random.randn(100) + 6,))
s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)

y = engine.sample('x', n=300)

plt.subplot(1, 2, 1)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
plt.xlim([-10, 10])

engine_sub = Engine(df, n_models=8)
engine_sub.init_models(.5)
engine_sub.run(100)

y = engine_sub.sample('x', n=300)

plt.subplot(1, 2, 2)
sns.distplot(x, bins=30, label='original')
def test_engine_init_from_filename(gendf):
    df = gendf()
    with NamedTemporaryFile() as tf:
        df.to_csv(tf.name)
        engine = Engine(tf.name, n_models=1, use_mp=False)
        engine.init_models()
def test_engine_init_smoke_default(gendf):
    df = gendf()
    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()
def row_to_img(df, row_idx):
    pixels = df.iloc[row_idx, 1:].values.reshape((28, 28))
    return pixels


if __name__ == "__main__":
    exdir = os.path.dirname(os.path.realpath(__file__))
    df = pd.read_csv(os.path.join(exdir, "mnist.csv.gz"), compression="gzip")
    df = df.sample(2000)

    # Hold out the labels of the last 500 sampled rows for evaluation.
    testdata = df["label"][1500:]
    df["label"][1500:] = float("NaN")

    engine = Engine(df)
    engine.init_models(4)
    engine.run(1000, checkpoint=4, verbose=True)

    engine.convergence_plot()
    plt.show()

    _, m = engine.eval(testdata, metric=Accuracy())
    print("Accuracy = %f" % (m,))

    # engine.heatmap('row_similarity')
    # plt.show()

    # engine.heatmap('dependence_probability')
    # plt.show()