Example #1
def test_engine_with_mapper_stl(gendf):
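    # mapper just needs map's calling convention, mapper(f, args) -> results;
    # the other mapper examples below use multiprocessing's Pool.map and
    # ipyparallel's view.map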
    engine = Engine(gendf(), n_models=4,
                    mapper=lambda f, args: list(map(f, args)))
    engine.init_models()
    engine.run(2)

    assert len(engine.models) == 4
Example #2
def test_save_smoke(gendf):
    df = gendf()

    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()

    with tempfile.NamedTemporaryFile('wb') as tf:
        engine.save(tf.name)
Example #3
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    dg = DataGenerator(n_rows, ['continuous']*n_cols, cat_weights=n_cats,
                       cat_sep=cat_sep, seed=1337)
    engine = Engine(dg.df, use_mp=False)
    engine.init_models(n_models)
    engine.run(n_iter)

    return dg, engine
Example #4
def test_engine_with_mapper_mp(gendf):
    with Pool() as pool:
        engine = Engine(gendf(), n_models=4, mapper=pool.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
Example #5
def test_engine_with_mapper_ipyparallel(gendf):
    c = ipp.Client()
    v = c[:]

    engine = Engine(gendf(), n_models=4, mapper=v.map)
    engine.init_models()
    engine.run(2)

    assert len(engine.models) == 4
Example #6
def gen_comp_engines(df, subsample_size):
    engine_full = Engine(df, n_models=8, use_mp=False)
    engine_full.init_models()
    engine_full.run(100)

    engine_mod = Engine(df, n_models=8, use_mp=False)
    engine_mod.init_models(subsample_size=subsample_size)
    engine_mod.run(100)

    return engine_full, engine_mod
Example #7
def run(n_models=10, n_iter=200, iter_step=10, n_needles=2, n_distractors=8,
        n_rows=100, pairtype=None, pair_kws=None):
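    # needle pairs are columns generated to be mutually dependent; distractor
    # pairs match a needle column with an independent distractor column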

    needle_idxs = [(2*i, 2*i+1,) for i in range(n_needles)]
    needle_cols = list(range(n_needles*2))
    distractor_cols = list(range(n_needles*2, n_needles*2+n_distractors))
    combs = list(it.product(needle_cols, distractor_cols))
    distractor_idxs = random.sample(combs, min(len(combs), 32))

    df = _gen_data(n_needles, n_distractors, n_rows, pairtype, pair_kws)

    engine = Engine(df, n_models=n_models)
    engine.init_models()
    # for model in engine._models:
    #     # XXX: emulates the log grid expected alpha
    #     # e.g. mean(exp(linspace(log(1/n_rows), log(rows))))
    #     # model['state_alpha'] = .5*(n_needles*2. + n_distractors)
    #     model['state_alpha'] = 100.

    # no column_alpha transition
    tlist = [b'row_assignment', b'column_assignment', b'row_alpha',
             b'column_hypers']

    n_steps = int(n_iter/iter_step)
    needle_dps = np.zeros((n_needles, n_steps+1,))
    distractor_dps = np.zeros((len(distractor_idxs), n_steps+1,))
    for i in range(n_steps+1):
        engine.run(iter_step, trans_kwargs={'transition_list': tlist})
        # engine.run(iter_step)

        for nidx, (a, b) in enumerate(needle_idxs):
            a = df.columns[a]
            b = df.columns[b]
            needle_dps[nidx, i] = engine.dependence_probability(a, b)

        for didx, (a, b) in enumerate(distractor_idxs):
            a = df.columns[a]
            b = df.columns[b]
            distractor_dps[didx, i] = engine.dependence_probability(a, b)

    iter_count = np.cumsum([1]+[iter_step]*n_steps)

    for y in distractor_dps:
        plt.plot(iter_count, y, color='gray', alpha=.3)

    for y in needle_dps:
        plt.plot(iter_count, y, color='crimson')

    # plt.gca().set_xscale('log')
    plt.ylim([-.05, 1.05])
    plt.xlim([1, iter_count[-1]])
    plt.show()

    engine.heatmap('dependence_probability')
    plt.show()
Example #8
def test_engine_init_structureless(gendf):
    df = gendf()

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models(structureless=True)

    assert len(engine._models) == 4
    assert all([max(m['col_assignment']) == 0 for m in engine._models])
    assert all([len(m['row_assignments']) == 1 for m in engine._models])
    for m in engine._models:
        assert all([max(z) == 0 for z in m['row_assignments']])
Example #9
def test_engine_init_smoke_metadata(gendf):
    df = gendf()

    metadata = dict()
    metadata['x_2'] = {'dtype': 'categorical', 'values': [-1, 0, 1, 99]}
    metadata['x_3'] = {
        'dtype': 'categorical',
        'values': ['zero', 'one', 'two', 'three', 'four']
    }

    engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
    engine.init_models()
Example #10
def engine():
    x = np.random.randn(30)
    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)

    df = pd.concat([s1, s2] +
                   [pd.Series(np.random.rand(30)) for _ in range(10)],
                   axis=1)
    df.columns = ['c_%d' % i for i in range(12)]

    engine = Engine(df)
    engine.init_models(8)
    engine.run(20)

    return engine
Example #11
def test_view_alpha_should_change_if_transition(gendf):
    df = gendf()

    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()

    view_alpha_start = engine._models[0]['view_alphas']

    engine.run(10)

    view_alpha_end = engine._models[0]['view_alphas']

    assert view_alpha_start != view_alpha_end
Example #12
def test_view_alpha_should_not_change_if_no_transition(gendf):
    df = gendf()

    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()

    view_alpha_start = engine._models[0]['view_alphas']

    t_list = [b'row_assignment', b'column_alpha']
    engine.run(10, trans_kwargs={'transition_list': t_list})

    view_alpha_end = engine._models[0]['view_alphas']

    assert view_alpha_start == view_alpha_end
Example #13
def test_load_smoke(gendf):
    df = gendf()

    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()

    with tempfile.NamedTemporaryFile('wb') as tf:
        engine.save(tf.name)
        Engine.load(tf.name)
Example #14
def test_engine_run_smoke_multiple(gendf):
    df = gendf()

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run()
    engine.run(10)

    assert len(engine.models) == 10
Example #15
def test_run_with_checkpoint_valid_diagnostic_output(gendf):
    df = gendf()

    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()
    engine.run(10, checkpoint=5)

    tables = engine._diagnostic_tables

    assert len(tables) == 5

    for table in tables:
        assert len(table) == 3
        for entry in table:
            assert 'log_score' in entry
            assert 'iters' in entry
            assert 'time' in entry
Example #16
def test_run_on_model_subset_should_only_run_those_models(gendf):
    df = gendf()

    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()
    engine.run(10, checkpoint=5)
    engine.run(10, checkpoint=5, model_idxs=[1, 2])

    tables = engine._diagnostic_tables

    assert len(tables) == 5

    assert len(tables[0]) == 3
    assert len(tables[1]) == 5
    assert len(tables[2]) == 5
    assert len(tables[3]) == 3
    assert len(tables[4]) == 3
Example #17
def test_logp_scaling(df):
    engine = Engine(df)
    engine.init_models(8)
    engine.run(500)

    x = np.linspace(3, 7, 200)

    p_true = norm.pdf(x, loc=5., scale=.5)
    lp_baxcat = engine.probability(x[:, np.newaxis], ['t'],
                                   given=[('x', 1), ('y', 2)])

    inftest_plot(x, p_true, np.exp(lp_baxcat), 'p_t-xy', RESDIR)

    assert abs(max(p_true) - max(np.exp(lp_baxcat))) < .05
Example #18
def test_dependence_probability():
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=20, use_mp=False)
    engine.init_models()
    engine.run(10)
    depprob_01 = engine.dependence_probability('c0', 'c1')
    depprob_02 = engine.dependence_probability('c0', 'c2')
    depprob_12 = engine.dependence_probability('c1', 'c2')

    assert depprob_01 > depprob_02
    assert depprob_01 > depprob_12
Example #19
def test_save_and_load_equivalence(gendf):
    df = gendf()

    engine = Engine(df, n_models=5, use_mp=False)
    engine.init_models()

    with tempfile.NamedTemporaryFile('wb') as tf:
        engine.save(tf.name)
        new_engine = Engine.load(tf.name)

        assert engine._models == new_engine._models
        assert engine._dtypes == new_engine._dtypes
        assert engine._metadata == new_engine._metadata
        assert engine._converters == new_engine._converters
        assert engine._diagnostic_tables == new_engine._diagnostic_tables
        assert all(engine._row_names == new_engine._row_names)
        assert all(engine._col_names == new_engine._col_names)
Example #20
def onerun(shapefunc, n=250, n_iter=100, n_models=8, subsample_size=None):
    xo, yo = shapefunc(n)

    s1 = pd.Series(xo)
    s2 = pd.Series(yo)
    df = pd.concat([s1, s2], axis=1)
    df.columns = ['x', 'y']

    engine = Engine(df, n_models=n_models, use_mp=True)
    engine.init_models(subsample_size=subsample_size)
    engine.run(n_iter)

    xy = engine.sample(['x', 'y'], n=n)
    xe = xy[:, 0]
    ye = xy[:, 1]

    return xo, yo, xe, ye
Example #21
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None):

    rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9]

    true_mis = np.zeros(len(rhos))
    mis = np.zeros((n_times, len(rhos)))

    for i, rho in enumerate(rhos):
        print("Rho: %1.1f" % (rho,))

        if vartype == "categorical":
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                "x_1": {"dtype": "categorical", "values": [i for i in range(n_grid)]},
                "x_2": {"dtype": "categorical", "values": [i for i in range(n_grid)]},
            }
        elif vartype == "continuous":
            true_mi = -0.5 * log(1.0 - rho ** 2.0)
            metadata = {}
        else:
            raise ValueError("invalid vartype")

        for t in range(n_times):
            if vartype == "categorical":
                x = _sample_from_bivariate_discrete(p, n)
            elif vartype == "continuous":
                sigma = np.array([[1, rho], [rho, 1]])
                mu = np.zeros(2)
                x = np.random.multivariate_normal(mu, sigma, size=n)
            else:
                raise ValueError("invalid vartype")

            df = pd.DataFrame(x, columns=["x_1", "x_2"])

            engine = Engine(df, n_models=1, metadata=metadata, use_mp=False)
            engine.init_models()
            engine.run(n_iter)

            true_mis[i] = true_mi
            mis[t, i] = engine.mutual_information("x_1", "x_2", n_samples=500, normed=False)

    if ax is not None:
        ax.errorbar(rhos, y=np.mean(mis, axis=0), yerr=np.std(mis, axis=0), label="BaxCat")
        ax.plot(rhos, true_mis, label="True")

        ax.set_xlabel("rho")
        ax.set_ylabel("Mutual Information")
        ax.set_title(vartype)
        ax.legend(loc=0)
    else:
        return mis, true_mis
Example #22
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    dg = DataGenerator(n_rows, ['categorical'] * n_cols,
                       cat_weights=n_cats,
                       cat_sep=cat_sep,
                       seed=1337)
    col_md = {'dtype': 'categorical', 'values': [0, 1, 2, 3, 4]}
    md = {col: col_md for col in range(n_cols)}
    engine = Engine(dg.df, metadata=md, use_mp=False)
    engine.init_models(n_models)
    engine.run(n_iter)

    return dg, engine
Example #23
def test_pairwise_dependence_probability():
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run(5)

    depprob = engine.pairwise_func('dependence_probability')
    # pandas removed DataFrame.ix; .iloc gives the same positional lookup here
    assert depprob.iloc[0, 0] == 1.
    assert depprob.iloc[1, 1] == 1.
    assert depprob.iloc[2, 2] == 1.

    assert depprob.iloc[0, 1] == depprob.iloc[1, 0]
    assert depprob.iloc[0, 2] == depprob.iloc[2, 0]
    assert depprob.iloc[1, 2] == depprob.iloc[2, 1]
Example #24
def test_row_similarity_wrt():
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()

    engine._models[0]['col_assignment'] = [0, 0, 1]
    engine._models[1]['col_assignment'] = [0, 0, 1]
    engine._models[2]['col_assignment'] = [0, 0, 1]
    engine._models[3]['col_assignment'] = [0, 0, 1]

    engine._models[0]['row_assignments'] = [[0] + [1]*29, [0]*30]
    engine._models[1]['row_assignments'] = [[0] + [1]*29, [0]*30]
    engine._models[2]['row_assignments'] = [[1]*29 + [0], [0]*30]
    engine._models[3]['row_assignments'] = [[1]*29 + [0], [0]*30]
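    # wrt 'c0' (view 0): rows 0 and 1 share a category in 2 of the 4 models
    # wrt 'c2' (view 1): every model puts all 30 rows in a single category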

    assert engine.row_similarity(0, 1, wrt=['c0']) == .5
    assert engine.row_similarity(0, 1, wrt=['c2']) == 1.
Example #25
def test_engine_init_from_filename(gendf):
    df = gendf()
    with NamedTemporaryFile() as tf:
        df.to_csv(tf.name)
        engine = Engine(tf.name, n_models=1, use_mp=False)
        engine.init_models()
Example #26
        28,
        28,
    ))
    return pixels


assert __name__ == "__main__"

exdir = os.path.dirname(os.path.realpath(__file__))

df = pd.read_csv(os.path.join(exdir, 'mnist.csv.gz'), compression='gzip')
df = df.sample(2000)
# hold out the last 500 labels for evaluation, then blank them in the table;
# chained assignment (df['label'][1500:] = ...) is unreliable in pandas
testdata = df['label'].iloc[1500:]
df.iloc[1500:, df.columns.get_loc('label')] = float('nan')

engine = Engine(df)
engine.init_models(4)
engine.run(1000, checkpoint=4, verbose=True)

engine.convergence_plot()
plt.show()

_, m = engine.eval(testdata, metric=Accuracy())
print('Accuracy = %f' % (m,))

# engine.heatmap('row_similarity')
# plt.show()

# engine.heatmap('dependence_probability')
# plt.show()
Example #27
def gen_engine_half(df):
    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models(subsample_size=0.5)
    engine.run(10)

    return engine
Example #28
import os
from math import exp

import pandas as pd
import matplotlib.pyplot as plt

from baxcat.engine import Engine


assert __name__ == "__main__"

exdir = os.path.dirname(os.path.realpath(__file__))

df = pd.read_csv(os.path.join(exdir, 'animals.csv'), index_col=0)


# Let's create our engine. We'll just pass in the data and let baxcat decide
# how to model each column.
engine = Engine(df, n_models=32)

# We can see how baxcat decided to model each column by checking `col_info`
col_info = engine.col_info()
print(col_info)

# To do inference, we initialize some cross-categorization states with
# `init_models` and then `run` the inference. We initialize many models to
# hedge the inferences we make: every model is a draw from the posterior, and
# we want to make inferences about the data given the posterior distribution
# over states, so we take several models.
print('Initializing 32 models...')
engine.init_models()
print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)
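
# From here, other examples in this collection query the trained engine, e.g.
# engine.heatmap('dependence_probability'); plt.show()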
Example #29
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from baxcat.engine import Engine

x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100)*3,
    np.random.randn(100) + 6,))

s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)
y = engine.sample('x', n=300)

plt.subplot(1, 2, 1)
# sns.distplot is deprecated in recent seaborn; histplot is its replacement
sns.histplot(x, bins=30, stat='density', kde=True, label='original')
sns.histplot(y, bins=30, stat='density', kde=True, label='model')
plt.xlim([-10, 10])

engine_sub = Engine(df, n_models=8)
engine_sub.init_models(subsample_size=.5)
engine_sub.run(100)
y = engine_sub.sample('x', n=300)

plt.subplot(1, 2, 2)
Example #30
def gen_engine_full(df):
    engine = Engine(df, n_models=4, use_mp=False)
    engine.init_models()
    engine.run(10)

    return engine
Example #31
def test_engine_init_smoke_default(gendf):
    df = gendf()
    engine = Engine(df, n_models=1, use_mp=False)
    engine.init_models()
Example #32
        fname = func.__name__
        s = pd.Series([fname] * n)
        df = pd.concat([s, pd.Series(xo), pd.Series(yo)], axis=1)
        df.columns = ['func', 'x', 'y']
        dfs.append(df)

        ax = axes[0, i]
        ax.scatter(xo, yo, color='crimson', alpha=.3)

        ax = axes[1, i]
        ax.scatter(xe, ye, color='gray', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())

    df = pd.concat(dfs, ignore_index=True)
    engine = Engine(df, n_models=8)
    engine.init_models()
    engine.run(1000, checkpoint=20)

    dfs = []
    for i, func in enumerate(funcs):
        func_name = func.__name__
        x = engine.sample(['x', 'y'], given=[('func', func_name)], n=n)

        ax = axes[2, i]
        ax.scatter(x[:, 0], x[:, 1], color='navy', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())
    plt.show()
Example #33
def gen_engine(df):
    engine = Engine(df, n_models=2, use_mp=False)
    engine.init_models()
    engine.run(10)
    # print(engine.col_info())
    return engine
Example #34

def gen_mixture_data(n, mprop=.1):
    """ Generate 2-feature mixture data """
    x = np.zeros((n, 2,))
    weights = [0.3, 0.7]
    mu = [-1.0, 3.0]
    for i in range(n):
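        # pflip draws a mixture component index according to `weights`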
        k = pflip(weights)
        m = mu[k]
        x[i, :] = np.random.normal(m, size=2)

    df = pd.DataFrame(x)
    df.columns = ['x_1', 'x_2']

    return df


df = gen_mixture_data(200)

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

# find the index of the value in x_1 closest to 3.0
amin = np.argmin(np.abs(df['x_1'] - 3.0))

resimp = engine.impute('x_1', [amin])

print(resimp)
Example #35
s_a1 = pd.Series(np.zeros(n, dtype=int))
s_a2 = pd.Series(np.random.randn(n) - 2.)

s_b1 = pd.Series(np.ones(n, dtype=int))
s_b2 = pd.Series(np.random.randn(n) + 2.)


df = pd.concat([pd.concat([s_a1, s_a2], axis=1),
                pd.concat([s_b1, s_b2], axis=1)], axis=0)
assert df.shape == (2*n, 2,)

df.columns = ['label', 'x']


engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

x = np.linspace(-6., 6., 200)[np.newaxis].T

p_01 = np.exp(engine.probability(x, ['x']))
p_0 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 0,)]))
p_1 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 1,)]))
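# with equal label weights, p(x) = .5*p(x|label=0) + .5*p(x|label=1), so the
# dashed p(x) curve should match the mixture of the two conditionals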

plt.figure(figsize=(4, 4,))
plt.hist(df['x'], 31, histtype='stepfilled', color='#aaaaaa', edgecolor='None',
         density=True)
plt.plot(x.flatten(), p_0, label='p(x|label=0)')
plt.plot(x.flatten(), p_1, label='p(x|label=1)')
plt.plot(x.flatten(), p_01, ls='--', label='p(x)')