Code example #1
def test_two_views_column_partition_normal__ci_(lovecat):
    D = retrieve_normal_dataset()

    engine = Engine(D.T,
                    outputs=[5, 0, 1, 2, 3, 4],
                    cctypes=['normal'] * len(D),
                    rng=gu.gen_rng(12),
                    num_states=64)

    if lovecat:
        engine.transition_lovecat(N=200)
    else:
        engine.transition(N=200)

    P = engine.dependence_probability_pairwise()
    R1 = engine.row_similarity_pairwise(cols=[5, 0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3, 4])

    pu.plot_clustermap(P)
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    P_THEORY = [
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]
    return engine
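Note that P_THEORY above is constructed but never compared against P. A hedged sketch of the missing check (it assumes P comes back as a 6x6 matrix of posterior dependence probabilities; the 0.5 threshold is an assumption, not part of the original test):

import numpy as np

def matches_theory(P, P_THEORY, threshold=.5):
    # Threshold the inferred dependence probabilities and compare them
    # with the expected block-diagonal structure.
    return np.array_equal(np.asarray(P) > threshold,
                          np.asarray(P_THEORY, dtype=bool))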
Code example #2
def state():
    # Create an engine.
    engine = Engine(DATA,
                    cctypes=['normal', 'categorical'],
                    distargs=[None, {'k': 6}],
                    num_states=4,
                    rng=gu.gen_rng(212))
    engine.transition(N=15)
    marginals = engine.logpdf_score()
    # Rank the states by marginal score, best first, and return the top state.
    ranking = np.argsort(marginals)[::-1]
    return engine.get_state(ranking[0])
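A minimal usage sketch; calling .plot() on a single state follows the pattern in code example #8 below:

best = state()   # state with the highest logpdf_score
best.plot()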
Code example #3
def test_logpdf_score_crash():
    rng = gen_rng(8)
    # T = rng.choice([0,1], p=[.3,.7], size=250).reshape(-1,1)
    T = rng.normal(size=30).reshape(-1, 1)
    engine = Engine(T, cctypes=['normal'], rng=rng, num_states=4)
    logpdf_likelihood_initial = np.asarray(engine.logpdf_likelihood())
    logpdf_score_initial = np.asarray(engine.logpdf_score())
    # The score adds (negative) log prior terms to the likelihood,
    # so it is strictly smaller.
    assert np.all(logpdf_score_initial < logpdf_likelihood_initial)
    engine.transition(N=100)
    engine.transition(kernels=['column_hypers', 'view_alphas'], N=10)
    logpdf_likelihood_final = np.asarray(engine.logpdf_likelihood())
    logpdf_score_final = np.asarray(engine.logpdf_score())
    assert np.all(logpdf_score_final < logpdf_likelihood_final)
    assert np.max(logpdf_score_initial) < np.max(logpdf_score_final)
Code example #4
def launch_analysis():
    engine = Engine(animals.values.astype(float),
                    num_states=64,
                    cctypes=['categorical'] * len(animals.values[0]),
                    distargs=[{'k': 2}] * len(animals.values[0]),
                    rng=gu.gen_rng(7))

    engine.transition(N=900)
    # Pickle files must be opened in binary mode.
    with open('resources/animals/animals.engine', 'wb') as f:
        engine.to_pickle(f)

    with open('resources/animals/animals.engine', 'rb') as f:
        engine = Engine.from_pickle(f)
    D = engine.dependence_probability_pairwise()
    pu.plot_clustermap(D)
Code example #5
def test_two_views_row_partition_bernoulli__ci_(lovecat):
    D = retrieve_bernoulli_dataset()

    if lovecat:
        engine = Engine(D.T,
                        cctypes=['categorical'] * len(D),
                        distargs=[{'k': 2}] * len(D),
                        Zv={0: 0, 1: 0, 2: 1, 3: 1},
                        rng=gu.gen_rng(12),
                        num_states=64)
        engine.transition_lovecat(N=100,
                                  kernels=[
                                      'row_partition_assignments',
                                      'row_partition_hyperparameters',
                                      'column_hyperparameters',
                                  ])
    else:
        engine = Engine(D.T,
                        cctypes=['bernoulli'] * len(D),
                        Zv={0: 0, 1: 0, 2: 1, 3: 1},
                        rng=gu.gen_rng(12),
                        num_states=64)
        engine.transition(N=100,
                          kernels=[
                              'view_alphas',
                              'rows',
                              'column_hypers',
                          ])

    R1 = engine.row_similarity_pairwise(cols=[0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3])

    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)
    return engine
Code example #6
File: recover.py Project: wilsondy/cgpm
def run_test(args):
    n_rows = args["num_rows"]
    n_iters = args["num_iters"]
    n_chains = args["num_chains"]

    n_per_chain = int(float(n_rows) / n_chains)

    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 9))
    axes = axes.ravel()
    k = 0
    for shape in shapes:
        print "Shape: %s" % shape
        T_o = np.asarray(gen_function[shape](n_rows))
        T_i = []

        engine = Engine(T_o.T,
                        cctypes=cctypes,
                        distargs=distargs,
                        num_states=n_chains)
        engine.transition(N=n_iters)

        for chain in range(n_chains):
            state = engine.get_state(chain)
            print("chain %i of %i" % (chain + 1, n_chains))
            T_i.extend(state.simulate(-1, [0, 1], N=n_per_chain))

        T_i = np.array(T_i)

        ax = axes[k]
        ax.scatter(T_o[0], T_o[1], color='blue', edgecolor='none')
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        ax.set_title("%s original" % shape)
        # Remember the axis limits of the original data ...
        xlim, ylim = ax.get_xlim(), ax.get_ylim()

        ax = axes[k + 4]
        ax.scatter(T_i[:, 0], T_i[:, 1], color='red', edgecolor='none')
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        # ... and reuse them so the simulated panel is directly comparable.
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        ax.set_title("%s simulated" % shape)

        k += 1

    print "Done."
    return fig
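A minimal invocation sketch with hypothetical argument values (shapes, gen_function, cctypes, and distargs are module-level globals of recover.py assumed to be in scope):

fig = run_test({'num_rows': 200, 'num_iters': 50, 'num_chains': 4})
fig.savefig('recover.png')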
Code example #7
def get_engine():
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, Zv, Zc = tu.gen_data_table(20, [1], [[.25, .25, .5]],
                                  cctypes,
                                  distargs, [.95] * len(cctypes),
                                  rng=gu.gen_rng(0))
    T = T.T
    # Make some nan cells for evidence.
    T[5, 0] = T[5, 1] = T[5, 2] = T[5, 3] = np.nan
    T[8, 4] = np.nan
    engine = Engine(T,
                    cctypes=cctypes,
                    distargs=distargs,
                    num_states=6,
                    rng=gu.gen_rng(0))
    engine.transition(N=2)
    return engine
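The nan cells above are natural targets for posterior queries. A hedged sketch: the row/column indices mirror the nan cells set in get_engine, and simulate with an observed rowid appears in code example #9 below:

engine = get_engine()
# Impute the four missing cells of row 5, conditioned on its observed values.
samples = engine.simulate(5, [0, 1, 2, 3], N=10)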
Code example #8
def generate_gpmcc_posteriors(cctype, distargs, D_train, iters, seconds):
    """Learns gpmcc on D_train for seconds and simulates NUM_TEST times."""
    # Learning and posterior simulation.
    engine = Engine(D_train,
                    cctypes=[cctype],
                    distargs=[distargs],
                    num_states=64,
                    rng=gu.gen_rng(1))
    engine.transition(N=iters, S=seconds, progress=0)
    if iters:
        kernel = ('column_params' if cu.cctype_class(cctype).is_conditional()
                  else 'column_hypers')
        engine.transition(N=100, kernels=[kernel], progress=0)
    samples = engine.simulate(-1, [0], N=NUM_TEST)
    marginals = engine.logpdf_score()
    ranking = np.argsort(marginals)[::-1]
    for r in ranking[:5]:
        engine.get_state(r).plot()
    return [samples[i] for i in ranking[:5]]
Code example #9
File: test_bernoulli.py Project: wilsondy/cgpm
def test_bernoulli():
    # Switch for multiprocess (0 is faster).
    multiprocess = 0

    # Create categorical data of DATA_NUM_0 zeros and DATA_NUM_1 ones.
    data = np.transpose(np.array([[0] * DATA_NUM_0 + [1] * DATA_NUM_1]))

    # Run a single chain for a few iterations.
    engine = Engine(data,
                    cctypes=['categorical'],
                    distargs=[{'k': 2}],
                    rng=gu.gen_rng(0),
                    multiprocess=multiprocess)
    engine.transition(NUM_ITER, multiprocess=multiprocess)

    # Simulate from hypothetical row and compute the proportion of ones.
    sample = engine.simulate(-1, [0], N=NUM_SIM, multiprocess=multiprocess)[0]
    sum_b = sum(s[0] for s in sample)
    observed_prob_of_1 = (float(sum_b) / float(NUM_SIM))
    true_prob_of_1 = float(DATA_NUM_1) / float(DATA_NUM_0 + DATA_NUM_1)
    # Check 10% relative match (rtol=.1).
    assert np.allclose(true_prob_of_1, observed_prob_of_1, rtol=.1)

    # Simulate from observed row as a crash test.
    sample = engine.simulate(1, [0], N=1, multiprocess=multiprocess)

    # Ensure normalized unobserved probabilities.
    p0_uob = engine.logpdf(-1, {0: 0}, multiprocess=multiprocess)[0]
    p1_uob = engine.logpdf(-1, {0: 1}, multiprocess=multiprocess)[0]
    assert np.allclose(gu.logsumexp([p0_uob, p1_uob]), 0)

    # A logpdf query constraining an observed cell raises an error.
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 0}, multiprocess=multiprocess)
    with pytest.raises(ValueError):
        engine.logpdf(1, {0: 1}, multiprocess=multiprocess)
Code example #10
File: test_cmi.py Project: wilsondy/cgpm
def test_entropy_bernoulli_univariate__ci_():
    rng = gen_rng(10)

    # Generate a univariate Bernoulli dataset.
    T = rng.choice([0,1], p=[.3,.7], size=250).reshape(-1,1)

    engine = Engine(T, cctypes=['bernoulli'], rng=rng, num_states=16)
    engine.transition(S=15)

    # Exact computation.
    entropy_exact = - (.3*np.log(.3) + .7*np.log(.7))

    # logpdf computation.
    logps = engine.logpdf_bulk([-1,-1], [{0:0}, {0:1}])
    entropy_logpdf = [-np.sum(np.exp(logp)*logp) for logp in logps]

    # mutual_information computation.
    entropy_mi = engine.mutual_information([0], [0], N=1000)

    # Skip a formal CLT analysis; check agreement to about one decimal place.
    assert np.allclose(entropy_exact, entropy_logpdf, atol=.1)
    assert np.allclose(entropy_exact, entropy_mi, atol=.1)
    assert np.allclose(entropy_logpdf, entropy_mi, atol=.05)
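For reference, the exact entropy computed above is -(.3*ln .3 + .7*ln .7) ≈ 0.611 nats, so the atol=.1 tolerances amount to roughly one decimal place of agreement.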
Code example #11
def test_two_views_row_partition_normal__ci_(lovecat):
    D = retrieve_normal_dataset()

    engine = Engine(D.T,
                    cctypes=['normal'] * len(D),
                    Zv={0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1},
                    rng=gu.gen_rng(12),
                    num_states=64)

    if lovecat:
        engine.transition_lovecat(N=100,
                                  kernels=[
                                      'row_partition_assignments',
                                      'row_partition_hyperparameters',
                                      'column_hyperparameters',
                                  ])
    else:
        engine.transition(N=100,
                          kernels=[
                              'view_alphas',
                              'rows',
                              'column_hypers',
                          ])

    R1 = engine.row_similarity_pairwise(cols=[0, 1, 2])
    R2 = engine.row_similarity_pairwise(cols=[3, 4, 5])

    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)
    return engine
Code example #12
def test_two_views_column_partition_bernoulli__ci_(lovecat):
    D = retrieve_bernoulli_dataset()

    engine = Engine(D.T,
                    cctypes=['categorical'] * len(D),
                    distargs=[{'k': 2}] * len(D),
                    rng=gu.gen_rng(12),
                    num_states=64)
    if lovecat:
        engine.transition_lovecat(N=200)
    else:
        # The non-lovecat path could equivalently construct the Engine
        # with cctypes=['bernoulli'] * len(D).
        engine.transition(N=200)

    P = engine.dependence_probability_pairwise()
    R1 = engine.row_similarity_pairwise(cols=[0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3])

    pu.plot_clustermap(P)
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)

    # Expected block structure over the four bernoulli columns.
    P_THEORY = [
        [1, 1, 0, 0],
        [1, 1, 0, 0],
        [0, 0, 1, 1],
        [0, 0, 1, 1],
    ]
    return engine
Code example #13
def test_incorporate_engine():
    engine = Engine(
        T[:,:2],
        cctypes=CCTYPES[:2],
        distargs=DISTARGS[:2],
        num_states=4,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=5)

    # Incorporate a new dim with a non-contiguous output.
    engine.incorporate_dim(
        T[:,2],
        outputs=[10],
        cctype=CCTYPES[2],
        distargs=DISTARGS[2]
    )
    engine.transition(N=2)

    # Serialize the engine, then run a targeted transition on variable 10.
    m = engine.to_metadata()
    engine2 = Engine.from_metadata(m)
    engine2.transition(N=2, cols=[10], multiprocess=0)
    assert all(s.outputs == [0,1,10] for s in engine.states)
Code example #14
# Imports assumed by this script (module paths follow the cgpm package layout).
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from cgpm.crosscat.engine import Engine
from cgpm.utils import config as cu

N_ROWS = 300
N_STATES = 12
N_ITERS = 100

cctypes = ['categorical(k={})'.format(N_ROWS)] + ['normal']*8
cctypes, distargs = cu.parse_distargs(cctypes)
column_names = ['id'] + ['one cluster']*4 + ['four cluster']*4

# id column.
X = np.zeros((N_ROWS, 9))
X[:,0] = np.arange(N_ROWS)

# Four columns of one cluster from the standard normal.
X[:,1:5] = np.random.randn(N_ROWS, 4)

# Four columns of four clusters with unit variance and means 4*Z for Z in {0, 1, 2, 3}.
Z = np.random.randint(4, size=(N_ROWS))
X[:,5:] = 4*np.reshape(np.repeat(Z,4), (len(Z),4)) + np.random.randn(N_ROWS, 4)

# Inference.
engine = Engine(
    X, cctypes=cctypes, distargs=distargs, num_states=N_STATES)
engine.transition(N=N_ITERS)

# Dependence probability.
D = engine.dependence_probability_pairwise()
zmat = sns.clustermap(D, yticklabels=column_names, xticklabels=column_names)
plt.setp(zmat.ax_heatmap.get_yticklabels(), rotation=0)
plt.setp(zmat.ax_heatmap.get_xticklabels(), rotation=90)
plt.show()