def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D
    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(
        defn, cluster_hp, feature_hps, preprocess_data_fn)
    data = numpy_dataview(Y)
    s = initialize(nonconj_defn if nonconj else defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)
    wrapped_kernel = lambda s: kernel(s, r)
    _test_convergence(bs,
                      posterior,
                      wrapped_kernel,
                      burnin_niters,
                      skip,
                      ntries,
                      nsamples,
                      kl_places)
Example 2
def _test_convergence_bb_cxx(N,
                             D,
                             kernel,
                             preprocess_data_fn=None,
                             nonconj=False,
                             burnin_niters=10000,
                             skip=10,
                             ntries=50,
                             nsamples=1000,
                             kl_places=2):
    r = rng()
    cluster_hp = {'alpha': 2.0}
    feature_hps = [{'alpha': 1.0, 'beta': 1.0}] * D
    defn = model_definition(N, [bb] * D)
    nonconj_defn = model_definition(N, [bbnc] * D)
    Y, posterior = data_with_posterior(defn, cluster_hp, feature_hps,
                                       preprocess_data_fn)
    data = numpy_dataview(Y)
    s = initialize(nonconj_defn if nonconj else defn,
                   data,
                   cluster_hp=cluster_hp,
                   feature_hps=feature_hps,
                   r=r)
    bs = bind(s, data)
    wrapped_kernel = lambda s: kernel(s, r)
    _test_convergence(bs, posterior, wrapped_kernel, burnin_niters, skip,
                      ntries, nsamples, kl_places)
Example 3
def test_get_set_params():
    defn = model_definition(1, [bb, bnb, gp, nich])
    data = np.array([
        (True, 3, 5, 10.),
    ],
                    dtype=[('', bool), ('', int), ('', int), ('', float)])
    s = initialize(defn=defn, data=numpy_dataview(data), r=rng())
    s.set_cluster_hp({'alpha': 3.0})
    assert_dict_almost_equals(s.get_cluster_hp(), {'alpha': 3.0})
    hyperparams = [
        {
            'alpha': 1.2,
            'beta': 4.3
        },
        {
            'alpha': 1.,
            'beta': 1.,
            'r': 1
        },
        {
            'alpha': 1.,
            'inv_beta': 1.
        },
        {
            'mu': 30.,
            'kappa': 1.,
            'sigmasq': 1.,
            'nu': 1.
        },
    ]
    for i, hp in enumerate(hyperparams):
        s.set_feature_hp(i, hp)
        assert_dict_almost_equals(s.get_feature_hp(i), hp)
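
Note: the examples in this listing describe rows with numpy structured dtypes whose field names are left blank. As a point of reference, here is a minimal plain-numpy sketch (no data-microscopes dependency) showing that blank names are auto-assigned as 'f0', 'f1', ...:

import numpy as np

# Blank field names in a structured dtype are auto-assigned by numpy.
dtype = [('', bool), ('', int), ('', int), ('', float)]
data = np.array([(True, 3, 5, 10.)], dtype=dtype)
assert data.dtype.names == ('f0', 'f1', 'f2', 'f3')
assert data[0][3] == 10.0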
Example 4
def _make_one_feature_bb_mm(initialize_fn, dataview, Nk, K, alpha, beta, r):
    # XXX: the rng parameter passed does not get threaded through the
    # random *data* generation
    # use the py_bb for sampling
    py_bb = bb.py_desc()._model_module
    shared = py_bb.Shared()
    shared.load({'alpha': alpha, 'beta': beta})

    def init_sampler():
        samp = py_bb.Sampler()
        samp.init(shared)
        return samp
    samplers = [init_sampler() for _ in xrange(K)]

    def gen_cluster(samp):
        data = [(samp.eval(shared),) for _ in xrange(Nk)]
        return np.array(data, dtype=[('', bool)])
    Y_clustered = tuple(map(gen_cluster, samplers))
    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    s = initialize_fn(model_definition(Y.shape[0], [bb]),
                      view,
                      cluster_hp={'alpha': 2.},
                      feature_hps=[{'alpha': alpha, 'beta': beta}],
                      r=r,
                      assignment=assignment)
    return s, view
Example 5
def test_runner_multiprocessing_convergence():
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng)
               for _ in xrange(mp.cpu_count())]
    runners = [runner.runner(defn, view, latent, ['assign'])
               for latent in latents]
    r = parallel.runner(runners)
    r.run(r=prng, niters=1000)  # burnin
    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_iter():
        r.run(r=prng, niters=10)
        for latent in r.get_latents():
            yield idmap[tuple(permutation_canonical(latent.assignments()))]

    ref = [None]

    def sample_fn():
        if ref[0] is None:
            ref[0] = sample_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
        return sample_fn()

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100, kl_places=2)
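
Note: sample_fn above restarts its underlying generator whenever it is exhausted. The same idiom, isolated as a minimal standalone sketch (make_iter is a hypothetical stand-in for sample_iter):

def make_draw_fn(make_iter):
    ref = [None]

    def draw():
        if ref[0] is None:
            ref[0] = make_iter()
        try:
            return next(ref[0])
        except StopIteration:
            ref[0] = None
            return draw()
    return draw

draw = make_draw_fn(lambda: iter([1, 2, 3]))
assert [draw() for _ in range(5)] == [1, 2, 3, 1, 2]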
Example 6
def test_posterior_predictive_statistic():
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]
    q = ma.masked_array(
        np.array([(False,) * D], dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)])

    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge='mode')
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    statistic = query.posterior_predictive_statistic(
        q, latents, prng, merge=['mode', 'mode', 'avg', 'avg'])
    assert_equals(statistic.shape, (1,))
    assert_equals(len(statistic.dtype), D)

    q = ma.masked_array(
        np.array([(False,) * D] * 3, dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)] * 3)
    statistic = query.posterior_predictive_statistic(q, latents, prng)
    assert_equals(statistic.shape, (3,))
    assert_equals(len(statistic.dtype), D)
Example 7
def test_slice_theta_mm():
    N = 100
    data = np.array(
        [(np.random.random() < 0.8,) for _ in xrange(N)],
        dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    s = initialize(
        defn,
        view,
        cluster_hp={'alpha': 1., 'beta': 9.},
        feature_hps=[prior],
        r=r,
        assignment=[0] * N)

    heads = len([1 for y in data if y[0]])
    tails = N - heads

    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
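
Note: the test compares the slice sampler against the exact conjugate posterior: a Beta(alpha, beta) prior on p with Bernoulli observations yields a Beta(alpha + heads, beta + tails) posterior. A standalone sanity check of that update (plain numpy/scipy, independent of the test harness):

import numpy as np
from scipy.stats import beta as beta_dist

np.random.seed(0)
y = np.random.random(100) < 0.8
a, b = 1.0, 9.0
heads, tails = y.sum(), (~y).sum()
post = beta_dist(a + heads, b + tails)
# posterior mean is (a + heads) / (a + b + N)
assert abs(post.mean() - (a + heads) / (a + b + len(y))) < 1e-12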
Example 8
def _test_stress(initialize_fn, dataview, R):
    N = 20
    D = 2
    data = np.random.random(size=(N, D)) < 0.8
    Y = np.array([tuple(y) for y in data], dtype=[('', bool)] * D)
    view = dataview(Y)
    defn = model_definition(N, [bb] * D)

    s = initialize_fn(defn, view, cluster_hp={'alpha': 2.0}, r=R)

    CHANGE_GROUP = 1
    CHANGE_VALUE = 2

    nops = 100
    while nops:
        assert len(s.groups()) >= 1
        choice = np.random.choice([CHANGE_GROUP, CHANGE_VALUE])
        if choice == CHANGE_GROUP:
            # remove any empty groups. otherwise, add a new group
            egroups = s.empty_groups()
            if len(egroups) > 1:
                s.delete_group(egroups[0])
            else:
                s.create_group(R)
        else:
            eid = np.random.randint(N)
            if s.assignments()[eid] == -1:
                # add to random group
                egid = np.random.choice(s.groups())
                s.add_value(egid, eid, Y[eid], R)
            else:
                s.remove_value(eid, Y[eid], R)
        s.dcheck_consistency()
        nops -= 1
Example 9
def test_crp_empirical():
    N = 4
    alpha = 2.5
    defn = model_definition(N, [bb])
    Y = np.array([(True, )] * N, dtype=[('', bool)])
    view = numpy_dataview(Y)
    r = rng()

    def crp_score(assignment):
        latent = initialize(defn,
                            view,
                            r=r,
                            cluster_hp={'alpha': alpha},
                            assignment=assignment)
        return latent.score_assignment()

    scores = np.array(list(map(crp_score, permutation_iter(N))))
    dist = scores_to_probs(scores)
    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_fn():
        sample = permutation_canonical(_sample_crp(N, alpha))
        return idmap[tuple(sample)]

    assert_discrete_dist_approx(sample_fn, dist, ntries=100)
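
Note: _sample_crp is a helper from the test suite and is not shown here. A minimal sketch of what such a sampler could look like (a hypothetical stand-in, not the library's implementation): seat N customers sequentially, joining an existing table with probability proportional to its size and opening a new table with probability proportional to alpha.

import numpy as np

def sample_crp_sketch(N, alpha):
    assignment, counts = [], []   # counts[k] = size of table k
    for i in range(N):
        total = i + alpha
        probs = [c / total for c in counts] + [alpha / total]
        k = np.random.choice(len(probs), p=probs)
        if k == len(counts):
            counts.append(0)      # open a new table
        counts[k] += 1
        assignment.append(k)
    return assignment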
Example 10
def test_slice_theta_mm():
    N = 100
    data = np.array([(np.random.random() < 0.8, ) for _ in xrange(N)],
                    dtype=[('', bool)])
    defn = model_definition(N, [bbnc])
    r = rng()
    prior = {'alpha': 1.0, 'beta': 9.0}
    view = numpy_dataview(data)
    s = initialize(defn,
                   view,
                   cluster_hp={
                       'alpha': 1.,
                       'beta': 9.
                   },
                   feature_hps=[prior],
                   r=r,
                   assignment=[0] * N)

    heads = len([1 for y in data if y[0]])
    tails = N - heads

    alpha1 = prior['alpha'] + heads
    beta1 = prior['beta'] + tails

    bs = bind(s, view)
    params = {0: {'p': 0.05}}

    def sample_fn():
        theta(bs, r, tparams=params)
        return s.get_suffstats(0, 0)['p']

    rv = beta(alpha1, beta1)
    assert_1d_cont_dist_approx_sps(sample_fn, rv, nsamples=50000)
Example 11
def _make_one_feature_bb_mm(initialize_fn, dataview, Nk, K, alpha, beta, r):
    # XXX: the rng parameter passed does not get threaded through the
    # random *data* generation
    # use the py_bb for sampling
    py_bb = bb.py_desc()._model_module
    shared = py_bb.Shared()
    shared.load({'alpha': alpha, 'beta': beta})

    def init_sampler():
        samp = py_bb.Sampler()
        samp.init(shared)
        return samp

    samplers = [init_sampler() for _ in xrange(K)]

    def gen_cluster(samp):
        data = [(samp.eval(shared), ) for _ in xrange(Nk)]
        return np.array(data, dtype=[('', bool)])

    Y_clustered = tuple(map(gen_cluster, samplers))
    Y, assignment = data_with_assignment(Y_clustered)
    view = dataview(Y)
    s = initialize_fn(model_definition(Y.shape[0], [bb]),
                      view,
                      cluster_hp={'alpha': 2.},
                      feature_hps=[{
                          'alpha': alpha,
                          'beta': beta
                      }],
                      r=r,
                      assignment=assignment)
    return s, view
Example 12
def _test_stress(initialize_fn, dataview, R):
    N = 20
    D = 2
    data = np.random.random(size=(N, D)) < 0.8
    Y = np.array([tuple(y) for y in data], dtype=[('', bool)] * D)
    view = dataview(Y)
    defn = model_definition(N, [bb] * D)

    s = initialize_fn(defn, view, cluster_hp={'alpha': 2.0}, r=R)

    CHANGE_GROUP = 1
    CHANGE_VALUE = 2

    nops = 100
    while nops:
        assert len(s.groups()) >= 1
        choice = np.random.choice([CHANGE_GROUP, CHANGE_VALUE])
        if choice == CHANGE_GROUP:
            # remove any empty groups. otherwise, add a new group
            egroups = s.empty_groups()
            if len(egroups) > 1:
                s.delete_group(egroups[0])
            else:
                s.create_group(R)
        else:
            eid = np.random.randint(N)
            if s.assignments()[eid] == -1:
                # add to random group
                egid = np.random.choice(s.groups())
                s.add_value(egid, eid, Y[eid], R)
            else:
                s.remove_value(eid, Y[eid], R)
        s.dcheck_consistency()
        nops -= 1
Example 13
def test_operations():
    N = 10
    R = rng(12)

    def mkrow():
        return (np.random.choice([False,
                                  True]), np.random.choice([False, True]),
                np.random.random(), np.random.choice([False, True]))

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]
    # non-masked data
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # *_initialize() randomly assigns all entities to a group, so we'll have to
    # unset this assignment for this test
    unset(cxx_s, data, R)

    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N

    cxx_s.dcheck_consistency()

    assert cxx_s.ngroups() == 3 and set(cxx_s.empty_groups()) == set([0, 1, 2])

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in it.islice(enumerate(data), 2):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()

    newrow = mkrow()
    newdata = np.array([newrow], dtype=dtype)

    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
Example 14
def test_model_definition_pickle():
    defn = model_definition(10, [bb, niw(3)])
    bstr = pickle.dumps(defn)
    defn1 = pickle.loads(bstr)
    assert_equals(defn.n(), defn1.n())
    assert_equals(len(defn.models()), len(defn1.models()))
    for a, b in zip(defn.models(), defn1.models()):
        assert_equals(a.name(), b.name())
Example 15
def test_model_definition_copy():
    defn = model_definition(10, [bb, niw(3)])
    defn_shallow = copy.copy(defn)
    defn_deep = copy.deepcopy(defn)
    assert_is_not(defn, defn_shallow)
    assert_is_not(defn, defn_deep)
    assert_is_not(defn._models, defn_deep._models)
    assert_equals(defn.n(), defn_shallow.n())
    assert_equals(defn.n(), defn_deep.n())
Example 16
def test_sample_sanity():
    # just a sanity check
    defn = model_definition(10, [bb, bnb, gp, nich, dd(5), niw(4)])
    clusters, samplers = sample(defn)
    assert_equals(len(clusters), len(samplers))
    for cluster in clusters:
        assert_true(len(cluster) > 0)
        for v in cluster:
            assert_equals(len(v), len(defn.models()))
Example 17
def test_zmatrix():
    N, D = 10, 4
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]
    zmat = query.zmatrix(latents)
    assert_equals(zmat.shape, (N, N))
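
Note: a z-matrix summarizes a set of clusterings as an N x N co-assignment matrix. A minimal plain-numpy sketch of the idea (assuming, as is conventional, that entry (i, j) is the fraction of latent states placing entities i and j in the same group):

import numpy as np

def zmatrix_sketch(assignment_vectors):
    A = np.asarray(assignment_vectors)       # shape (nlatents, N)
    same = A[:, :, None] == A[:, None, :]    # (nlatents, N, N)
    return same.mean(axis=0)

zmat = zmatrix_sketch([[0, 0, 1], [0, 1, 1]])
assert zmat.shape == (3, 3) and zmat[0][1] == 0.5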
Example 18
def test_sample_post_pred():
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {
            'alpha': 2.0
        },
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(np.array([y_new_data], dtype=dtype),
                            mask=[y_new_mask])[0]

    n_samples = 1000

    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
Example 19
def _test_cluster_hp_inference(initialize_fn,
                               prior_fn,
                               grid_min,
                               grid_max,
                               grid_n,
                               dataview,
                               bind_fn,
                               init_inf_kernel_state_fn,
                               inf_kernel_fn,
                               map_actual_postprocess_fn,
                               prng,
                               burnin=1000,
                               nsamples=1000,
                               skip=10,
                               trials=100,
                               places=2):
    print '_test_cluster_hp_inference: burnin', burnin, 'nsamples', nsamples, \
        'skip', skip, 'trials', trials, 'places', places

    N = 1000
    D = 5

    # create random binary data, doesn't really matter what the values are
    Y = np.random.random(size=(N, D)) < 0.5
    Y = np.array([tuple(y) for y in Y], dtype=[('', np.bool)] * D)
    view = dataview(Y)

    defn = model_definition(N, [bb] * D)
    latent = initialize_fn(defn, view, r=prng)
    model = bind_fn(latent, view)

    def score_alpha(alpha):
        prev_alpha = latent.get_cluster_hp()['alpha']
        latent.set_cluster_hp({'alpha': alpha})
        score = prior_fn(alpha) + latent.score_assignment()
        latent.set_cluster_hp({'alpha': prev_alpha})
        return score

    def sample_fn():
        for _ in xrange(skip - 1):
            inf_kernel_fn(model, opaque, prng)
        inf_kernel_fn(model, opaque, prng)
        return latent.get_cluster_hp()['alpha']

    alpha0 = np.random.uniform(grid_min, grid_max)
    print 'start alpha:', alpha0
    latent.set_cluster_hp({'alpha': alpha0})

    opaque = init_inf_kernel_state_fn(latent)
    for _ in xrange(burnin):
        inf_kernel_fn(model, opaque, prng)
    print 'finished burnin of', burnin, 'iterations'

    print 'grid_min', grid_min, 'grid_max', grid_max
    assert_1d_cont_dist_approx_emp(sample_fn, score_alpha, grid_min, grid_max,
                                   grid_n, trials, nsamples, places)
Example 20
def test_operations():
    N = 10
    R = rng(12)

    def mkrow():
        return (np.random.choice([False, True]),
                np.random.choice([False, True]),
                np.random.random(),
                np.random.choice([False, True]))
    dtype = [('', bool), ('', bool), ('', float), ('', bool)]
    # non-masked data
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
            dist_bb.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # *_initialize() randomly assigns all entities to a group, so we'll have to
    # unset this assignment for this test
    unset(cxx_s, data, R)

    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N

    cxx_s.dcheck_consistency()

    assert cxx_s.ngroups() == 3 and set(cxx_s.empty_groups()) == set([0, 1, 2])

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in it.islice(enumerate(data), 2):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()

    newrow = mkrow()
    newdata = np.array([newrow], dtype=dtype)

    cxx_score = cxx_s.score_value(newdata[0], R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
Example 21
def _test_sample_post_pred(initialize_fn, dataview, y_new, r):
    defn = model_definition(N, [bb] * D)

    data = [tuple(row) for row in (np.random.random(size=(N, D)) < 0.8)]
    data = np.array(data, dtype=[('', bool)] * D)

    s = initialize_fn(defn=defn,
                      data=dataview(data),
                      cluster_hp={'alpha': 2.},
                      feature_hps=[{
                          'alpha': 1.,
                          'beta': 1.
                      }] * D,
                      r=r)

    n_samples = 10000
    Y_samples = [s.sample_post_pred(None, r)[1] for _ in xrange(n_samples)]
    Y_samples = np.hstack(Y_samples)

    empty_groups = list(s.empty_groups())
    if len(empty_groups):
        for egid in empty_groups[1:]:
            s.delete_group(egid)
    else:
        s.create_group(r)
    assert len(s.empty_groups()) == 1

    def score_post_pred(y):
        # XXX: the C++ API can only handle structural arrays for now
        y = np.array([y], dtype=[('', bool)] * D)[0]
        _, scores = s.score_value(y, r)
        return logsumexp(scores)

    scores = np.array(
        list(map(score_post_pred, it.product([False, True], repeat=D))))
    scores = np.exp(scores)
    assert_almost_equals(scores.sum(), 1.0, places=3)

    # lazy man
    idmap = {y: i for i, y in enumerate(it.product([False, True], repeat=D))}

    smoothing = 1e-5
    sample_hist = np.zeros(len(idmap), dtype=np.int)
    for y in Y_samples:
        sample_hist[idmap[tuple(y)]] += 1.

    sample_hist = np.array(sample_hist, dtype=np.float) + smoothing
    sample_hist /= sample_hist.sum()

    #print 'actual', scores
    #print 'emp', sample_hist
    kldiv = KL_discrete(scores, sample_hist)
    print 'KL:', kldiv

    assert kldiv <= 0.005
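
Note: assuming KL_discrete is the standard discrete Kullback-Leibler divergence D(p || q) = sum_i p_i * log(p_i / q_i), a minimal sketch (the test smooths the empirical histogram precisely so that q has no zero entries):

import numpy as np

def kl_discrete_sketch(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    nz = p > 0                    # 0 * log(0) is taken as 0
    return float(np.sum(p[nz] * np.log(p[nz] / q[nz])))

assert kl_discrete_sketch([0.5, 0.5], [0.5, 0.5]) == 0.0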
Example 22
def test_sample_post_pred():
    N = 10
    R = rng(5483932)
    D = 4

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))
    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    G = 3
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype),
        mask=[y_new_mask])[0]

    n_samples = 1000

    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
Example 23
def test_runner_multyvac():
    defn = model_definition(10, [bb, nich, niw(3)])
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = runner.default_kernel_config(defn)
    prng = rng()
    latents = [model.initialize(defn, view, prng)
               for _ in xrange(2)]
    runners = [runner.runner(defn, view, latent, kc) for latent in latents]
    r = parallel.runner(runners, backend='multyvac', layer='perf', core='f2')
    r.run(r=prng, niters=1000)
    r.run(r=prng, niters=1000)
Example 24
def _test_sample_post_pred(initialize_fn, dataview, y_new, r):
    defn = model_definition(N, [bb] * D)

    data = [tuple(row) for row in (np.random.random(size=(N, D)) < 0.8)]
    data = np.array(data, dtype=[('', bool)] * D)

    s = initialize_fn(
        defn=defn,
        data=dataview(data),
        cluster_hp={'alpha': 2.},
        feature_hps=[{'alpha': 1., 'beta': 1.}] * D,
        r=r)

    n_samples = 10000
    Y_samples = [s.sample_post_pred(None, r)[1] for _ in xrange(n_samples)]
    Y_samples = np.hstack(Y_samples)

    empty_groups = list(s.empty_groups())
    if len(empty_groups):
        for egid in empty_groups[1:]:
            s.delete_group(egid)
    else:
        s.create_group(r)
    assert len(s.empty_groups()) == 1

    def score_post_pred(y):
        # XXX: the C++ API can only handle structural arrays for now
        y = np.array([y], dtype=[('', bool)] * D)[0]
        _, scores = s.score_value(y, r)
        return logsumexp(scores)

    scores = np.array(
        list(map(score_post_pred, it.product([False, True], repeat=D))))
    scores = np.exp(scores)
    assert_almost_equals(scores.sum(), 1.0, places=3)

    # lazy man
    idmap = {y: i for i, y in enumerate(it.product([False, True], repeat=D))}

    smoothing = 1e-5
    sample_hist = np.zeros(len(idmap), dtype=np.int)
    for y in Y_samples:
        sample_hist[idmap[tuple(y)]] += 1.

    sample_hist = np.array(sample_hist, dtype=np.float) + smoothing
    sample_hist /= sample_hist.sum()

    #print 'actual', scores
    #print 'emp', sample_hist
    kldiv = KL_discrete(scores, sample_hist)
    print 'KL:', kldiv

    assert kldiv <= 0.005
Example 25
def _test_scalar_hp_inference(view,
                              prior_fn,
                              w,
                              grid_min,
                              grid_max,
                              grid_n,
                              likelihood_model,
                              scalar_hp_key,
                              burnin=1000,
                              nsamples=1000,
                              every=10,
                              trials=100,
                              places=2):
    """
    view must be 1D
    """
    r = rng()

    hparams = {0: {scalar_hp_key: (prior_fn, w)}}

    def score_fn(scalar):
        d = latent.get_feature_hp(0)
        prev_scalar = d[scalar_hp_key]
        d[scalar_hp_key] = scalar
        latent.set_feature_hp(0, d)
        score = prior_fn(scalar) + latent.score_data(0, None, r)
        d[scalar_hp_key] = prev_scalar
        latent.set_feature_hp(0, d)
        return score

    defn = model_definition(len(view), [likelihood_model])
    latent = initialize(defn, view, r=r)
    model = bind(latent, view)

    def sample_fn():
        for _ in xrange(every):
            slice_hp(model, r, hparams=hparams)
        return latent.get_feature_hp(0)[scalar_hp_key]

    for _ in xrange(burnin):
        slice_hp(model, r, hparams=hparams)
    print 'finished burnin of', burnin, 'iterations'

    print 'grid_min', grid_min, 'grid_max', grid_max
    assert_1d_cont_dist_approx_emp(sample_fn,
                                   score_fn,
                                   grid_min,
                                   grid_max,
                                   grid_n,
                                   trials,
                                   nsamples,
                                   places)
Example 26
def test_runner_multiprocessing():
    defn = model_definition(10, [bb, nich, niw(3)])
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = runner.default_kernel_config(defn)
    prng = rng()
    latents = [model.initialize(defn, view, prng)
               for _ in xrange(mp.cpu_count())]
    runners = [runner.runner(defn, view, latent, kc) for latent in latents]
    r = parallel.runner(runners)
    # check it is restartable
    r.run(r=prng, niters=10)
    r.run(r=prng, niters=10)
Example 27
def run_dpgmm(niter=1000, datadir="../../", nfeatures=13):

    ranking = [10,  6,  7, 26,  5,  8,  4, 19, 12, 23, 24, 33, 28, 25,
               14,  3,  0, 1, 21, 30, 11, 31, 13,  9, 22,  2, 27, 29,
               32, 17, 18, 20, 16, 15]

    features, labels, lc, hr, tstart, \
        features_lb, labels_lb, lc_lb, hr_lb, \
        fscaled, fscaled_lb, fscaled_full, labels_all = \
            load_data(datadir, tseg=1024.0, log_features=None,
                      ranking=ranking)

    labels_phys = feature_engineering.convert_labels_to_physical(labels)
    labels_phys_lb = feature_engineering.convert_labels_to_physical(labels_lb)

    labels_all_phys = np.hstack([labels_phys["train"], labels_phys["val"],
                                 labels_phys["test"]])


    fscaled_small = fscaled_full[:, :13]

    nchains = 8

    # The random state object
    prng = rng()

    # Define a DP-GMM where the Gaussian is 2D
    defn = model_definition(fscaled_small.shape[0],
                            [normal_inverse_wishart(fscaled_small.shape[1])])

    fscaled_rec = np.array([(list(f),) for f in fscaled_small],
                           dtype=[('', np.float32, fscaled_small.shape[1])])

    # Create a wrapper around the numpy recarray which
    # data-microscopes understands
    view = numpy_dataview(fscaled_rec)

    # Initialize nchains start points randomly in the state space
    latents = [model.initialize(defn, view, prng) for _ in xrange(nchains)]

    # Create a runner for each chain
    runners = [runner.runner(defn, view, latent,
                             kernel_config=['assign']) for latent in latents]
    r = parallel.runner(runners)

    r.run(r=prng, niters=niter)

    with open(datadir+"grs1915_dpgmm.pkl", "w") as f:
        pickle.dump(r, f)

    return
Example 28
def score_dataset(counts):
    M, K = counts.shape
    Y = np.array([(y, ) for y in counts], dtype=[('', np.int, (K, ))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(M, [dm(K)])
    prior = {'alphas': [1.] * K}
    s = cxx_initialize(defn,
                       view,
                       r,
                       feature_hps=[prior],
                       assignment=[0] * M)
    assert_equals(s.groups(), [0])
    return s.score_data(None, None, r)
Example 29
def _test_crp(initialize_fn, dataview, alpha, r):
    N = 6
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = dataview(Y)

    def crp_score(assignment):
        latent = initialize_fn(
            defn, view, r=r,
            cluster_hp={'alpha': alpha}, assignment=assignment)
        return latent.score_assignment()
    dist = np.array(list(map(crp_score, permutation_iter(N))))
    dist = np.exp(dist)
    assert_almost_equals(dist.sum(), 1.0, places=3)
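
Note: permutation_iter(N) enumerates the set partitions of N entities in canonical form, which is why the exponentiated scores above must sum to one. A hypothetical stand-in via restricted growth strings (cluster ids ordered by first appearance):

def partition_iter_sketch(N):
    def rec(prefix, maxid):
        if len(prefix) == N:
            yield tuple(prefix)
            return
        for k in range(maxid + 2):   # reuse an existing id, or open id maxid + 1
            for out in rec(prefix + [k], max(maxid, k)):
                yield out
    return rec([0], 0) if N else iter([()])

assert sum(1 for _ in partition_iter_sketch(4)) == 15  # Bell(4)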
Example 30
def test_masked_operations():
    N = 10
    R = rng(2347785)

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return (randombool(), randombool(), randombool())

    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)
    mask = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(data, mask=mask)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # see comment above
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in enumerate(data):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()
Example 31
def score_dataset(counts):
    M, K = counts.shape
    Y = np.array([(y,) for y in counts], dtype=[('', np.int, (K,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(M, [dm(K)])
    prior = {'alphas': [1.] * K}
    s = cxx_initialize(
        defn,
        view,
        r,
        feature_hps=[prior],
        assignment=[0] * M)
    assert_equals(s.groups(), [0])
    return s.score_data(None, None, r)
Example 32
def test_get_set_params():
    defn = model_definition(1, [bb, bnb, gp, nich])
    data = np.array([(True, 3, 5, 10.), ],
                    dtype=[('', bool), ('', int), ('', int), ('', float)])
    s = initialize(defn=defn, data=numpy_dataview(data), r=rng())
    s.set_cluster_hp({'alpha': 3.0})
    assert_dict_almost_equals(s.get_cluster_hp(), {'alpha': 3.0})
    hyperparams = [
        {'alpha': 1.2, 'beta': 4.3},
        {'alpha': 1., 'beta': 1., 'r': 1},
        {'alpha': 1., 'inv_beta': 1.},
        {'mu': 30., 'kappa': 1., 'sigmasq': 1., 'nu': 1.},
    ]
    for i, hp in enumerate(hyperparams):
        s.set_feature_hp(i, hp)
        assert_dict_almost_equals(s.get_feature_hp(i), hp)
Example 33
def _test_crp(initialize_fn, dataview, alpha, r):
    N = 6
    defn = model_definition(N, [bb])
    Y = np.array([(True, )] * N, dtype=[('', bool)])
    view = dataview(Y)

    def crp_score(assignment):
        latent = initialize_fn(defn,
                               view,
                               r=r,
                               cluster_hp={'alpha': alpha},
                               assignment=assignment)
        return latent.score_assignment()

    dist = np.array(list(map(crp_score, permutation_iter(N))))
    dist = np.exp(dist)
    assert_almost_equals(dist.sum(), 1.0, places=3)
Example 34
def latent(groups, entities_per_group, features, r):
    N = groups * entities_per_group
    defn = model_definition(N, [bb] * features)

    # generate fake data
    Y = np.random.random(size=(N, features)) <= 0.5
    view = numpy_dataview(
        np.array([tuple(y) for y in Y], dtype=[('', bool)] * features))

    # assign entities to their respective groups
    assignment = [[g] * entities_per_group for g in xrange(groups)]
    assignment = list(it.chain.from_iterable(assignment))

    latent = bind(initialize(defn, view, r, assignment=assignment), view)
    latent.create_group(r)  # perftest() doesn't modify group assignments

    return latent
Example 35
def _test_scalar_hp_inference(view,
                              prior_fn,
                              w,
                              grid_min,
                              grid_max,
                              grid_n,
                              likelihood_model,
                              scalar_hp_key,
                              burnin=1000,
                              nsamples=1000,
                              every=10,
                              trials=100,
                              places=2):
    """
    view must be 1D
    """
    r = rng()

    hparams = {0: {scalar_hp_key: (prior_fn, w)}}

    def score_fn(scalar):
        d = latent.get_feature_hp(0)
        prev_scalar = d[scalar_hp_key]
        d[scalar_hp_key] = scalar
        latent.set_feature_hp(0, d)
        score = prior_fn(scalar) + latent.score_data(0, None, r)
        d[scalar_hp_key] = prev_scalar
        latent.set_feature_hp(0, d)
        return score

    defn = model_definition(len(view), [likelihood_model])
    latent = initialize(defn, view, r=r)
    model = bind(latent, view)

    def sample_fn():
        for _ in xrange(every):
            slice_hp(model, r, hparams=hparams)
        return latent.get_feature_hp(0)[scalar_hp_key]

    for _ in xrange(burnin):
        slice_hp(model, r, hparams=hparams)
    print 'finished burnin of', burnin, 'iterations'

    print 'grid_min', grid_min, 'grid_max', grid_max
    assert_1d_cont_dist_approx_emp(sample_fn, score_fn, grid_min, grid_max,
                                   grid_n, trials, nsamples, places)
Example 36
def test_runner_convergence():
    N, D = 4, 5
    defn = model_definition(N, [bb] * D)
    prng = rng()
    Y, posterior = data_with_posterior(defn, r=prng)
    view = numpy_dataview(Y)
    latent = model.initialize(defn, view, prng)
    r = runner.runner(defn, view, latent, ['assign'])
    r.run(r=prng, niters=1000)  # burnin
    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_fn():
        r.run(r=prng, niters=10)
        new_latent = r.get_latent()
        return idmap[tuple(permutation_canonical(new_latent.assignments()))]

    assert_discrete_dist_approx(sample_fn, posterior, ntries=100)
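
Note: permutation_canonical relabels an assignment vector so that group ids appear in order of first use, making logically identical clusterings map to the same idmap key. A hypothetical stand-in:

def canonical_sketch(assignments):
    relabel = {}
    for gid in assignments:
        if gid not in relabel:
            relabel[gid] = len(relabel)
    return tuple(relabel[gid] for gid in assignments)

assert canonical_sketch([7, 7, 2, 7, 5]) == (0, 0, 1, 0, 2)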
Example 37
def _test_serializer(initialize_fn, deserialize_fn, dataview):
    N = 10
    R = rng()

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return (randombool(), randombool(), randombool())

    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'data': dataview(data),
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    state = initialize_fn(**init_args)

    raw = state.serialize()

    state1 = deserialize_fn(defn, raw)
    assert state1 is not None

    bstr = pickle.dumps(state)
    state2 = pickle.loads(bstr)
    assert state2 is not None
Example 38
def test_masked_operations():
    N = 10
    R = rng(2347785)

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return (randombool(), randombool(), randombool())
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)
    mask = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(data, mask=mask)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # see comment above
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    for i, yi in enumerate(data):
        egid = i % 2
        cxx_s.add_value(egid, i, yi, R)
        cxx_s.dcheck_consistency()

    for i, yi in enumerate(data):
        cxx_s.remove_value(i, yi, R)
        cxx_s.dcheck_consistency()
Example 39
def test_posterior_predictive():
    N, D = 10, 4  # D needs to be even
    defn = model_definition(N, [bb] * D)
    Y = toy_dataset(defn)
    prng = rng()
    view = numpy_dataview(Y)
    latents = [model.initialize(defn, view, prng) for _ in xrange(10)]

    q = ma.masked_array(
        np.array([(False,) * D], dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)])
    samples = query.posterior_predictive(q, latents, prng)
    assert_equals(samples.shape, (1, len(latents)))

    q = ma.masked_array(
        np.array([(False,) * D] * 3, dtype=[('', bool)] * D),
        mask=[(False,) * (D / 2) + (True,) * (D / 2)] * 3)
    samples = query.posterior_predictive(q, latents, prng)
    assert_equals(samples.shape, (3, len(latents)))
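
Note: the query rows follow a masked-array convention: mask False marks an observed field, mask True a field for the model to fill in. A plain numpy.ma illustration of the convention, independent of the query API:

import numpy as np
import numpy.ma as ma

D = 4
q = ma.masked_array(
    np.array([(False,) * D], dtype=[('', bool)] * D),
    mask=[(False,) * (D // 2) + (True,) * (D // 2)])
# first two fields observed, last two to be imputed
assert tuple(q.mask[0]) == (False, False, True, True)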
Example 40
def test_crp_empirical():
    N = 4
    alpha = 2.5
    defn = model_definition(N, [bb])
    Y = np.array([(True,)] * N, dtype=[('', bool)])
    view = numpy_dataview(Y)
    r = rng()

    def crp_score(assignment):
        latent = initialize(
            defn, view, r=r,
            cluster_hp={'alpha': alpha}, assignment=assignment)
        return latent.score_assignment()
    scores = np.array(list(map(crp_score, permutation_iter(N))))
    dist = scores_to_probs(scores)
    idmap = {C: i for i, C in enumerate(permutation_iter(N))}

    def sample_fn():
        sample = permutation_canonical(_sample_crp(N, alpha))
        return idmap[tuple(sample)]
    assert_discrete_dist_approx(sample_fn, dist, ntries=100)
Example 41
def test_dm_cxx():
    K = 4
    Y = np.array([
        ([0, 1, 2, 5],),
        ([1, 0, 1, 2],),
        ([0, 2, 9, 9],),
    ], dtype=[('', np.int, (K,))])
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    cxx_s = cxx_initialize(
        defn,
        cxx_view,
        r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
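
Note: with all rows assigned to a single group, the Dirichlet-multinomial 'counts' sufficient statistic reduces to the column sums of the count matrix. A plain-numpy restatement of the assertion above:

import numpy as np

Y_np = np.array([[0, 1, 2, 5], [1, 0, 1, 2], [0, 2, 9, 9]])
assert list(Y_np.sum(axis=0)) == [1, 3, 12, 16]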
Example 42
def _test_serializer(initialize_fn, deserialize_fn, dataview):
    N = 10
    R = rng()

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return (randombool(), np.random.randint(1, 10), np.random.random())

    def mkmask():
        return (randombool(), randombool(), randombool())
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb, bnb, nich])
    init_args = {
        'defn': defn,
        'data': dataview(data),
        'cluster_hp': {'alpha': 10.0},
        'feature_hps': [
            dist_bb.EXAMPLES[0]['shared'],
            dist_bnb.EXAMPLES[0]['shared'],
            dist_nich.EXAMPLES[0]['shared'],
        ],
        'r': R,
    }
    state = initialize_fn(**init_args)

    raw = state.serialize()

    state1 = deserialize_fn(defn, raw)
    assert state1 is not None

    bstr = pickle.dumps(state)
    state2 = pickle.loads(bstr)
    assert state2 is not None
Example 43
def test_dm_cxx():
    K = 4
    Y = np.array([
        ([0, 1, 2, 5], ),
        ([1, 0, 1, 2], ),
        ([0, 2, 9, 9], ),
    ],
                 dtype=[('', np.int, (K, ))])
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    cxx_s = cxx_initialize(defn,
                           cxx_view,
                           r,
                           feature_hps=[prior],
                           assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
Example 44
def _test_runner_kernel_config(kc_fn, models):
    defn = model_definition(10, models)
    Y = toy_dataset(defn)
    view = numpy_dataview(Y)
    kc = kc_fn(defn)
    prng = rng()

    ntries = 5
    while ntries:
        latent = model.initialize(defn, view, prng)
        assignments = latent.assignments()
        r = runner.runner(defn, view, latent, kc)
        r.run(r=prng, niters=10)
        assignments1 = r.get_latent().assignments()

        # XXX: it should be very unlikely the assignments are all equal
        if assignments == assignments1:
            ntries -= 1
        else:
            return  # success

    assert_true(False)  # exceeded ntries
Example 45
def _test_multivariate_models(initialize_fn, dataview, bind, gibbs_assign, R):
    # XXX: this test only checks that the operations don't crash
    mu = np.ones(3)
    kappa = 0.3
    Q = random_orthonormal_matrix(3)
    psi = np.dot(Q, np.dot(np.diag([1.0, 0.5, 0.2]), Q.T))
    nu = 6

    N = 10

    def genrow():
        return tuple([
            np.random.choice([False, True]),
            [np.random.uniform(-3.0, 3.0) for _ in xrange(3)]
        ])

    X = np.array([genrow() for _ in xrange(N)],
                 dtype=[('', bool), ('', float, (3, ))])
    view = dataview(X)

    defn = model_definition(N, [bb, niw(3)])
    s = initialize_fn(defn,
                      view,
                      cluster_hp={'alpha': 2.},
                      feature_hps=[{
                          'alpha': 2.,
                          'beta': 2.
                      }, {
                          'mu': mu,
                          'kappa': kappa,
                          'psi': psi,
                          'nu': nu
                      }],
                      r=R)

    bound_s = bind(s, view)
    for _ in xrange(10):
        gibbs_assign(bound_s, R)
Example 46
def _test_multivariate_models(initialize_fn,
                              dataview,
                              bind,
                              gibbs_assign,
                              R):
    # XXX: this test only checks that the operations don't crash
    mu = np.ones(3)
    kappa = 0.3
    Q = random_orthonormal_matrix(3)
    psi = np.dot(Q, np.dot(np.diag([1.0, 0.5, 0.2]), Q.T))
    nu = 6

    N = 10

    def genrow():
        return tuple(
            [np.random.choice([False, True]),
             [np.random.uniform(-3.0, 3.0) for _ in xrange(3)]])
    X = np.array([genrow()
                  for _ in xrange(N)], dtype=[('', bool), ('', float, (3,))])
    view = dataview(X)

    defn = model_definition(N, [bb, niw(3)])
    s = initialize_fn(
        defn,
        view,
        cluster_hp={'alpha': 2.},
        feature_hps=[
            {'alpha': 2., 'beta': 2.},
            {'mu': mu, 'kappa': kappa, 'psi': psi, 'nu': nu}
        ],
        r=R)

    bound_s = bind(s, view)
    for _ in xrange(10):
        gibbs_assign(bound_s, R)
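
Note: random_orthonormal_matrix is used here to build a valid psi: Q diag(d) Q^T is symmetric positive-definite whenever d > 0, as a NIW scale matrix must be. A hypothetical stand-in via QR decomposition of a Gaussian random matrix:

import numpy as np

def random_orthonormal_sketch(n):
    Q, _ = np.linalg.qr(np.random.randn(n, n))
    return Q

Q = random_orthonormal_sketch(3)
assert np.allclose(np.dot(Q.T, Q), np.eye(3))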
Example 47
def test_mnist_supervised():
    mnist_dataset = _get_mnist_dataset()
    classes = range(10)
    classmap = {c: i for i, c in enumerate(classes)}
    train_data, test_data = [], []
    for c in classes:
        Y = mnist_dataset['data'][
            np.where(mnist_dataset['target'] == float(c))[0]]
        Y_train, Y_test = train_test_split(Y, test_size=0.01)
        train_data.append(Y_train)
        test_data.append(Y_test)

    sample_size_max = 10000

    def mk_class_data(c, Y):
        n, D = Y.shape
        print 'number of digit', c, 'in training is', n
        dtype = [('', bool)] * D + [('', int)]
        inds = np.random.permutation(Y.shape[0])[:sample_size_max]
        Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]],
                     dtype=dtype)
        return Y
    Y_train = np.hstack([mk_class_data(c, y)
                         for c, y in zip(classes, train_data)])
    Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))]

    n, = Y_train.shape
    D = len(Y_train.dtype)
    print 'training data is', n, 'examples'
    print 'image dimension is', (D - 1), 'pixels'

    view = numpy_dataview(Y_train)
    defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))])
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{'alpha': 1., 'beta': 1.}] *
                   (D - 1) + [{'alphas': [1. for _ in classes]}],
                   r=r)

    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        } for i in xrange(D - 1)}
    hparams[D - 1] = {
        'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5)
        for idx in xrange(len(classes))
    }

    def print_prediction_results():
        results = []
        for c, Y_test in zip(classes, test_data):
            for y in Y_test:
                query = ma.masked_array(
                    np.array([tuple(y) + (0,)],
                             dtype=[('', bool)] * (D - 1) + [('', int)]),
                    mask=[(False,) * (D - 1) + (True,)])[0]
                samples = [
                    s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)]
                samples = np.bincount(samples, minlength=len(classes))
                prediction = np.argmax(samples)
                results.append((classmap[c], prediction, samples))
            print 'finished predictions for class', c

        Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
        Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
        print 'accuracy:', accuracy_score(Y_actual, Y_pred)
        print 'confusion matrix:'
        print confusion_matrix(Y_actual, Y_pred)

        # AUROC for one vs all (each class)
        for i, clabel in enumerate(classes):
            Y_true = np.copy(Y_actual)

            # treat class c as the "positive" example
            positive_examples = Y_actual == i
            negative_examples = Y_actual != i
            Y_true[positive_examples] = 1
            Y_true[negative_examples] = 0
            Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
            cls_auc = roc_auc_score(Y_true, Y_prob)
            print 'class', clabel, 'auc=', cls_auc

        #import matplotlib.pylab as plt
        #Y_prob = np.array([c for _, _, c in results])
        #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0)
        #plt.plot(fpr, tpr)
        #plt.show()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0

        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1

        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'

        sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups())))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'

        # print group size breakdown
        sizes = [(gid, s.groupsize(gid)) for gid in s.groups()]
        sizes = sorted(sizes, key=lambda x: x[1], reverse=True)
        print '  group_sizes=', sizes

        print_prediction_results()

        # save state
        mkdirp("mnist-states")
        fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid))
        with open(fname, "w") as fp:
            fp.write(s.serialize())

    # training
    iters = 30
    for rid in xrange(iters):
        kernel(rid)
Example 48
def test_mnist_supervised(n):
    mnist_dataset = _get_mnist_dataset()
    classes = range(10)
    classmap = {c: i for i, c in enumerate(classes)}
    train_data, test_data = [], []
    for c in classes:
        Y = mnist_dataset['data'][np.where(
            mnist_dataset['target'] == float(c))[0]]
        Y_train, Y_test = train_test_split(Y, test_size=0.01)
        train_data.append(Y_train)
        test_data.append(Y_test)

    sample_size_max = n

    def mk_class_data(c, Y):
        n, D = Y.shape
        print 'number of digit', c, 'in training is', n
        dtype = [('', bool)] * D + [('', int)]
        inds = np.random.permutation(Y.shape[0])[:sample_size_max]
        Y = np.array([tuple(list(y) + [classmap[c]]) for y in Y[inds]],
                     dtype=dtype)
        return Y

    Y_train = np.hstack(
        [mk_class_data(c, y) for c, y in zip(classes, train_data)])
    Y_train = Y_train[np.random.permutation(np.arange(Y_train.shape[0]))]

    n, = Y_train.shape
    D = len(Y_train.dtype)
    print 'training data is', n, 'examples'
    print 'image dimension is', (D - 1), 'pixels'

    view = numpy_dataview(Y_train)
    defn = model_definition(n, [bb] * (D - 1) + [dd(len(classes))])
    r = rng()
    s = initialize(defn,
                   view,
                   cluster_hp={'alpha': 0.2},
                   feature_hps=[{
                       'alpha': 1.,
                       'beta': 1.
                   }] * (D - 1) + [{
                       'alphas': [1. for _ in classes]
                   }],
                   r=r)

    bound_s = bind(s, view)

    indiv_prior_fn = log_exponential(1.2)
    hparams = {
        i: {
            'alpha': (indiv_prior_fn, 1.5),
            'beta': (indiv_prior_fn, 1.5),
        }
        for i in xrange(D - 1)
    }
    hparams[D - 1] = {
        'alphas[{}]'.format(idx): (indiv_prior_fn, 1.5)
        for idx in xrange(len(classes))
    }

    def print_prediction_results():
        results = []
        for c, Y_test in zip(classes, test_data):
            for y in Y_test:
                query = ma.masked_array(
                    np.array([tuple(y) + (0, )],
                             dtype=[('', bool)] * (D - 1) + [('', int)]),
                    mask=[(False, ) * (D - 1) + (True, )])[0]
                samples = [
                    s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)
                ]
                samples = np.bincount(samples, minlength=len(classes))
                prediction = np.argmax(samples)
                results.append((classmap[c], prediction, samples))
            print 'finished predictions for class', c

        Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
        Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
        print 'accuracy:', accuracy_score(Y_actual, Y_pred)
        print 'confusion matrix:'
        print confusion_matrix(Y_actual, Y_pred)

        # AUROC for one vs all (each class)
        for i, clabel in enumerate(classes):
            Y_true = np.copy(Y_actual)

            # treat class c as the "positive" example
            positive_examples = Y_actual == i
            negative_examples = Y_actual != i
            Y_true[positive_examples] = 1
            Y_true[negative_examples] = 0
            Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
            cls_auc = roc_auc_score(Y_true, Y_prob)
            print 'class', clabel, 'auc=', cls_auc

        #import matplotlib.pylab as plt
        #Y_prob = np.array([c for _, _, c in results])
        #fpr, tpr, thresholds = roc_curve(Y_actual, Y_prob, pos_label=0)
        #plt.plot(fpr, tpr)
        #plt.show()

    def kernel(rid):
        start0 = time.time()
        assign(bound_s, r)
        sec0 = time.time() - start0

        start1 = time.time()
        hp(bound_s, r, hparams=hparams)
        sec1 = time.time() - start1

        print 'rid=', rid, 'nclusters=', s.ngroups(), \
            'iter0=', sec0, 'sec', 'iter1=', sec1, 'sec'

        sec_per_post_pred = sec0 / (float(view.size()) * (float(s.ngroups())))
        print '  time_per_post_pred=', sec_per_post_pred, 'sec'

    # training
    iters = 30
    for rid in xrange(iters):
        kernel(rid)

    # print group size breakdown
    sizes = [(gid, s.groupsize(gid)) for gid in s.groups()]
    sizes = sorted(sizes, key=lambda x: x[1], reverse=True)
    print '  group_sizes=', sizes

    #print_prediction_results()

    # save state
    mkdirp("mnist-states")
    fname = os.path.join("mnist-states", "state-iter{}.ser".format(rid))
    with open(fname, "w") as fp:
        fp.write(s.serialize())