Example #1
0
def test_operations():
    """Exercise add/remove/score on a state built from non-masked rows.

    Ten random rows of (bool, bool, float, bool) are generated and used to
    initialize a state with feature models ``[bb, bb, nich, bb]``.  The
    initial assignment is cleared, three groups are ensured, every row is
    added to group 0 or 1, the first two rows are removed again, and one
    fresh row is scored.  ``dcheck_consistency()`` is called after each
    mutation.
    """
    N = 10
    R = rng(12)
    flags = [False, True]

    def mkrow():
        # Draw order is kept fixed: bool, bool, float, bool.
        first = np.random.choice(flags)
        second = np.random.choice(flags)
        real = np.random.random()
        last = np.random.choice(flags)
        return (first, second, real, last)

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]
    # non-masked data
    rows = [mkrow() for _ in xrange(N)]
    data = np.array(rows, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    bb_hp = dist_bb.EXAMPLES[0]['shared']
    nich_hp = dist_nich.EXAMPLES[0]['shared']
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[bb_hp, bb_hp, nich_hp, bb_hp],
        r=R)

    # *_initialize() randomly assigns all entities to a group, so that
    # assignment has to be undone before the test places rows itself
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N
    cxx_s.dcheck_consistency()

    # exactly three groups, all of them still empty
    assert cxx_s.ngroups() == 3
    assert set(cxx_s.empty_groups()) == set([0, 1, 2])

    # alternate rows between groups 0 and 1
    for idx, row in enumerate(data):
        cxx_s.add_value(idx % 2, idx, row, R)
        cxx_s.dcheck_consistency()

    # take the first two rows back out
    first_two = it.islice(enumerate(data), 2)
    for idx, row in first_two:
        cxx_s.remove_value(idx, row, R)
        cxx_s.dcheck_consistency()

    probe = np.array([mkrow()], dtype=dtype)[0]
    cxx_score = cxx_s.score_value(probe, R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
Example #2
0
def test_sample_post_pred():
    """Smoke-test ``sample_post_pred`` on an all-boolean, D-column state.

    N random boolean rows are spread round-robin over G groups, then
    ``n_samples`` draws are taken for a randomly masked query row and folded
    into an empirical distribution over all 2**D boolean tuples.  The test
    only asserts that this distribution was produced.
    """
    N = 10
    R = rng(5483932)
    D = 4
    # number of groups; used both to create the groups and to assign rows
    G = 3

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {
            'alpha': 2.0
        },
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # clear the assignment made during initialization, then make sure the
    # groups targeted by ``i % G`` below actually exist
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(np.array([y_new_data], dtype=dtype),
                            mask=[y_new_mask])[0]

    n_samples = 1000

    # element [1] of each sample_post_pred() result is what gets stacked
    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    # map every possible boolean D-tuple to a histogram bin index
    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        """Return the normalized histogram of `samples` over `idmap` bins."""
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
Example #3
0
def test_operations():
    """Exercise add/remove/score on a state built from non-masked rows.

    Ten random rows of (bool, bool, float, bool) are generated and used to
    initialize a state with feature models ``[bb, bb, nich, bb]``.  The
    initial assignment is cleared, three groups are ensured, every row is
    added to group 0 or 1, the first two rows are removed again, and one
    fresh row is scored.  ``dcheck_consistency()`` is called after each
    mutation.
    """
    N = 10
    R = rng(12)
    flags = [False, True]

    def mkrow():
        # Draw order is kept fixed: bool, bool, float, bool.
        first = np.random.choice(flags)
        second = np.random.choice(flags)
        real = np.random.random()
        last = np.random.choice(flags)
        return (first, second, real, last)

    dtype = [('', bool), ('', bool), ('', float), ('', bool)]
    # non-masked data
    rows = [mkrow() for _ in xrange(N)]
    data = np.array(rows, dtype=dtype)

    defn = model_definition(N, [bb, bb, nich, bb])
    bb_hp = dist_bb.EXAMPLES[0]['shared']
    nich_hp = dist_nich.EXAMPLES[0]['shared']
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 2.0},
        feature_hps=[bb_hp, bb_hp, nich_hp, bb_hp],
        r=R)

    # *_initialize() randomly assigns all entities to a group, so that
    # assignment has to be undone before the test places rows itself
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    assert cxx_s.nentities() == N
    cxx_s.dcheck_consistency()

    # exactly three groups, all of them still empty
    assert cxx_s.ngroups() == 3
    assert set(cxx_s.empty_groups()) == set([0, 1, 2])

    # alternate rows between groups 0 and 1
    for idx, row in enumerate(data):
        cxx_s.add_value(idx % 2, idx, row, R)
        cxx_s.dcheck_consistency()

    # take the first two rows back out
    first_two = it.islice(enumerate(data), 2)
    for idx, row in first_two:
        cxx_s.remove_value(idx, row, R)
        cxx_s.dcheck_consistency()

    probe = np.array([mkrow()], dtype=dtype)[0]
    cxx_score = cxx_s.score_value(probe, R)
    assert cxx_score is not None
    cxx_s.dcheck_consistency()
Example #4
0
def test_sample_post_pred():
    """Smoke-test ``sample_post_pred`` on an all-boolean, D-column state.

    N random boolean rows are spread round-robin over G groups, then
    ``n_samples`` draws are taken for a randomly masked query row and folded
    into an empirical distribution over all 2**D boolean tuples.  The test
    only asserts that this distribution was produced.
    """
    N = 10
    R = rng(5483932)
    D = 4
    # number of groups; used both to create the groups and to assign rows
    G = 3

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        return tuple(randombool() for _ in xrange(D))

    dtype = [('', bool)] * D
    data = [mkrow() for _ in xrange(N)]
    data = np.array(data, dtype=dtype)

    defn = model_definition(N, [bb] * D)
    init_args = {
        'defn': defn,
        'cluster_hp': {'alpha': 2.0},
        'feature_hps': [dist_bb.EXAMPLES[0]['shared']] * D,
        'r': R,
    }
    cxx_s = cxx_initialize(data=cxx_numpy_dataview(data), **init_args)

    # clear the assignment made during initialization, then make sure the
    # groups targeted by ``i % G`` below actually exist
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, G, R)

    for i, yi in enumerate(data):
        egid = i % G
        cxx_s.add_value(egid, i, yi, R)

    # sample
    y_new_data = mkrow()
    y_new_mask = tuple(randombool() for _ in xrange(D))
    y_new = ma.masked_array(
        np.array([y_new_data], dtype=dtype),
        mask=[y_new_mask])[0]

    n_samples = 1000

    # element [1] of each sample_post_pred() result is what gets stacked
    cxx_samples = np.hstack(
        [cxx_s.sample_post_pred(y_new, R)[1] for _ in xrange(n_samples)])

    # map every possible boolean D-tuple to a histogram bin index
    idmap = {C: i for i, C in enumerate(it.product([False, True], repeat=D))}

    def todist(samples):
        """Return the normalized histogram of `samples` over `idmap` bins."""
        dist = np.zeros(len(idmap))
        for s in samples:
            dist[idmap[tuple(s)]] += 1.0
        dist /= dist.sum()
        return dist

    cxx_dist = todist(cxx_samples)
    assert cxx_dist is not None
Example #5
0
 def score_dataset(counts):
     """Score a 2-D count matrix with every row assigned to a single group.

     `counts` must expose ``.shape`` as ``(M, K)`` and iterate row by row.
     Each row becomes one record holding a length-K integer vector; the
     state is initialized with feature model ``dm(K)``, ``alphas`` of all
     ones, and assignment ``[0] * M``.

     Returns the result of ``s.score_data(None, None, r)``.
     """
     M, K = counts.shape
     # builtin `int` replaces `np.int`, which was a plain alias for it and
     # has been removed from recent NumPy releases
     Y = np.array([(y, ) for y in counts], dtype=[('', int, (K, ))])
     view = cxx_numpy_dataview(Y)
     r = rng()
     defn = model_definition(M, [dm(K)])
     prior = {'alphas': [1.] * K}
     s = cxx_initialize(defn,
                        view,
                        r,
                        feature_hps=[prior],
                        assignment=[0] * M)
     # sanity check: the explicit assignment produced exactly one group
     assert_equals(s.groups(), [0])
     return s.score_data(None, None, r)
Example #6
0
def test_masked_operations():
    """Add and remove masked rows, checking consistency after each step.

    Ten random (bool, int, float) rows are paired with random per-field
    masks and wrapped in a masked array.  Each row is added to group 0 or 1
    and then removed again, with ``dcheck_consistency()`` called after every
    single add and remove.
    """
    N = 10
    R = rng(2347785)

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        flag = randombool()
        count = np.random.randint(1, 10)
        real = np.random.random()
        return (flag, count, real)

    def mkmask():
        return tuple(randombool() for _ in xrange(3))

    # all rows are drawn before any mask, matching the draw order of values
    raw = np.array([mkrow() for _ in xrange(N)], dtype=dtype)
    mask = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(raw, mask=mask)

    defn = model_definition(N, [bb, bnb, nich])
    feature_hps = [
        module.EXAMPLES[0]['shared']
        for module in (dist_bb, dist_bnb, dist_nich)
    ]
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 10.0},
        feature_hps=feature_hps,
        r=R)

    # initialization assigns the entities to groups; clear that assignment
    # so the rows can be placed explicitly below
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    # alternate rows between groups 0 and 1
    for idx, row in enumerate(data):
        cxx_s.add_value(idx % 2, idx, row, R)
        cxx_s.dcheck_consistency()

    # remove every row again, in the same order
    for idx, row in enumerate(data):
        cxx_s.remove_value(idx, row, R)
        cxx_s.dcheck_consistency()
Example #7
0
 def score_dataset(counts):
     """Score a 2-D count matrix with every row assigned to a single group.

     `counts` must expose ``.shape`` as ``(M, K)`` and iterate row by row.
     Each row becomes one record holding a length-K integer vector; the
     state is initialized with feature model ``dm(K)``, ``alphas`` of all
     ones, and assignment ``[0] * M``.

     Returns the result of ``s.score_data(None, None, r)``.
     """
     M, K = counts.shape
     # builtin `int` replaces `np.int`, which was a plain alias for it and
     # has been removed from recent NumPy releases
     Y = np.array([(y,) for y in counts], dtype=[('', int, (K,))])
     view = cxx_numpy_dataview(Y)
     r = rng()
     defn = model_definition(M, [dm(K)])
     prior = {'alphas': [1.] * K}
     s = cxx_initialize(
         defn,
         view,
         r,
         feature_hps=[prior],
         assignment=[0] * M)
     # sanity check: the explicit assignment produced exactly one group
     assert_equals(s.groups(), [0])
     return s.score_data(None, None, r)
Example #8
0
def test_masked_operations():
    """Add and remove masked rows, checking consistency after each step.

    Ten random (bool, int, float) rows are paired with random per-field
    masks and wrapped in a masked array.  Each row is added to group 0 or 1
    and then removed again, with ``dcheck_consistency()`` called after every
    single add and remove.
    """
    N = 10
    R = rng(2347785)

    dtype = [('', bool), ('', int), ('', float)]

    def randombool():
        return np.random.choice([False, True])

    def mkrow():
        flag = randombool()
        count = np.random.randint(1, 10)
        real = np.random.random()
        return (flag, count, real)

    def mkmask():
        return tuple(randombool() for _ in xrange(3))

    # all rows are drawn before any mask, matching the draw order of values
    raw = np.array([mkrow() for _ in xrange(N)], dtype=dtype)
    mask = [mkmask() for _ in xrange(N)]
    data = ma.masked_array(raw, mask=mask)

    defn = model_definition(N, [bb, bnb, nich])
    feature_hps = [
        module.EXAMPLES[0]['shared']
        for module in (dist_bb, dist_bnb, dist_nich)
    ]
    cxx_s = cxx_initialize(
        data=cxx_numpy_dataview(data),
        defn=defn,
        cluster_hp={'alpha': 10.0},
        feature_hps=feature_hps,
        r=R)

    # initialization assigns the entities to groups; clear that assignment
    # so the rows can be placed explicitly below
    unset(cxx_s, data, R)
    ensure_k_groups(cxx_s, 3, R)

    # alternate rows between groups 0 and 1
    for idx, row in enumerate(data):
        cxx_s.add_value(idx % 2, idx, row, R)
        cxx_s.dcheck_consistency()

    # remove every row again, in the same order
    for idx, row in enumerate(data):
        cxx_s.remove_value(idx, row, R)
        cxx_s.dcheck_consistency()
Example #9
0
def test_dm_cxx():
    """Check the ``'counts'`` suffstat against column sums of the data.

    Three records, each holding a length-K integer vector, are all assigned
    to group 0 of a state with feature model ``dm(K)``.  The ``'counts'``
    entry of ``get_suffstats(0, 0)`` must equal the element-wise sum of the
    three vectors.
    """
    K = 4
    # builtin `int` replaces `np.int`, which was a plain alias for it and
    # has been removed from recent NumPy releases
    Y = np.array([
        ([0, 1, 2, 5], ),
        ([1, 0, 1, 2], ),
        ([0, 2, 9, 9], ),
    ],
                 dtype=[('', int, (K, ))])
    # np.vstack requires a real sequence; passing a generator is deprecated
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    cxx_s = cxx_initialize(defn,
                           cxx_view,
                           r,
                           feature_hps=[prior],
                           assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
Example #10
0
def test_dm_cxx():
    """Check the ``'counts'`` suffstat against column sums of the data.

    Three records, each holding a length-K integer vector, are all assigned
    to group 0 of a state with feature model ``dm(K)``.  The ``'counts'``
    entry of ``get_suffstats(0, 0)`` must equal the element-wise sum of the
    three vectors.
    """
    K = 4
    # builtin `int` replaces `np.int`, which was a plain alias for it and
    # has been removed from recent NumPy releases
    Y = np.array([
        ([0, 1, 2, 5],),
        ([1, 0, 1, 2],),
        ([0, 2, 9, 9],),
    ], dtype=[('', int, (K,))])
    # np.vstack requires a real sequence; passing a generator is deprecated
    Y_np = np.vstack([y[0] for y in Y])

    cxx_view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(K)])
    prior = {'alphas': [1.] * K}
    cxx_s = cxx_initialize(
        defn,
        cxx_view,
        r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    counts = cxx_s.get_suffstats(0, 0)['counts']
    assert_sequence_equal(counts, list(Y_np.sum(axis=0)))
Example #11
0
def test_betabin_equiv():
    """Compare ``dm(2)`` value scores with a beta-binomial log-likelihood.

    M random (heads, tails) pairs summing to N are loaded into a single
    group.  Every possible (i, j) pair with ``i + j == N`` is then scored by
    the state and by ``betabin_like`` evaluated with the prior ``alpha`` and
    ``beta`` incremented by the observed column totals.  Both sets of
    exponentiated scores must sum to about 1, and the two score vectors must
    agree to one decimal place.
    """

    # https://github.com/pymc-devs/pymc/blob/
    # a7ab153f2b58d81824a56166747c678d7f421bde/pymc/distributions/discrete.py#L84
    def betabin_like(value, alpha, beta, n):
        return (gammaln(alpha + beta) - gammaln(alpha) - gammaln(beta) +
                gammaln(n + 1) - gammaln(value + 1) - gammaln(n - value + 1) +
                gammaln(alpha + value) + gammaln(n + beta - value) -
                gammaln(beta + alpha + n))

    # this N refers to the number of trials in the binomial distribution
    N = 10

    # this refers to the dataset size
    M = 100

    # hyperparams of the beta dist
    alpha, beta = 1., 2.

    heads = np.random.randint(low=0, high=N + 1, size=M)
    tails = N - heads

    data = np.vstack((heads, tails)).T

    # builtin `int` replaces `np.int`, which was a plain alias for it and
    # has been removed from recent NumPy releases
    Y = np.array([(y, ) for y in data], dtype=[('', int, (2, ))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(2)])
    prior = {'alphas': [alpha, beta]}
    s = cxx_initialize(defn,
                       view,
                       r,
                       feature_hps=[prior],
                       assignment=[0] * Y.shape[0])

    assert_equals(s.groups(), [0])

    def all_indices(total):
        """Yield every (i, j) with i, j >= 0 and i + j == total, i ascending."""
        for i in range(total + 1):
            yield i, total - i

    all_data = [(list(ij), ) for ij in all_indices(N)]

    Y_test = np.array(all_data, dtype=[('', int, (2, ))])

    # the actual score is simply a betabin using the updated alpha, beta
    alpha1, beta1 = np.array([alpha, beta]) + data.sum(axis=0)

    def model_score(Y_value):
        _, (score, ) = s.score_value(Y_value, r)
        return score

    def test_score(Y_value):
        # first element of the record's vector is the number of heads
        score = betabin_like(Y_value[0][0], alpha1, beta1, N)
        return score

    # build real lists first: np.array() over a lazy map object does not
    # produce a numeric array
    model_scores = np.array([model_score(y) for y in Y_test])
    test_scores = np.array([test_score(y) for y in Y_test])

    assert_almost_equals(np.exp(model_scores).sum(), 1., places=2)
    assert_almost_equals(np.exp(test_scores).sum(), 1., places=2)
    assert_almost_equals(np.abs(model_scores - test_scores).max(),
                         0.,
                         places=1)
Example #12
0
def test_betabin_equiv():
    """Compare ``dm(2)`` value scores with a beta-binomial log-likelihood.

    M random (heads, tails) pairs summing to N are loaded into a single
    group.  Every possible (i, j) pair with ``i + j == N`` is then scored by
    the state and by ``betabin_like`` evaluated with the prior ``alpha`` and
    ``beta`` incremented by the observed column totals.  Both sets of
    exponentiated scores must sum to about 1, and the two score vectors must
    agree to one decimal place.
    """

    # https://github.com/pymc-devs/pymc/blob/
    # a7ab153f2b58d81824a56166747c678d7f421bde/pymc/distributions/discrete.py#L84
    def betabin_like(value, alpha, beta, n):
        return (gammaln(alpha + beta) - gammaln(alpha) - gammaln(beta) +
                gammaln(n + 1) - gammaln(value + 1) - gammaln(n - value + 1) +
                gammaln(alpha + value) + gammaln(n + beta - value) -
                gammaln(beta + alpha + n))

    # this N refers to the number of trials in the binomial distribution
    N = 10

    # this refers to the dataset size
    M = 100

    # hyperparams of the beta dist
    alpha, beta = 1., 2.

    heads = np.random.randint(low=0, high=N + 1, size=M)
    tails = N - heads

    data = np.vstack((heads, tails)).T

    # builtin `int` replaces `np.int`, which was a plain alias for it and
    # has been removed from recent NumPy releases
    Y = np.array([(y,) for y in data], dtype=[('', int, (2,))])
    view = cxx_numpy_dataview(Y)
    r = rng()
    defn = model_definition(Y.shape[0], [dm(2)])
    prior = {'alphas': [alpha, beta]}
    s = cxx_initialize(
        defn,
        view,
        r,
        feature_hps=[prior],
        assignment=[0] * Y.shape[0])

    assert_equals(s.groups(), [0])

    def all_indices(total):
        """Yield every (i, j) with i, j >= 0 and i + j == total, i ascending."""
        for i in range(total + 1):
            yield i, total - i

    all_data = [(list(ij),) for ij in all_indices(N)]

    Y_test = np.array(all_data, dtype=[('', int, (2,))])

    # the actual score is simply a betabin using the updated alpha, beta
    alpha1, beta1 = np.array([alpha, beta]) + data.sum(axis=0)

    def model_score(Y_value):
        _, (score,) = s.score_value(Y_value, r)
        return score

    def test_score(Y_value):
        # first element of the record's vector is the number of heads
        score = betabin_like(Y_value[0][0], alpha1, beta1, N)
        return score

    # build real lists first: np.array() over a lazy map object does not
    # produce a numeric array
    model_scores = np.array([model_score(y) for y in Y_test])
    test_scores = np.array([test_score(y) for y in Y_test])

    assert_almost_equals(np.exp(model_scores).sum(), 1., places=2)
    assert_almost_equals(np.exp(test_scores).sum(), 1., places=2)
    assert_almost_equals(
        np.abs(model_scores - test_scores).max(), 0., places=1)