Exemple #1
0
def generate_synth_data(V=10, D=10, T=5, N=100, alpha_beta=10., alpha_theta=10., plot=False, train_frac=0.5):
    """
    Sample a synthetic corpus from a correlated LDA model and split every
    document into a training part and a test part.

    Parameters
    ----------
    V : int
        Vocabulary size; each document is stored as a length-V count vector.
    D : int
        Number of documents to generate.
    T : int
        Passed as the first argument of LogisticNormalCorrelatedLDA
        (presumably the number of topics -- confirm against that class).
    N : int
        Forwarded to ``true_lda.generate(N=N, ...)`` for every document.
    alpha_beta : float
        Forwarded to the model constructor.
    alpha_theta : float
        Currently unused: it only appears in the commented-out StandardLDA
        alternative. Kept so existing callers do not break.
    plot : bool
        If True, show the D x V count matrix with matplotlib.
    train_frac : float
        Probability with which each individual word occurrence is assigned
        to the training split.

    Returns
    -------
    (true_lda, train_data, test_data)
        The generating model and two integer arrays shaped like the full
        count matrix; for every document the two splits sum to the original
        counts.
    """
    # Alternative generators that were tried:
    # true_lda = StandardLDA(T, V, alpha_beta=alpha_beta, alpha_theta=alpha_theta)
    # true_lda = StickbreakingCorrelatedLDA(T, V, alpha_beta=alpha_beta)
    true_lda = LogisticNormalCorrelatedLDA(T, V, alpha_beta=alpha_beta)

    print("Sigma: ", true_lda.Sigma)

    # One row of word counts per document.
    data = np.zeros((D, V), dtype=int)
    # `range` instead of the Python 2-only `xrange`, so the function also
    # runs on Python 3 (it already uses the print() function form).
    for d in range(D):
        doc = true_lda.generate(N=N, keep=True)
        data[d, :] = doc.w

    if plot:
        plt.figure()
        plt.imshow(data, interpolation="none")
        plt.xlabel("Vocabulary")
        plt.ylabel("Documents")
        plt.colorbar()
        plt.show()

    # Split each document into two
    train_data = np.zeros_like(data)
    test_data = np.zeros_like(data)
    for d, w in enumerate(data):
        # Get vector where i is repeated w[i] times
        wcnt = ibincount(w)

        # Assign every single word occurrence to train or test independently,
        # then turn both halves back into per-word counts.
        train_inds = np.random.rand(wcnt.size) < train_frac
        train_data[d] = np.bincount(wcnt[train_inds], minlength=V)
        test_data[d] = np.bincount(wcnt[~train_inds], minlength=V)

        # Sanity check: the split must not lose or invent any words.
        assert np.allclose(train_data[d] + test_data[d], w)

    return true_lda, train_data, test_data
def generate_dataset(bias=1.):
    """
    Sample events from a discrete-time network Hawkes model and also return
    them in continuous-time form.

    Relies on the module-level names ``K``, ``dt``, ``dt_max``, ``B`` and
    ``T``, which must be defined before this function is called.

    Parameters
    ----------
    bias : float
        Scales the background hyperparameters: ``beta`` is set to
        ``3. / bias``.

    Returns
    -------
    S_dt
        First value returned by ``dt_model.generate`` (the binned event
        counts; its columns are iterated as one series per process).
    S_ct : ndarray of float
        Event times sorted in ascending order. Each event is placed
        uniformly at random inside its time bin of width ``dt``.
    C_ct : ndarray of int
        For every entry of ``S_ct``, the column index of ``S_dt`` the event
        came from.
    """
    # Create the model with these parameters
    network_hypers = {
        'C': 1,
        'kappa': 1.,
        # Builtin `int` replaces the `np.int` alias, which was removed from
        # NumPy; the resulting dtype is the same.
        'c': np.zeros(K, dtype=int),
        'p': 1 * np.ones((1, 1)),
        'v': 100.
    }
    bkgd_hypers = {"alpha": 3., "beta": 3. / bias}
    dt_model = pyhawkes.models.\
        DiscreteTimeNetworkHawkesModelSpikeAndSlab(K=K, dt=dt, dt_max=dt_max, B=B,
                                                   bkgd_hypers=bkgd_hypers,
                                                   network_hypers=network_hypers)
    # dt_model.bias_model.lambda0 = bias * np.ones(K)
    assert dt_model.check_stability()

    S_dt, _ = dt_model.generate(T=int(np.ceil(T / dt)), keep=False)

    # A single pre-formatted string prints identically under Python 2 and
    # Python 3. The double space reproduces the output of the original
    # `print "sampled dataset with ", n, "events"` statement.
    print("sampled dataset with  {} events".format(S_dt.sum()))

    # Convert S_dt to continuous time: expand each column's counts into
    # per-event bin indices (presumably what ibincount does -- confirm),
    # scale by dt and jitter each event uniformly inside its bin.
    S_ct = dt * np.concatenate([ibincount(S) for S in S_dt.T]).astype(float)
    S_ct += dt * np.random.rand(*S_ct.shape)
    assert np.all(S_ct < T)
    # Label every event with the index of the column it came from, in the
    # same column-by-column order used for S_ct above.
    C_ct = np.concatenate([k * np.ones(S.sum())
                           for k, S in enumerate(S_dt.T)]).astype(int)

    # Sort the data
    perm = np.argsort(S_ct)
    S_ct = S_ct[perm]
    C_ct = C_ct[perm]

    return S_dt, S_ct, C_ct
def generate_dataset(bias=1.):
    """
    Draw a dataset from a discrete-time network Hawkes model and convert the
    binned events into continuous-time event times and process labels.

    Uses the module-level names ``K``, ``dt``, ``dt_max``, ``B`` and ``T``;
    they must exist before the call.

    Parameters
    ----------
    bias : float
        Divides the background ``beta`` hyperparameter (``beta = 3./bias``).

    Returns
    -------
    (S_dt, S_ct, C_ct)
        ``S_dt`` is the first value returned by ``dt_model.generate``.
        ``S_ct`` is a float array of event times in ascending order, each
        jittered uniformly within its ``dt``-wide bin. ``C_ct`` is an int
        array giving, for each event in ``S_ct``, the index of the column of
        ``S_dt`` it originated from.
    """
    # Create the model with these parameters
    # (`dtype=int` replaces the removed NumPy alias `np.int`; same dtype.)
    network_hypers = {'C': 1, 'kappa': 1., 'c': np.zeros(K, dtype=int), 'p': 1*np.ones((1,1)), 'v': 100.}
    bkgd_hypers = {"alpha": 3., "beta": 3./bias}
    dt_model = pyhawkes.models.\
        DiscreteTimeNetworkHawkesModelSpikeAndSlab(K=K, dt=dt, dt_max=dt_max, B=B,
                                                   bkgd_hypers=bkgd_hypers,
                                                   network_hypers=network_hypers)
    # dt_model.bias_model.lambda0 = bias * np.ones(K)
    assert dt_model.check_stability()

    S_dt,_ = dt_model.generate(T=int(np.ceil(T/dt)), keep=False)

    # The Python 2 print statement is a SyntaxError on Python 3. Formatting
    # one string keeps the exact same output (including the double space)
    # on both interpreter lines.
    print("sampled dataset with  {} events".format(S_dt.sum()))

    # Convert S_dt to continuous time: per-event bin indices for each column
    # (assumed ibincount behaviour -- confirm), scaled by dt, plus a uniform
    # offset inside the bin.
    S_ct = dt * np.concatenate([ibincount(S) for S in S_dt.T]).astype(float)
    S_ct += dt * np.random.rand(*S_ct.shape)
    assert np.all(S_ct < T)
    # One label per event, built column by column to line up with S_ct.
    C_ct = np.concatenate([k*np.ones(S.sum()) for k,S in enumerate(S_dt.T)]).astype(int)

    # Sort the data
    perm = np.argsort(S_ct)
    S_ct = S_ct[perm]
    C_ct = C_ct[perm]

    return S_dt, S_ct, C_ct
Exemple #4
0
def convert_discrete_to_continuous(S, dt):
    """
    Turn a binned event array into continuous-time events.

    Parameters
    ----------
    S : 2-D array
        Binned event counts; rows are time bins (``S.shape[0]`` of them) and
        each column is treated as a separate process.
    dt : float
        Width of one time bin.

    Returns
    -------
    (S_ct, C_ct, T)
        ``S_ct``: float event times in ascending order, each jittered
        uniformly inside its bin. ``C_ct``: int column index of ``S`` for
        each event, in the same order. ``T``: total duration,
        ``S.shape[0] * dt``.
    """
    from pybasicbayes.util.general import ibincount

    T = S.shape[0] * dt

    # Per-event bin indices, gathered column by column
    # (ibincount presumably repeats index i S[i, k] times -- confirm).
    bin_indices = []
    for column in S.T:
        bin_indices.append(ibincount(column))
    S_ct = dt * np.concatenate(bin_indices).astype(float)

    # Spread the events uniformly over their bins.
    S_ct += dt * np.random.rand(*S_ct.shape)
    assert np.all(S_ct < T)

    # Matching process labels, built in the same column order.
    labels = []
    for k, column in enumerate(S.T):
        labels.append(k * np.ones(column.sum()))
    C_ct = np.concatenate(labels).astype(int)

    # Return everything ordered by event time.
    order = np.argsort(S_ct)
    return S_ct[order], C_ct[order], T
Exemple #5
0
def convert_discrete_to_continuous(S, dt):
    """
    Convert binned event counts ``S`` into continuous-time events.

    Rows of ``S`` are time bins of width ``dt``; every column is handled as
    its own process. Each event receives a time drawn uniformly inside its
    bin.

    Returns a tuple ``(S_ct, C_ct, T)``: the event times sorted ascending
    (float), the column index of ``S`` for each of those events (int), and
    the total duration ``T = S.shape[0] * dt``.
    """
    from pybasicbayes.util.general import ibincount

    T = S.shape[0] * dt
    columns = S.T

    # Expand counts into one bin index per event
    # (assumed ibincount behaviour -- confirm), then scale to time units.
    per_column_bins = [ibincount(col) for col in columns]
    S_ct = dt * np.concatenate(per_column_bins).astype(float)

    # Uniform jitter within each bin.
    jitter = np.random.rand(*S_ct.shape)
    S_ct += dt * jitter
    assert np.all(S_ct < T)

    # Process label for every event, in the same column-major order.
    per_column_labels = [k * np.ones(col.sum()) for k, col in enumerate(columns)]
    C_ct = np.concatenate(per_column_labels).astype(int)

    # Sort both arrays by event time.
    by_time = np.argsort(S_ct)
    S_ct, C_ct = S_ct[by_time], C_ct[by_time]
    return S_ct, C_ct, T
Exemple #6
0
def downsample_data(X, n):
    """
    Downsample each row of X such that it sums to n by randomly removing entries.

    Parameters
    ----------
    X : 2-D array of counts
        One row per item; values are truncated to integers before sampling.
    n : int
        Target total for every row. Sampling is done without replacement,
        so n must not exceed the number of entries available in a row
        (``np.random.choice`` raises ValueError otherwise).

    Returns
    -------
    ndarray of float
        Array with the same shape as X in which every row sums to n.
    """
    from pybasicbayes.util.general import ibincount
    assert X.ndim == 2
    D, K = X.shape

    # Builtin `int` / `float` replace the `np.int` / `np.float` aliases,
    # which were removed from NumPy; the resulting dtypes are unchanged.
    Xsub = X.copy().astype(int)

    # `range` instead of the Python 2-only `xrange`.
    for d in range(D):
        # Expand the row's counts into one index per unit of count
        # (presumably what ibincount does -- confirm), keep n of them at
        # random, and fold the survivors back into per-column counts.
        xi = ibincount(Xsub[d])
        Xsub[d] = np.bincount(np.random.choice(xi, size=n, replace=False), minlength=K)

        assert Xsub[d].sum() == n

    return Xsub.astype(float)
Exemple #7
0
def downsample_data(X, n):
    """
    Downsample each row of X such that it sums to n by randomly removing entries.

    Parameters
    ----------
    X : 2-D array of counts
        One row per item; values are cast to integers before sampling.
    n : int
        Desired sum of every output row. Entries are drawn without
        replacement, so a row must offer at least n entries
        (``np.random.choice`` raises ValueError otherwise).

    Returns
    -------
    ndarray of float
        Same shape as X; each row sums to n.
    """
    from pybasicbayes.util.general import ibincount
    assert X.ndim == 2
    D, K = X.shape

    # `np.int` and `np.float` no longer exist in current NumPy; the builtin
    # types they aliased give the same dtypes.
    Xsub = X.copy().astype(int)

    for d in range(D):
        # Counts -> one index per unit of count (assumed ibincount behaviour
        # -- confirm) -> random subset of size n -> counts again.
        xi = ibincount(Xsub[d])
        Xsub[d] = np.bincount(np.random.choice(xi, size=n, replace=False), minlength=K)

        assert Xsub[d].sum() == n

    return Xsub.astype(float)
Exemple #8
0
# Script fragment: sample events from a discrete-time network Hawkes model,
# report its held-out log likelihood on that sample, convert the sample to
# continuous-time events and hand them to a continuous-time model.
# NOTE(review): K, dt and B are used below but not defined in this fragment;
# they must be set earlier in the script.
dt_max = 10.
T = 100.
network_hypers = {'kappa': 1., 'p': 1., 'v': 10.}
dt_model = pyhawkes.models.\
    DiscreteTimeNetworkHawkesModelSpikeAndSlab(K=K, dt=dt, dt_max=dt_max, B=B,
                                               network_hypers=network_hypers)
# Abort early if the sampled model is reported as unstable.
assert dt_model.check_stability()

# Number of time bins is T / dt rounded up; the second return value is unused.
S_dt, _ = dt_model.generate(T=int(np.ceil(T / dt)), keep=False)

print("sampled dataset with ", S_dt.sum(), "events")

print("DT LL: ", dt_model.heldout_log_likelihood(S_dt))

# Convert S_dt to continuous time: per-event bin indices for each column
# (presumably what ibincount returns -- confirm), scaled by dt, then
# jittered uniformly inside the bin.
S_ct = dt * np.concatenate([ibincount(S) for S in S_dt.T]).astype(float)
S_ct += dt * np.random.rand(*S_ct.shape)
assert np.all(S_ct < T)
# One label per event (the column index of S_dt), built in the same
# column-by-column order as S_ct.
C_ct = np.concatenate([k * np.ones(S.sum())
                       for k, S in enumerate(S_dt.T)]).astype(int)

# Sort the data by event time, applying the same permutation to the labels
perm = np.argsort(S_ct)
S_ct = S_ct[perm]
C_ct = C_ct[perm]

# NOTE(review): the continuous-time model is built with dt_max=1. while the
# discrete-time model above uses dt_max = 10. -- confirm this is intentional.
ct_model = pyhawkes.models.ContinuousTimeNetworkHawkesModel(
    K, dt_max=1., network_hypers=network_hypers)
ct_model.add_data(S_ct, C_ct, T)
# ct.resample_model()
# Script fragment: draw a sample from a discrete-time network Hawkes model,
# print its held-out log likelihood on that sample, then convert the sample
# to continuous-time events and add them to a continuous-time model.
# NOTE(review): K, dt and B are read here but defined outside this fragment.
dt_max = 10.
T = 100.
network_hypers = {'kappa': 1., 'p': 1., 'v': 10.}
dt_model = pyhawkes.models.\
    DiscreteTimeNetworkHawkesModelSpikeAndSlab(K=K, dt=dt, dt_max=dt_max, B=B,
                                               network_hypers=network_hypers)
# Stop if the sampled model is reported as unstable.
assert dt_model.check_stability()

# T / dt rounded up gives the number of time bins; second return value unused.
S_dt,_ = dt_model.generate(T=int(np.ceil(T/dt)), keep=False)

print("sampled dataset with ", S_dt.sum(), "events")

print("DT LL: ", dt_model.heldout_log_likelihood(S_dt))

# Convert S_dt to continuous time: expand each column into per-event bin
# indices (assumed ibincount behaviour -- confirm), scale by dt and add a
# uniform offset inside the bin.
S_ct = dt * np.concatenate([ibincount(S) for S in S_dt.T]).astype(float)
S_ct += dt * np.random.rand(*S_ct.shape)
assert np.all(S_ct < T)
# Column index of S_dt for every event, in the same order as S_ct.
C_ct = np.concatenate([k*np.ones(S.sum()) for k,S in enumerate(S_dt.T)]).astype(int)

# Sort the data by event time, keeping labels aligned
perm = np.argsort(S_ct)
S_ct = S_ct[perm]
C_ct = C_ct[perm]

# NOTE(review): dt_max=1. here differs from dt_max = 10. used for the
# discrete-time model above -- confirm this is intentional.
ct_model = pyhawkes.models.ContinuousTimeNetworkHawkesModel(K, dt_max=1.,
                                                            network_hypers=network_hypers)
ct_model.add_data(S_ct, C_ct, T)
# ct.resample_model()

# Hard code parameters