Example 1
def run(n_documents=30,
        n_topics=5,
        n_vocabulary=10,
        n_words=50000,
        stochastic=False,
        maxiter=1000,
        seed=None):

    if seed is not None:
        np.random.seed(seed)

    (corpus, word_documents) = generate_data(n_documents, n_topics, n_vocabulary, n_words)

    if not stochastic:

        Q = model(n_documents=n_documents, n_topics=n_topics, n_vocabulary=n_vocabulary,
                  corpus=corpus, word_documents=word_documents)

        Q.update(repeat=maxiter)

    else:

        subset_size = 1000

        Q = model(n_documents=n_documents, n_topics=n_topics, n_vocabulary=n_vocabulary,
                  corpus=corpus[:subset_size], word_documents=word_documents[:subset_size],
                  plates_multiplier=n_words/subset_size)

        Q.ignore_bound_checks = True
        delay = 1
        forgetting_rate = 0.7
        for n in range(maxiter):

            # Observe a mini-batch
            subset = np.random.choice(n_words, subset_size)
            Q['words'].observe(corpus[subset])
            Q['word_documents'].set_value(word_documents[subset])

            # Learn intermediate variables
            Q.update('topics')

            # Set step length
            step = (n + delay) ** (-forgetting_rate)

            # Stochastic gradient for the global variables
            Q.gradient_step('p_topic', 'p_word', scale=step)

        bpplt.pyplot.figure()
        bpplt.pyplot.plot(Q.L)


    bpplt.pyplot.figure()
    bpplt.hinton(Q['p_topic'])
    bpplt.pyplot.title("Posterior topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(Q['p_word'])
    bpplt.pyplot.title("Posterior word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return
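This run function assumes a generate_data helper (shown in Example 3) and a model factory that returns a VB engine whose nodes are named 'words', 'word_documents', 'topics', 'p_topic' and 'p_word'. A minimal sketch of such a factory, patterned on the BayesPy LDA demo; the Constant/CategoricalMoments wrapper is an assumption based on that demo and is what lets Q['word_documents'].set_value work for mini-batches:

import numpy as np
from bayespy import nodes
from bayespy.inference import VB
from bayespy.inference.vmp.nodes.categorical import CategoricalMoments

def model(n_documents, n_topics, n_vocabulary, corpus, word_documents,
          plates_multiplier=1):
    # Topic distribution for each document
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents,),
                              name='p_topic')

    # Word distribution for each topic
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics,),
                             name='p_word')

    # Document assignment of each word, wrapped in a constant node so that
    # its value can be replaced for each mini-batch
    word_documents = nodes.Constant(CategoricalMoments(n_documents),
                                    word_documents,
                                    name='word_documents')

    # Topic of each word, gated by its document's topic distribution
    topics = nodes.Categorical(nodes.Gate(word_documents, p_topic),
                               plates=(len(corpus),),
                               plates_multiplier=(plates_multiplier,),
                               name='topics')

    # Observed words, gated by their topic's word distribution
    words = nodes.Categorical(nodes.Gate(topics, p_word),
                              name='words')
    words.observe(corpus)

    return VB(words, topics, p_word, p_topic, word_documents)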
Example 2
    def _run(self, x, K=25, beta=0.5, alpha=0.00001, hinton_plot=False, end=False):
        '''Only to be used when doing parameter optimization.'''

        self.participant_list = x[0]
        
        N = len(x[0])            #number of data points (i.e. WCS participants)
        D = np.shape(x[1])[1]    #number of features
        #K = 20            #number of initial clusters
        
        R = Dirichlet(K*[alpha],
                      name='R')
        Z = Categorical(R,
                        plates=(N,1),
                        name='Z')
        
        P = Beta([beta, beta],
                 plates=(D,K),
                 name='P')
        
        X = Mixture(Z, Bernoulli, P)
        
        Q = VB(Z, R, X, P)
        P.initialize_from_random()
        X.observe(x[1])
        Q.update(repeat=1000)

        # Final value of the variational lower bound (stored by VB in Q.L)
        log_likelihood = Q.L[Q.iter-1]

        if hinton_plot:
            bpplt.hinton(Z)
            bpplt.pyplot.show()
            
            bpplt.hinton(R)
            bpplt.pyplot.show()

        #Get the weight matrix stored in Z (the weights determine which cluster each data point belongs to)
        z = Z._message_to_child()[0]
        z = z * np.ones(Z.plates+(1,))
        z = np.squeeze(z)
        self.z = z

        #Get the weights stored in R (proportional to the size of the clusters)
        r = np.exp(R._message_to_child()[0])
        r = r * np.ones(R.plates+(1,))
        r = np.squeeze(r)
        self.r = r

        #Get the cluster assignment of each data point
        self.c_assign = np.argmax(self.z, axis=1)

        return log_likelihood
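Since _run returns the final lower bound, it can drive a simple hyperparameter search. A hypothetical sweep; clusterer and x stand in for an instance of the surrounding class and its transformed data, neither of which is defined here:

# Hypothetical grid search over beta and alpha using the returned bound
results = {}
for beta in (0.1, 0.5, 1.0):
    for alpha in (1e-5, 1e-3, 1e-1):
        results[(beta, alpha)] = clusterer._run(x, K=25, beta=beta, alpha=alpha)
best_beta, best_alpha = max(results, key=results.get)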
Example 3
def generate_data(n_documents, n_topics, n_vocabulary, n_words):

    # Generate random data from the generative model

    # Generate document assignments for the words
    word_documents = nodes.Categorical(np.ones(n_documents)/n_documents,
                                       plates=(n_words,)).random()

    # Topic distribution for each document
    p_topic = nodes.Dirichlet(1e-1*np.ones(n_topics),
                              plates=(n_documents,)).random()

    # Word distribution for each topic
    p_word = nodes.Dirichlet(1e-1*np.ones(n_vocabulary),
                             plates=(n_topics,)).random()

    # Topic for each word in each document
    topic = nodes.Categorical(p_topic[word_documents],
                              plates=(n_words,)).random()

    # Each word in each document
    corpus = nodes.Categorical(p_word[topic],
                               plates=(n_words,)).random()

    bpplt.pyplot.figure()
    bpplt.hinton(p_topic)
    bpplt.pyplot.title("True topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(p_word)
    bpplt.pyplot.title("True word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return (corpus, word_documents)
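Called on its own (a usage sketch; the function assumes "from bayespy import nodes" and "import bayespy.plot as bpplt" at module level), the helper returns the data and draws Hinton diagrams of the true distributions:

corpus, word_documents = generate_data(n_documents=30, n_topics=5,
                                       n_vocabulary=10, n_words=50000)
bpplt.pyplot.show()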
Example 5
import numpy as np

mu = np.array([[0, 0], [3, 4], [6, 0]])
std = 2.0
K = 3
N = 200
p0 = np.ones(K) / K
q = 0.9
r = (1 - q) / (K - 1)
P = q * np.identity(K) + r * (np.ones((K, K)) - np.identity(K))
y = np.zeros((N, 2))
z = np.zeros(N, dtype=int)
state = np.random.choice(K, p=p0)
for n in range(N):
    z[n] = state
    y[n, :] = std * np.random.randn(2) + mu[state]
    state = np.random.choice(K, p=P[state])
from bayespy.nodes import Dirichlet, CategoricalMarkovChain

a0 = Dirichlet(1e-3 * np.ones(K))
A = Dirichlet(1e-3 * np.ones((K, K)))
Z = CategoricalMarkovChain(a0, A, states=N)
Lambda = std**(-2) * np.identity(2)
from bayespy.nodes import Gaussian, Mixture

Y = Mixture(Z, Gaussian, mu, Lambda)
Y.observe(y)
from bayespy.inference import VB

Q = VB(Y, Z, A, a0)
Q.update(repeat=1000)
import bayespy.plot as bpplt

bpplt.hinton(A)
bpplt.pyplot.show()
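To read the learned transition probabilities as numbers rather than a Hinton diagram, the Dirichlet moment trick from Example 2 applies here as well. A sketch; _message_to_child is an internal BayesPy API whose first Dirichlet moment is E[log p], so exponentiating it only approximately recovers the probabilities:

# Approximate posterior transition matrix (rows need not sum exactly to 1)
A_est = np.exp(A._message_to_child()[0])
print(A_est)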
Example 6
    def run(self, K=25, beta=0.5, alpha=0.00001, foci_thresh=0, num_neigh=4, hinton_plot=False, end=False):
        '''Performs one run of the BBDP according to the specified parameters.'''

        print("Transforming WCS participant data into binary vectors...")
        x = u.transform_data_all(self.langs, norm=False, end=end, foci=True, foci_thresh=foci_thresh, num_neigh=num_neigh)
        print("Finished transforming participant data") 
        self.participant_list = x[0]
        
        N = len(x[0])            #number of data points (i.e. WCS participants)
        D = np.shape(x[1])[1]    #number of features
        #K = 20            #number of initial clusters
        
        R = Dirichlet(K*[alpha],
                      name='R')
        Z = Categorical(R,
                        plates=(N,1),
                        name='Z')
        
        P = Beta([beta, beta],
                 plates=(D,K),
                 name='P')
        
        X = Mixture(Z, Bernoulli, P)
        
        Q = VB(Z, R, X, P)
        P.initialize_from_random()
        X.observe(x[1])
        Q.update(repeat=1000)

        if hinton_plot:
            bpplt.hinton(Z)
            bpplt.pyplot.show()
            
            bpplt.hinton(R)
            bpplt.pyplot.show()

        #Get the weight matrix stored in Z (the weights determine which cluster each data point belongs to)
        z = Z._message_to_child()[0]
        z = z * np.ones(Z.plates+(1,))
        z = np.squeeze(z)
        self.z = z

        #Get the weights stored in R (proportional to the size of the clusters)
        r = np.exp(R._message_to_child()[0])
        r = r * np.ones(R.plates+(1,))
        r = np.squeeze(r)
        self.r = r

        #Get the cluster assignment of each data point
        self.c_assign = np.argmax(self.z, axis=1)

        #Write cluster results to a file
        if self.write_to_file:
            if end:
                save_path = "cluster_results_end_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)
            else:
                save_path = "cluster_results_K={}_B={}_a={}_t={}_nn={}".format(K, beta, alpha, foci_thresh, num_neigh)
            while path.exists(save_path+".txt"):
                #save_path already exists
                try:
                    old_file_num = int(save_path[save_path.find('(')+1:-1])
                    new_file_num = old_file_num + 1
                    save_path = save_path[0:save_path.find('(')] + '(' + str(new_file_num) + ')'
                except ValueError:
                    save_path = save_path + " (1)"

            self.save_path = save_path       
            file = open(path.abspath(self.save_path+".txt"), 'w')
            
            #Write cluster assignment matrix Z (gives the probability that observation i belongs to cluster j)
            if 'Z' not in self.in_file:
                for i in range(len(self.z)):
                    line = "\t".join([str(x) for x in self.z[i]]) + "\n"
                    file.write(line)
                file.write('---Z\n')
                self.in_file.append('Z')

            #Write cluster weights matrix R (proportional to the size of the resulting clusters)
            if 'R' not in self.in_file:
                line = "\t".join([str(x) for x in self.r]) + "\n"
                file.write(line)
                file.write('---R\n')
                self.in_file.append('R')

            #Write deterministic cluster assignments with the corresponding participant key
            if 'C' not in self.in_file:
                line1 = "\t".join([str(x) for x in self.participant_list]) + "\n"
                line2 = "\t".join([str(x) for x in self.c_assign]) + "\n"              
                file.write(line1)
                file.write(line2)
                file.write('---C\n')
                self.in_file.append('C')
            
            file.close()

        return self.c_assign
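The results file written above consists of tab-separated rows terminated by ---Z, ---R and ---C section markers. A minimal reader for that format (a sketch, assuming the file was produced by a single run of the method above):

def read_cluster_results(filename):
    # Collect tab-separated rows into sections keyed by their '---' marker
    sections = {}
    rows = []
    with open(filename) as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith("---"):
                sections[line[3:]] = rows
                rows = []
            else:
                rows.append(line.split("\t"))
    return sections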
Example 7
def test_hinton_plot_dirichlet():
    (R,P,Z) = _setup_bernoulli_mixture()
    bpplt.hinton(R)
# (5: Parameter expansion, for when convergence is slow)
# from bayespy.inference.vmp import transformations
# rotX = transformations.RotateGaussianARD(X)
# rotC = transformations.RotateGaussianARD(C, alpha)
# R = transformations.RotationOptimizer(rotC, rotX, D)
# R.rotate()
# alpha.initialize_from_prior()
# C.initialize_from_prior()
# X.initialize_from_parameters(np.random.randn(1, 100, D), 10)
# tau.initialize_from_prior()
# Q = VB(Y, C, X, alpha, tau)
# Q.callback = R.rotate
# Q.update(repeat=1000, tol=1e-6)

#  -----Examining the results-----
# Plotting the results (Q, C, X and tau here come from the PCA example that
# the commented-out parameter-expansion block above refers to)
bpplt.pyplot.figure()
bpplt.pdf(Q['tau'], np.linspace(60, 140, num=100))

from bayespy.nodes import Gaussian
V = Gaussian([3, 5], [[4, 2], [2, 5]])
bpplt.pyplot.figure()
bpplt.contour(V, np.linspace(1, 5, num=100), np.linspace(3, 7, num=100))

bpplt.pyplot.figure()
bpplt.hinton(C)

bpplt.pyplot.figure()
bpplt.plot(X, axis=-2)

bpplt.pyplot.show()
Example 9
import numpy
numpy.random.seed(1)
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
import numpy as np
p = np.array([p0, p1, p2])
from bayespy.utils import random
z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
x = random.bernoulli(p[z])
N = 100
D = 10
K = 10
from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')
from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')
from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)
from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.hinton(R)
bpplt.pyplot.show()
Example 10
import numpy
numpy.random.seed(1)
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
import numpy as np
p = np.array([p0, p1, p2])
from bayespy.utils import random
z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
x = random.bernoulli(p[z])
N = 100
D = 10
K = 10
from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')
from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')
from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)
from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.hinton(Z)
bpplt.pyplot.show()
Example 11
def test_hinton_plot_beta():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(P)
Example 12
def test_hinton_plot_dirichlet():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(R)
Example 13
def test_hinton_plot_categorical():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(Z)
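The _setup_bernoulli_mixture fixture used by the tests in Examples 11-13 is not shown. A sketch of what it plausibly builds, following the Bernoulli mixture construction of Examples 9 and 10 (the exact fixture in the BayesPy test suite may differ):

from bayespy.nodes import Beta, Bernoulli, Categorical, Dirichlet, Mixture

def _setup_bernoulli_mixture(N=100, D=10, K=10):
    # Hypothetical fixture: the same mixture model as in Examples 9 and 10
    R = Dirichlet(K * [1e-5], name='R')
    Z = Categorical(R, plates=(N, 1), name='Z')
    P = Beta([0.5, 0.5], plates=(D, K), name='P')
    X = Mixture(Z, Bernoulli, P)
    return (R, P, Z)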
Example 14
N = 100
D = 10
K = 3

from bayespy.nodes import Categorical, Dirichlet

R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')

from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')

from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)

from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)

Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.pyplot.figure()
bpplt.hinton(R)
bpplt.pyplot.figure()
bpplt.hinton(P)
bpplt.pyplot.figure()
bpplt.hinton(p)
bpplt.pyplot.figure()
bpplt.hinton(Z)
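The names x and p are not defined in this example; they are the observed binary data and the true Bernoulli probabilities from the generation step shown in Examples 9 and 10. A minimal setup:

import numpy as np
from bayespy.utils import random

# True Bernoulli probabilities of the three clusters (as in Examples 9 and 10)
p = np.array([[0.1, 0.9] * 5,
              [0.1] * 5 + [0.9] * 5,
              [0.9] * 5 + [0.1] * 5])
z = random.categorical([1/3, 1/3, 1/3], size=100)  # true cluster assignments
x = random.bernoulli(p[z])                         # observed binary data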
Example 16
Q.update(repeat=1000)

#print(" P:")
#print( P.get_moments() )

#print(" R:")
#print( R.get_moments() )

print(" Z:")
print( Z.get_moments() )

print(" X:")
print( X.get_moments() )


bpplt.hinton(R)
#bpplt.hinton(P)
#bpplt.hinton(Z)

bpplt.pyplot.show()

#pp = pprint.PrettyPrinter(indent=4)

#pp.pprint(X)
Example 18
import numpy
numpy.random.seed(1)
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]
import numpy as np
p = np.array([p0, p1, p2])
from bayespy.utils import random
z = random.categorical([1 / 3, 1 / 3, 1 / 3], size=100)
x = random.bernoulli(p[z])
N = 100
D = 10
K = 10
from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')
from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')
from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)
from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)
import bayespy.plot as bpplt
bpplt.hinton(P)
bpplt.pyplot.show()