def run(n_documents=30, n_topics=5, n_vocabulary=10, n_words=50000,
        stochastic=False, maxiter=1000, seed=None):

    if seed is not None:
        np.random.seed(seed)

    (corpus, word_documents) = generate_data(n_documents,
                                             n_topics,
                                             n_vocabulary,
                                             n_words)

    if not stochastic:
        Q = model(n_documents=n_documents,
                  n_topics=n_topics,
                  n_vocabulary=n_vocabulary,
                  corpus=corpus,
                  word_documents=word_documents)
        Q.update(repeat=maxiter)
    else:
        subset_size = 1000
        Q = model(n_documents=n_documents,
                  n_topics=n_topics,
                  n_vocabulary=n_vocabulary,
                  corpus=corpus[:subset_size],
                  word_documents=word_documents[:subset_size],
                  plates_multiplier=n_words/subset_size)
        Q.ignore_bound_checks = True

        delay = 1
        forgetting_rate = 0.7

        for n in range(maxiter):
            # Observe a mini-batch
            subset = np.random.choice(n_words, subset_size)
            Q['words'].observe(corpus[subset])
            Q['word_documents'].set_value(word_documents[subset])
            # Learn intermediate variables
            Q.update('topics')
            # Set step length
            step = (n + delay) ** (-forgetting_rate)
            # Stochastic gradient for the global variables
            Q.gradient_step('p_topic', 'p_word', scale=step)

    bpplt.pyplot.figure()
    bpplt.pyplot.plot(Q.L)

    bpplt.pyplot.figure()
    bpplt.hinton(Q['p_topic'])
    bpplt.pyplot.title("Posterior topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(Q['p_word'])
    bpplt.pyplot.title("Posterior word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return
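# Hypothetical usage sketch for run() above (not from the original source):
# it assumes model() and generate_data() are defined in this module, and that
# numpy and bayespy.plot are imported as np and bpplt.  With stochastic=True,
# each iteration updates the local 'topics' node on a random mini-batch and
# takes a natural-gradient step on the global 'p_topic'/'p_word' nodes, so
# Q.L traces a noisy lower bound.
if __name__ == "__main__":
    # A small problem so the stochastic variant finishes quickly; the seed
    # is fixed for reproducibility.
    run(n_documents=10, n_topics=3, n_vocabulary=20, n_words=5000,
        stochastic=True, maxiter=200, seed=42)
    bpplt.pyplot.show()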
def _run(self, x, K=25, beta=0.5, alpha=0.00001, hinton_plot=False, end=False):
    '''Only to be used when doing parameter optimization.'''
    self.participant_list = x[0]

    N = len(x[0])           # number of data points (i.e. WCS participants)
    D = np.shape(x[1])[1]   # number of features
    # K = 20                # number of initial clusters

    R = Dirichlet(K*[alpha], name='R')
    Z = Categorical(R, plates=(N,1), name='Z')
    P = Beta([beta, beta], plates=(D,K), name='P')
    X = Mixture(Z, Bernoulli, P)

    Q = VB(Z, R, X, P)
    P.initialize_from_random()
    X.observe(x[1])
    Q.update(repeat=1000)

    log_likelihood = Q.L[Q.iter-1]

    if hinton_plot:
        bpplt.hinton(Z)
        bpplt.pyplot.show()
        bpplt.hinton(R)
        bpplt.pyplot.show()

    # Get the weight matrix stored in Z (weights determine which cluster
    # each data point belongs to)
    z = Z._message_to_child()[0]
    z = z * np.ones(Z.plates+(1,))
    z = np.squeeze(z)
    self.z = z

    # Get the weights stored in R (proportional to the size of the clusters)
    r = np.exp(R._message_to_child()[0])
    r = r * np.ones(R.plates+(1,))
    r = np.squeeze(r)
    self.r = r

    # Get the cluster assignment of each data point
    self.c_assign = np.argmax(self.z, axis=1)

    return log_likelihood
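# Hypothetical parameter sweep using _run() above (not in the original).
# The docstring says it is meant for parameter optimization; the instance
# name `clusterer` and the data tuple `x` are assumptions, only _run's
# signature comes from this excerpt:
#
#     for alpha in (1e-5, 1e-3, 1e-1):
#         ll = clusterer._run(x, K=25, beta=0.5, alpha=alpha)
#         print(alpha, ll)   # pick the alpha with the highest lower bound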
def generate_data(n_documents, n_topics, n_vocabulary, n_words):
    # Generate random data from the generative model

    # Generate document assignments for the words
    word_documents = nodes.Categorical(np.ones(n_documents)/n_documents,
                                       plates=(n_words,)).random()

    # Topic distribution for each document
    p_topic = nodes.Dirichlet(1e-1*np.ones(n_topics),
                              plates=(n_documents,)).random()

    # Word distribution for each topic
    p_word = nodes.Dirichlet(1e-1*np.ones(n_vocabulary),
                             plates=(n_topics,)).random()

    # Topic for each word in each document
    topic = nodes.Categorical(p_topic[word_documents],
                              plates=(n_words,)).random()

    # Each word in each document
    corpus = nodes.Categorical(p_word[topic],
                               plates=(n_words,)).random()

    bpplt.pyplot.figure()
    bpplt.hinton(p_topic)
    bpplt.pyplot.title("True topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(p_word)
    bpplt.pyplot.title("True word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return (corpus, word_documents)
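# Hypothetical smoke test for generate_data() above (not in the original),
# assuming bayespy nodes and bayespy.plot are imported as `nodes` and `bpplt`.
# .random() on a Categorical node draws integer samples with the node's plate
# shape, so both outputs are 1-D arrays of length n_words.  Note the function
# also opens two Hinton-diagram figures as a side effect.
corpus, word_documents = generate_data(n_documents=10, n_topics=3,
                                       n_vocabulary=8, n_words=1000)
print(corpus.shape, word_documents.shape)   # expected: (1000,) (1000,)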
import numpy as np

from bayespy.nodes import (Dirichlet, CategoricalMarkovChain, Gaussian,
                           Mixture)
from bayespy.inference import VB
import bayespy.plot as bpplt

# Cluster means and a shared isotropic noise level
mu = np.array([[0, 0], [3, 4], [6, 0]])
std = 2.0

K = 3    # number of hidden states
N = 200  # length of the chain

# Initial state distribution and a sticky transition matrix: stay with
# probability q, move to either other state with probability r.
p0 = np.ones(K) / K
q = 0.9
r = (1 - q) / (K - 1)
P = q * np.identity(K) + r * (np.ones((K, K)) - np.identity(K))

# Simulate data from the hidden Markov model
y = np.zeros((N, 2))
z = np.zeros(N)
state = np.random.choice(K, p=p0)
for n in range(N):
    z[n] = state
    y[n, :] = std * np.random.randn(2) + mu[state]
    state = np.random.choice(K, p=P[state])

# Priors for the initial state and the transition probabilities
a0 = Dirichlet(1e-3 * np.ones(K))
A = Dirichlet(1e-3 * np.ones((K, K)))
Z = CategoricalMarkovChain(a0, A, states=N)

# Known observation precision; emissions are a Gaussian mixture over states
Lambda = std**(-2) * np.identity(2)
Y = Mixture(Z, Gaussian, mu, Lambda)

Y.observe(y)
Q = VB(Y, Z, A, a0)
Q.update(repeat=1000)

bpplt.hinton(A)
bpplt.pyplot.show()
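# Hypothetical convergence check (not in the original): Q.L holds the VB
# lower-bound values per iteration (it is plotted the same way in the LDA
# run() above), so the curve should level off once the sticky transition
# structure has been recovered in A.
bpplt.pyplot.figure()
bpplt.pyplot.plot(Q.L)
bpplt.pyplot.title("VB lower bound")
bpplt.pyplot.xlabel("Iteration")
bpplt.pyplot.show()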
def run(self, K=25, beta=0.5, alpha=0.00001, foci_thresh=0, num_neigh=4,
        hinton_plot=False, end=False):
    '''Performs one run of the BBDP according to the specified parameters.'''

    print("Transforming WCS participant data into binary vectors...")
    x = u.transform_data_all(self.langs, norm=False, end=end, foci=True,
                             foci_thresh=foci_thresh, num_neigh=num_neigh)
    print("Finished transforming participant data")

    self.participant_list = x[0]

    N = len(x[0])           # number of data points (i.e. WCS participants)
    D = np.shape(x[1])[1]   # number of features
    # K = 20                # number of initial clusters

    R = Dirichlet(K*[alpha], name='R')
    Z = Categorical(R, plates=(N,1), name='Z')
    P = Beta([beta, beta], plates=(D,K), name='P')
    X = Mixture(Z, Bernoulli, P)

    Q = VB(Z, R, X, P)
    P.initialize_from_random()
    X.observe(x[1])
    Q.update(repeat=1000)

    if hinton_plot:
        bpplt.hinton(Z)
        bpplt.pyplot.show()
        bpplt.hinton(R)
        bpplt.pyplot.show()

    # Get the weight matrix stored in Z (weights determine which cluster
    # each data point belongs to)
    z = Z._message_to_child()[0]
    z = z * np.ones(Z.plates+(1,))
    z = np.squeeze(z)
    self.z = z

    # Get the weights stored in R (proportional to the size of the clusters)
    r = np.exp(R._message_to_child()[0])
    r = r * np.ones(R.plates+(1,))
    r = np.squeeze(r)
    self.r = r

    # Get the cluster assignment of each data point
    self.c_assign = np.argmax(self.z, axis=1)

    # Write cluster results to a file
    if self.write_to_file:
        if end:
            save_path = "cluster_results_end_K={}_B={}_a={}_t={}_nn={}".format(
                K, beta, alpha, foci_thresh, num_neigh)
        else:
            save_path = "cluster_results_K={}_B={}_a={}_t={}_nn={}".format(
                K, beta, alpha, foci_thresh, num_neigh)

        while path.exists(save_path+".txt"):  # save_path already exists
            try:
                old_file_num = int(save_path[save_path.find('(')+1:-1])
                new_file_num = old_file_num + 1
                save_path = (save_path[0:save_path.find('(')] + '(' +
                             str(new_file_num) + ')')
            except ValueError:
                save_path = save_path + " (1)"
        self.save_path = save_path

        file = open(path.abspath(self.save_path+".txt"), 'w')

        # Write cluster assignment matrix Z (gives the probability that
        # observation i belongs to cluster j).  Note: the comprehension
        # variable is named `val` to avoid shadowing the data tuple `x`.
        if 'Z' not in self.in_file:
            for i in range(len(self.z)):
                line = "\t".join([str(val) for val in self.z[i]]) + "\n"
                file.write(line)
            file.write('---Z\n')
            self.in_file.append('Z')

        # Write cluster weights matrix R (proportional to the size of the
        # resulting clusters)
        if 'R' not in self.in_file:
            line = "\t".join([str(val) for val in self.r]) + "\n"
            file.write(line)
            file.write('---R\n')
            self.in_file.append('R')

        # Write deterministic cluster assignments with the corresponding
        # participant key
        if 'C' not in self.in_file:
            line1 = "\t".join([str(val) for val in self.participant_list]) + "\n"
            line2 = "\t".join([str(val) for val in self.c_assign]) + "\n"
            file.write(line1)
            file.write(line2)
            file.write('---C\n')
            self.in_file.append('C')

        file.close()

    return self.c_assign
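# Hypothetical usage (not in the original): the enclosing class is not shown
# in this excerpt, so the constructor name and the attributes it must set
# (langs, write_to_file, in_file) are assumptions based on how run() uses
# self:
#
#     clusterer = BBDP(langs=wcs_langs)      # hypothetical constructor
#     assignments = clusterer.run(K=25, beta=0.5, alpha=1e-5,
#                                 hinton_plot=True)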
def test_hinton_plot_dirichlet():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(R)
# ( 5 : Parameter expansion, for when convergence is slow )
# from bayespy.inference.vmp import transformations
# rotX = transformations.RotateGaussianARD(X)
# rotC = transformations.RotateGaussianARD(C, alpha)
# R = transformations.RotationOptimizer(rotC, rotX, D)
# R.rotate()
# alpha.initialize_from_prior()
# C.initialize_from_prior()
# X.initialize_from_parameters(np.random.randn(1, 100, D), 10)
# tau.initialize_from_prior()
# Q = VB(Y, C, X, alpha, tau)
# Q.callback = R.rotate
# Q.update(repeat=1000, tol=1e-6)

# ----- Examining the results -----

# Plotting the results
bpplt.pyplot.figure()
bpplt.pdf(Q['tau'], np.linspace(60, 140, num=100))

V = Gaussian([3, 5], [[4, 2], [2, 5]])
bpplt.pyplot.figure()
bpplt.contour(V, np.linspace(1, 5, num=100), np.linspace(3, 7, num=100))

bpplt.pyplot.figure()
bpplt.hinton(C)

bpplt.pyplot.figure()
bpplt.plot(X, axis=-2)

bpplt.pyplot.show()
import numpy
numpy.random.seed(1)

# Three true cluster prototypes over D=10 binary features
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]

import numpy as np
p = np.array([p0, p1, p2])

from bayespy.utils import random
z = random.categorical([1/3, 1/3, 1/3], size=100)
x = random.bernoulli(p[z])

N = 100  # number of data vectors
D = 10   # dimensionality of each vector
K = 10   # number of mixture clusters (upper bound)

from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')

from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')

from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)

from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)

import bayespy.plot as bpplt
bpplt.hinton(R)
bpplt.pyplot.show()
import numpy
numpy.random.seed(1)

# Three true cluster prototypes over D=10 binary features
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]

import numpy as np
p = np.array([p0, p1, p2])

from bayespy.utils import random
z = random.categorical([1/3, 1/3, 1/3], size=100)
x = random.bernoulli(p[z])

N = 100  # number of data vectors
D = 10   # dimensionality of each vector
K = 10   # number of mixture clusters (upper bound)

from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')

from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')

from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)

from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)

import bayespy.plot as bpplt
bpplt.hinton(Z)
bpplt.pyplot.show()
def test_hinton_plot_beta():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(P)
def test_hinton_plot_categorical():
    (R, P, Z) = _setup_bernoulli_mixture()
    bpplt.hinton(Z)
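# _setup_bernoulli_mixture() is not shown in this excerpt.  A minimal,
# hypothetical sketch consistent with how the tests above use it (it only
# needs to return Dirichlet, Beta, and Categorical nodes for plotting); the
# sizes N, D, K are assumptions matching the mixture scripts in this file:
def _setup_bernoulli_mixture():
    from bayespy.nodes import Categorical, Dirichlet, Beta
    N, D, K = 100, 10, 3
    R = Dirichlet(K * [1e-5], name='R')
    Z = Categorical(R, plates=(N, 1), name='Z')
    P = Beta([0.5, 0.5], plates=(D, K), name='P')
    return (R, P, Z)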
# Same Bernoulli mixture as above, but with K=3 clusters; assumes the data x
# and the true parameters p have been generated as in the earlier snippets.
N = 100
D = 10
K = 3

from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')

from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')

from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)

from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)

import bayespy.plot as bpplt
bpplt.pyplot.figure()
bpplt.hinton(R)
bpplt.pyplot.figure()
bpplt.hinton(P)
bpplt.pyplot.figure()
bpplt.hinton(p)   # true mixture parameters, for comparison
bpplt.pyplot.figure()
bpplt.hinton(Z)
Q.update(repeat=1000)

#print(" P:")
#print(P.get_moments())
#print(" R:")
#print(R.get_moments())
print(" Z:")
print(Z.get_moments())
print(" X:")
print(X.get_moments())

bpplt.hinton(R)
#bpplt.hinton(P)
#bpplt.hinton(Z)
bpplt.pyplot.show()

#pp = pprint.PrettyPrinter(indent=4)
#pp.pprint(X)
import numpy
numpy.random.seed(1)

# Three true cluster prototypes over D=10 binary features
p0 = [0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9]
p1 = [0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9]
p2 = [0.9, 0.9, 0.9, 0.9, 0.9, 0.1, 0.1, 0.1, 0.1, 0.1]

import numpy as np
p = np.array([p0, p1, p2])

from bayespy.utils import random
z = random.categorical([1/3, 1/3, 1/3], size=100)
x = random.bernoulli(p[z])

N = 100  # number of data vectors
D = 10   # dimensionality of each vector
K = 10   # number of mixture clusters (upper bound)

from bayespy.nodes import Categorical, Dirichlet
R = Dirichlet(K * [1e-5], name='R')
Z = Categorical(R, plates=(N, 1), name='Z')

from bayespy.nodes import Beta
P = Beta([0.5, 0.5], plates=(D, K), name='P')

from bayespy.nodes import Mixture, Bernoulli
X = Mixture(Z, Bernoulli, P)

from bayespy.inference import VB
Q = VB(Z, R, X, P)
P.initialize_from_random()
X.observe(x)
Q.update(repeat=1000)

import bayespy.plot as bpplt
bpplt.hinton(P)
bpplt.pyplot.show()
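# Hypothetical follow-up (not in the original): recover hard cluster
# assignments from the posterior of Z.  get_moments()[0] of a Categorical
# node gives the posterior cluster probabilities, here with shape (N, 1, K);
# this mirrors the Z._message_to_child()[0] extraction in the WCS methods
# above, but via the public API.
weights = np.squeeze(Z.get_moments()[0])
assignments = np.argmax(weights, axis=-1)
print(assignments[:10])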