import numpy as np
from bayespy import nodes
import bayespy.plot as bpplt


def generate_data(n_documents, n_topics, n_vocabulary, n_words):
    """Generate a random corpus from the LDA generative model."""

    # Generate document assignments for the words
    word_documents = nodes.Categorical(np.ones(n_documents) / n_documents,
                                       plates=(n_words,)).random()

    # Topic distribution for each document
    p_topic = nodes.Dirichlet(1e-1 * np.ones(n_topics),
                              plates=(n_documents,)).random()

    # Word distribution for each topic
    p_word = nodes.Dirichlet(1e-1 * np.ones(n_vocabulary),
                             plates=(n_topics,)).random()

    # Topic for each word in each document
    topic = nodes.Categorical(p_topic[word_documents],
                              plates=(n_words,)).random()

    # Each word in each document
    corpus = nodes.Categorical(p_word[topic],
                               plates=(n_words,)).random()

    # Plot the true distributions as Hinton diagrams
    bpplt.pyplot.figure()
    bpplt.hinton(p_topic)
    bpplt.pyplot.title("True topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(p_word)
    bpplt.pyplot.title("True word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return (corpus, word_documents)
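# Usage note (added; sizes are illustrative, not from the original). For
# example,
#     corpus, word_documents = generate_data(10, 5, 100, 1000)
# draws 1000 word indices into a 100-word vocabulary, assigns each word to
# one of 10 documents, and plots Hinton diagrams of the true per-document
# topic and per-topic word distributions as a side effect. A runnable
# end-to-end demo appears after the model() function below.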
import numpy as np
from bayespy import nodes
from bayespy.inference import VB
from bayespy.utils import random


def run(M=30, D=5):
    """Fit a Dirichlet-categorical model to data with randomly missing values."""

    # Generate data
    y = np.random.randint(D, size=(M,))

    # Construct model
    p = nodes.Dirichlet(1 * np.ones(D), name='p')
    z = nodes.Categorical(p, plates=(M,), name='z')

    # Observe the data with randomly missing values
    mask = random.mask(M, p=0.5)
    z.observe(y, mask=mask)

    # Run VB-EM
    Q = VB(p, z)
    Q.update()

    # Show results
    z.show()
    p.show()
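# Minimal usage sketch (added): with the defaults, roughly half of the M
# observations are hidden by the random mask, so the posterior of 'p' is
# learned from the visible observations only while the masked entries of 'z'
# are inferred from that posterior.
if __name__ == '__main__':
    run(M=30, D=5)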
print("++++++++++++++++++++++++++") N = 10000 y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1]) a0 = [0.5, 0.1, 0.1] mu0 = -1 lambda0 = 5 #MU = bayes.Gaussian(mu=mu0, Lambda=0.9) #X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, )) P = bayes.Dirichlet(a0) X = bayes.Categorical(P, plates=(N, )) #P.initialize_from_random() Q = VB(X, P) X.observe(y) Q.update(repeat=1000) print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])) print(P.random()) #print(np.sum(y==2))
def create_model(self, model_type=None):
    # Create a location model for each time zone (hour of day)
    location_model = []

    if model_type == 'all':
        # One Dirichlet prior, with a learned concentration, shared across
        # all 24 hours
        p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_theta = nodes.Dirichlet(p_conc,
                                  plates=(self.N_TIMEZONES,),
                                  name='p_theta')

        for time in np.arange(self.N_TIMEZONES):
            model = nodes.Categorical(p_theta[time],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe the data for this hour
            timezone_observations = self._observed_locations[
                self._observed_locations['time'] == time]
            if not timezone_observations.empty:
                data = timezone_observations['location'].to_numpy().reshape(
                    (self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        Q = VB(*location_model, p_theta, p_conc)

    elif model_type == 'cross':
        raise NotImplementedError("'cross' model_type is not implemented")

    elif model_type == '2fold':
        # Separate priors for morning (hours 6-18) and night (hours 0-5 and
        # 19-23)
        p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

        morning_time = np.arange(6, 19)
        night_time = np.append(np.arange(0, 6), np.arange(19, 24))

        p_theta_morning = nodes.Dirichlet(p_conc_morning,
                                          plates=(morning_time.size,),
                                          name='p_theta_morning')
        p_theta_night = nodes.Dirichlet(p_conc_night,
                                        plates=(night_time.size,),
                                        name='p_theta_night')

        # Combining morning-time hours
        for count, time in enumerate(morning_time):
            model = nodes.Categorical(p_theta_morning[count],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe the data for this hour
            timezone_observations = self._observed_locations[
                self._observed_locations['time'] == time]
            #print(timezone_observations)
            if not timezone_observations.empty:
                data = timezone_observations['location'].to_numpy().reshape(
                    (self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        # Combining night-time hours
        for count, time in enumerate(night_time):
            model = nodes.Categorical(p_theta_night[count],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe the data for this hour
            timezone_observations = self._observed_locations[
                self._observed_locations['time'] == time]
            if not timezone_observations.empty:
                data = timezone_observations['location'].to_numpy().reshape(
                    (self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        Q = VB(*location_model,
               p_theta_morning, p_theta_night,
               p_conc_morning, p_conc_night)

    else:
        raise ValueError('no model_type selected')

    print("models created")

    ####################################################################################
    # Learning parameters
    Q.update(repeat=1000)
    print('learned params')
    ####################################################################################

    if model_type == 'all':
        return np.array(p_theta.get_parameters()).reshape(
            (self.N_TIMEZONES, self.N_LOCATIONS))
    elif model_type == '2fold':
        learned_night = np.array(p_theta_night.get_parameters()).reshape(
            (night_time.size, self.N_LOCATIONS))
        learned_morn = np.array(p_theta_morning.get_parameters()).reshape(
            (morning_time.size, self.N_LOCATIONS))
        # Re-order rows back to hour order 0..23: night hours 0-5, then
        # morning hours 6-18, then night hours 19-23
        return np.row_stack((learned_night[:6, :],
                             learned_morn,
                             learned_night[6:, :]))
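# Hypothetical usage sketch (added): the enclosing class is not shown in this
# excerpt, so the class name and constructor below are assumptions for
# illustration only. create_model() returns one row of learned Dirichlet
# parameters per hour of the day:
#
#     model = LocationModel(observed_locations)   # hypothetical class
#     theta = model.create_model(model_type='2fold')
#     print(theta.shape)                          # (24, N_LOCATIONS)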
import numpy as np
from bayespy import nodes
from bayespy.inference import VB
# Constant and CategoricalMoments come from bayespy's node internals; these
# import paths are assumed for this snippet
from bayespy.nodes import Constant
from bayespy.inference.vmp.nodes.categorical import CategoricalMoments


def model(n_documents, n_topics, n_vocabulary, corpus, word_documents,
          plates_multiplier=1):
    '''
    Construct Latent Dirichlet Allocation model.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus
    '''

    # Topic distributions for each document
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents,),
                              name='p_topic')

    # Word distributions for each topic
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics,),
                             name='p_word')

    # Use a simple wrapper node so that the value of this can be changed if
    # one uses stochastic variational inference
    word_documents = Constant(CategoricalMoments(n_documents), word_documents,
                              name='word_documents')

    # Choose a topic for each word in the corpus
    topics = nodes.Categorical(nodes.Gate(word_documents, p_topic),
                               plates=(len(corpus),),
                               plates_multiplier=(plates_multiplier,),
                               name='topics')

    # Choose each word in the corpus from the vocabulary
    words = nodes.Categorical(nodes.Gate(topics, p_word),
                              name='words')

    # Observe the corpus
    words.observe(corpus)

    # Break symmetry by random initialization
    p_topic.initialize_from_random()
    p_word.initialize_from_random()

    return VB(words, topics, p_word, p_topic, word_documents)
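# End-to-end sketch (added): combines the generate_data() and model()
# functions defined above with batch VB updates. Sizes and the iteration
# count are illustrative values, not from the original.
if __name__ == '__main__':
    corpus, word_documents = generate_data(n_documents=10,
                                           n_topics=5,
                                           n_vocabulary=100,
                                           n_words=1000)
    Q = model(10, 5, 100, corpus, word_documents)
    Q.update(repeat=100)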
        # print()
        # print(subsets[subset])

        # Observe the mini-batch of data
        Q['X'].observe([y[inx] for inx in subset])

        # Learn intermediate variables
        Q.update('Z')

        # Set step length
        step = (iter + delay) ** (-forgetting_rate)

        # Stochastic gradient for the global variables
        Q.gradient_step('p', 'T', 'E', scale=step)
'''

likelihood = Q['E'].random()
qp = p.random()
qT = T.random()
qE = E.random()
#print(qT)
#print(qE)

# Sanity check on a small Dirichlet-categorical pair: the moments of a
# Dirichlet node are expected log-probabilities, so np.exp(f) recovers the
# probabilities approximately
d = bayes.Dirichlet([0.3, 0.7])
n = bayes.Categorical(d)
print(n.parents[0])
print(n.parents[0].get_moments())
f = n.parents[0].get_moments()[0]
print(np.exp(f))
print(n)
print(n.pdf([0, 1]))
print(E)
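# Illustrative sketch (added): the step length in the loop above follows a
# Robbins-Monro schedule, step = (iteration + delay) ** (-forgetting_rate).
# A forgetting rate in (0.5, 1] keeps sum(step) divergent while sum(step**2)
# converges, the standard condition for stochastic approximation to converge.
# The delay and forgetting_rate values below are assumptions for display only.
delay = 1.0
forgetting_rate = 0.7
for iteration in range(5):
    print(iteration, (iteration + delay) ** (-forgetting_rate))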