def generate_data(n_documents, n_topics, n_vocabulary, n_words):
    # Generate random data from the generative model

    # Generate document assignments for the words
    word_documents = nodes.Categorical(np.ones(n_documents) / n_documents,
                                       plates=(n_words,)).random()

    # Topic distribution for each document
    p_topic = nodes.Dirichlet(1e-1 * np.ones(n_topics),
                              plates=(n_documents,)).random()

    # Word distribution for each topic
    p_word = nodes.Dirichlet(1e-1 * np.ones(n_vocabulary),
                             plates=(n_topics,)).random()

    # Topic for each word in each document
    topic = nodes.Categorical(p_topic[word_documents],
                              plates=(n_words,)).random()

    # Each word in each document
    corpus = nodes.Categorical(p_word[topic],
                               plates=(n_words,)).random()

    bpplt.pyplot.figure()
    bpplt.hinton(p_topic)
    bpplt.pyplot.title("True topic distribution for each document")
    bpplt.pyplot.xlabel("Topics")
    bpplt.pyplot.ylabel("Documents")

    bpplt.pyplot.figure()
    bpplt.hinton(p_word)
    bpplt.pyplot.title("True word distributions for each topic")
    bpplt.pyplot.xlabel("Words")
    bpplt.pyplot.ylabel("Topics")

    return (corpus, word_documents)
def run(M=30, D=5):
    # Generate data
    y = np.random.randint(D, size=(M,))

    # Construct model
    p = nodes.Dirichlet(1 * np.ones(D), name='p')
    z = nodes.Categorical(p, plates=(M,), name='z')

    # Observe the data with randomly missing values
    mask = random.mask(M, p=0.5)
    z.observe(y, mask=mask)

    # Run VB-EM
    Q = VB(p, z)
    Q.update()

    # Show results
    z.show()
    p.show()
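# --- Usage sketch (added; not part of the original snippet) ---
# The `run` demo above assumes the imports below; `random.mask` comes from
# bayespy.utils and draws a boolean mask marking which observations are kept.
import numpy as np
from bayespy import nodes
from bayespy.inference import VB
from bayespy.utils import random

run(M=50, D=4)  # fit the Dirichlet-Categorical model with roughly half the values masked out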
print("++++++++++++++++++++++++++") N = 10000 y = np.random.choice(3, size=N, p=[0.3, 0.6, 0.1]) a0 = [0.5, 0.1, 0.1] mu0 = -1 lambda0 = 5 #MU = bayes.Gaussian(mu=mu0, Lambda=0.9) #X = bayes.Gaussian(mu=0.2, Lambda=0.4, plates=(N, )) P = bayes.Dirichlet(a0) X = bayes.Categorical(P, plates=(N, )) #P.initialize_from_random() Q = VB(X, P) X.observe(y) Q.update(repeat=1000) print(X.pdf([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])) print(P.random()) #print(np.sum(y==2))
def create_model(self, model_type=None):
    # Create a location model for each timezone
    location_model = []

    if ('all' == model_type):
        p_conc = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_theta = nodes.Dirichlet(p_conc, plates=(self.N_TIMEZONES,), name='p_theta')

        for time in np.arange(self.N_TIMEZONES):
            model = nodes.Categorical(p_theta[time],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe data
            timezone_observations = self._observed_locations[self._observed_locations['time'] == time]
            if not timezone_observations.empty:
                data = timezone_observations['location'].values.reshape((self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        Q = VB(*location_model, p_theta, p_conc)

    elif ('cross' == model_type):
        raise NotImplementedError("The 'cross' model type is not implemented")

    elif ('2fold' == model_type):
        p_conc_morning = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_night = nodes.DirichletConcentration(self.N_LOCATIONS)
        p_conc_morning.initialize_from_value(np.ones(self.N_LOCATIONS))
        p_conc_night.initialize_from_value(np.ones(self.N_LOCATIONS))

        morning_time = np.arange(6, 19)
        night_time = np.append(np.arange(0, 6), np.arange(19, 24))

        p_theta_morning = nodes.Dirichlet(p_conc_morning, plates=(morning_time.size,), name='p_theta_morning')
        p_theta_night = nodes.Dirichlet(p_conc_night, plates=(night_time.size,), name='p_theta_night')

        # Combining morning-time observations
        for count, time in enumerate(morning_time):
            model = nodes.Categorical(p_theta_morning[count],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe data
            timezone_observations = self._observed_locations[self._observed_locations['time'] == time]
            #print(timezone_observations)
            if not timezone_observations.empty:
                data = timezone_observations['location'].values.reshape((self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        # Combining night-time observations
        for count, time in enumerate(night_time):
            model = nodes.Categorical(p_theta_night[count],
                                      plates=(self.N_OBSERVATIONS[time], 1),
                                      name=str(time))
            # Observe data
            timezone_observations = self._observed_locations[self._observed_locations['time'] == time]
            if not timezone_observations.empty:
                data = timezone_observations['location'].values.reshape((self.N_OBSERVATIONS[time], 1))
                model.observe(data)
            location_model.append(model)

        Q = VB(*location_model, p_theta_morning, p_theta_night, p_conc_morning, p_conc_night)

    else:
        raise ValueError('no model_type selected')

    print("models created")

    ####################################################################################
    # Learn parameters
    Q.update(repeat=1000)
    print('learned params')

    ####################################################################################
    if ('all' == model_type):
        return np.array(p_theta.get_parameters()).reshape((self.N_TIMEZONES, self.N_LOCATIONS))
    elif ('2fold' == model_type):
        learned_night = np.array(p_theta_night.get_parameters()).reshape((night_time.size, self.N_LOCATIONS))
        learned_morn = np.array(p_theta_morning.get_parameters()).reshape((morning_time.size, self.N_LOCATIONS))
        return np.row_stack((learned_night[:6, :], learned_morn, learned_night[6:, :]))
def get_node_distr_over_comm(g, walks, method=None, params={}):

    if method == "HMM_param":

        seqs = []
        lens = []
        for walk in walks:
            s = [[int(w)] for w in walk]
            seqs.extend(s)
            lens.append(len(s))

        model = hmm.MultinomialHMM(n_components=params['number_of_topics'], tol=0.001, n_iter=5000)
        model.fit(seqs, lens)

        #posteriors = model.predict_proba(np.asarray([[i] for i in range(self.g.number_of_nodes())]))
        #comms = np.argmax(posteriors, 1)
        likelihood = model.emissionprob_
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """
        return likelihood  # return the emission probabilities, matching the other branches

    elif method == "Nonparam_HMM":

        seqs = []
        lens = []
        for walk in walks:
            s = [int(w) for w in walk]
            seqs.append(s)
            lens.append(len(s))
        seqs = np.vstack(seqs)

        K = params['number_of_topics']  # the number of hidden states
        O = g.number_of_nodes()  # the size of the observation set
        L = len(seqs[0])  # the length of each sequence
        N = len(seqs)  # the number of sequences

        p0 = params['prior_p0']  # a vector of size K
        t0 = params['prior_t0']  # a vector of size K
        e0 = params['prior_e0']  # a vector of size K

        p = bayes.Dirichlet(p0 * np.ones(K), name='p')
        T = bayes.Dirichlet(t0 * np.ones(K), plates=(K, ), name='T')
        E = bayes.Dirichlet(e0 * np.ones(O), plates=(K, ), name='E')

        Z = bayes.CategoricalMarkovChain(p, T, states=L, name='Z', plates=(N, ))
        # Emission/observation distribution
        X = bayes.Mixture(Z, bayes.Categorical, E, name='X')

        p.initialize_from_random()
        T.initialize_from_random()
        E.initialize_from_random()

        Q = VB(X, Z, p, T, E)
        Q['X'].observe(seqs)
        Q.update(repeat=1000)

        likelihood = Q['E'].random()
        """
        comms = np.argmax(likelihood, 0)

        node2comm = {}
        for id in range(len(comms)):
            node2comm[str(id)] = comms[id]

        return node2comm
        """
        return likelihood

    elif method == "LDA":
        # Run GibbsLDA++
        if not os.path.exists(GIBBSLDA_PATH):
            raise ValueError("Invalid path of GibbsLDA++!")

        temp_lda_folder = os.path.join(TEMP_FOLDER, "lda_temp")
        if not os.path.exists(temp_lda_folder):
            os.makedirs(temp_lda_folder)

        temp_dfile_path = os.path.join(temp_lda_folder, "gibblda_temp.dfile")

        # Save the walks into the dfile
        n = len(walks)
        with open(temp_dfile_path, 'w') as f:
            f.write("{}\n".format(n))
            for walk in walks:
                f.write("{}\n".format(" ".join(str(w) for w in walk)))

        initial_time = time.time()

        cmd = "{} -est ".format(GIBBSLDA_PATH)
        cmd += "-alpha {} ".format(params['lda_alpha'])
        cmd += "-beta {} ".format(params['lda_beta'])
        cmd += "-ntopics {} ".format(params['number_of_topics'])
        cmd += "-niters {} ".format(params['lda_number_of_iters'])
        cmd += "-savestep {} ".format(params['lda_number_of_iters'] + 1)
        cmd += "-dfile {} ".format(temp_dfile_path)
        os.system(cmd)
        print("-> The LDA algorithm ran in {:.2f} secs".format(time.time() - initial_time))

        # Read the wordmap file
        id2node = {}
        temp_wordmap_path = os.path.join(temp_lda_folder, "wordmap.txt")
        with open(temp_wordmap_path, 'r') as f:
            f.readline()  # skip the first line
            for line in f.readlines():
                tokens = line.strip().split()
                id2node[int(tokens[1])] = tokens[0]

        # Read the phi file
        num_of_nodes = len(id2node)
        phi = np.zeros(shape=(params['number_of_topics'], num_of_nodes), dtype=float)
        temp_phi_path = os.path.join(temp_lda_folder, "model-final.phi")
        with open(temp_phi_path, 'r') as f:
            for comm, line in enumerate(f.readlines()):
                for id, value in enumerate(line.strip().split()):
                    phi[comm, int(id2node[id])] = value

        # Read the tassign file, generate the topic corpus
        temp_tassing_path = os.path.join(temp_lda_folder, "model-final.tassign")
        comm_corpus = []
        with smart_open(temp_tassing_path, 'r') as f:
            for line in f:
                tokens = line.strip().split()
                comm_corpus.append([token.split(':')[1] for token in tokens])
        """
        max_topics = np.argmax(phi, axis=0)

        node2comm = {}
        for nodeId in id2node:
            node2comm[id2node[nodeId]] = max_topics[int(nodeId)]

        return node2comm
        """
        return phi, comm_corpus

    else:
        raise ValueError("Wrong parameter name!")
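# --- Example call (added; illustrative values, not from the original code) ---
# The `params` dictionary must provide the keys read inside each branch above.
# `g` is a graph object (only g.number_of_nodes() is used) and `walks` is a list
# of node-id sequences, both assumed to exist already.
params = {
    'number_of_topics': 10,       # used by all three methods
    'prior_p0': 0.1,              # Nonparam_HMM: initial-state Dirichlet prior
    'prior_t0': 0.1,              # Nonparam_HMM: transition Dirichlet prior
    'prior_e0': 0.1,              # Nonparam_HMM: emission Dirichlet prior
    'lda_alpha': 0.5,             # LDA: document-topic prior
    'lda_beta': 0.1,              # LDA: topic-word prior
    'lda_number_of_iters': 1000,  # LDA: Gibbs sampling iterations
}
likelihood = get_node_distr_over_comm(g, walks, method="Nonparam_HMM", params=params)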
def model(n_documents, n_topics, n_vocabulary, corpus, word_documents, plates_multiplier=1):
    '''
    Construct Latent Dirichlet Allocation model.

    Parameters
    ----------

    n_documents : int
        The number of documents

    n_topics : int
        The number of topics

    n_vocabulary : int
        The number of words in the vocabulary

    corpus : integer array
        The vocabulary index of each word in the corpus

    word_documents : integer array
        The document index of each word in the corpus
    '''

    # Topic distributions for each document
    p_topic = nodes.Dirichlet(np.ones(n_topics),
                              plates=(n_documents,),
                              name='p_topic')

    # Word distributions for each topic
    p_word = nodes.Dirichlet(np.ones(n_vocabulary),
                             plates=(n_topics,),
                             name='p_word')

    # Use a simple wrapper node so that the value of this can be changed if one
    # uses stochastic variational inference
    word_documents = Constant(CategoricalMoments(n_documents), word_documents,
                              name='word_documents')

    # Choose a topic for each word in the corpus
    topics = nodes.Categorical(nodes.Gate(word_documents, p_topic),
                               plates=(len(corpus),),
                               plates_multiplier=(plates_multiplier,),
                               name='topics')

    # Choose each word in the corpus from the vocabulary
    words = nodes.Categorical(nodes.Gate(topics, p_word),
                              name='words')

    # Observe the corpus
    words.observe(corpus)

    # Break symmetry by random initialization
    p_topic.initialize_from_random()
    p_word.initialize_from_random()

    return VB(words, topics, p_word, p_topic, word_documents)
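# --- End-to-end sketch (added; illustrative sizes, not part of the original demo) ---
# Ties together generate_data() and model() defined above: draw a toy corpus from
# the generative model, build the LDA network, and run the VB updates.
corpus, word_documents = generate_data(n_documents=10, n_topics=5,
                                       n_vocabulary=30, n_words=1000)
Q = model(n_documents=10, n_topics=5, n_vocabulary=30,
          corpus=corpus, word_documents=word_documents)
Q.update(repeat=1000)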
for n in range(N):
    s = np.random.choice(a=K, size=1)[0]
    for l in range(L):
        o = np.random.choice(a=E, size=1, p=O[s, :])[0]
        s = np.random.choice(a=K, size=1, p=A[s, :])[0]
        y[n].append(o)

#L = len(y)
p0 = 0.3  # prior concentration for the initial-state Dirichlet (size K)
t0 = 0.2  # prior concentration for the transition Dirichlets (size K)
e0 = 0.1  # prior concentration for the emission Dirichlets

p_param = p0 * np.ones(K, dtype=float)
p = bayes.Dirichlet(p_param, name='p')

t_param = t0 * np.ones(K, dtype=float)
T = bayes.Dirichlet(t_param, plates=(K, ), name='T')

e_param = e0 * np.ones(E, dtype=float)
E = bayes.Dirichlet(e_param, plates=(K, ), name='E')  # note: rebinds E from the observation-set size to the emission node

z = bayes.CategoricalMarkovChain(p, T, states=L, plates=(N, ), name='Z')
x = bayes.Mixture(z, bayes.Categorical, E, plates=(N, L), name='X')

p.initialize_from_random()
T.initialize_from_random()
E.initialize_from_random()

Q = VB(x, z, E, T, p)
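# --- Typical next step (added; mirrors the Nonparam_HMM branch above) ---
# The snippet above only builds the network; to fit it, observe the generated
# sequences and run the VB iterations, e.g.:
x.observe(np.vstack(y))   # y is the list of generated observation sequences, shape (N, L)
Q.update(repeat=1000)
print(E.random())         # posterior draw of the emission distributions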