# Imports assumed by the methods in this excerpt (not shown in the original):
import sys
from collections import Counter

import numpy as np
from numpy import int32
from numpy.random import RandomState

import utils
# NOTE: ldae_is_variants is a project-level helper; its import is not shown
# in this excerpt.


def _test_lda_importance_sampling(self, testing_df, fold_idx, is_num_samples,
                                  is_iters, training_gibbs,
                                  use_posterior_alpha=True):
    print "Run testing importance sampling " + str(testing_df.shape)
    topics = training_gibbs.topic_word_
    if use_posterior_alpha:
        # use posterior alpha as the topic prior during importance sampling
        topic_prior = training_gibbs.posterior_alpha[:, None]
    else:
        # use prior alpha as the topic prior during importance sampling
        topic_prior = np.ones((self.K, 1))
        topic_prior = topic_prior / np.sum(topic_prior)
        topic_prior = topic_prior * self.K * self.alpha
    print 'topic_prior = ' + str(topic_prior)

    marg = 0
    n_words = 0
    for d in range(testing_df.shape[0]):
        # index into the held-out testing set; the original indexed self.df
        # here, which looks like a bug since the loop runs over testing_df
        document = testing_df.iloc[[d]]
        words = utils.word_indices(document)
        doc_marg = ldae_is_variants(words, topics, topic_prior,
                                    num_samples=is_num_samples, variant=3,
                                    variant_iters=is_iters)
        print "\td = " + str(d) + " doc_marg=" + str(doc_marg)
        sys.stdout.flush()
        marg += doc_marg
        n_words += len(words)

    perp = np.exp(-(marg / n_words))
    print "lda is testing log evidence fold " + str(fold_idx) + " = " + str(marg)
    print "lda is testing perplexity fold " + str(fold_idx) + " = " + str(perp)
    print
    return marg, perp
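# Illustrative sketch (not part of the original module): how the per-word
# perplexity above follows from the summed per-document log evidences,
# i.e. perp = exp(-(total log evidence / total token count)). All numbers
# below are made up purely for demonstration.
def _example_perplexity():
    import numpy as np
    doc_margs = [-120.5, -98.2, -143.7]  # hypothetical per-document log evidences
    doc_lengths = [40, 35, 50]           # hypothetical token counts per document
    marg = float(sum(doc_margs))         # total log evidence over the test fold
    n_words = sum(doc_lengths)           # total number of word tokens
    perp = np.exp(-(marg / n_words))     # per-word perplexity; lower is better
    return marg, perp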
def get_motif_contributions(self, parent_peak_id):

    # work out the contributions of different M2Ms
    row_idx = self.ms1["peakID"] == parent_peak_id
    pos = np.nonzero(row_idx.values)[0]
    d = np.asscalar(pos)
    motifs_of_interest = np.nonzero(self.doc_topic[d])[0].tolist()

    # collect, for each word in the document, the M2Ms that generated it
    document = self.df.iloc[[d]]
    word_idx = utils.word_indices(document)
    results = {}
    for pos in range(len(word_idx)):
        n = word_idx[pos]
        k = self.model.Z[(d, pos)]
        # IMPORTANT: consider only the validated M2Ms, but a word might be
        # generated by other M2Ms not in our list!!
        if k in motifs_of_interest:
            word = self.vocab[n]
            if word in results:
                results[word].append(k)
            else:
                results[word] = [k]

    # count how often each M2M explains each word and normalise to ratios
    contributions = {}
    for word in results:
        topics = Counter(results[word])
        total = float(np.sum(topics.values()))
        ratio = {key: (topics[key] / total) for key in topics}
        contributions[word] = ratio
    return contributions
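# Illustrative sketch (not part of the original module): the shape of the
# 'contributions' dictionary returned above. 'results' maps each word to the
# list of M2M (topic) assignments observed across its occurrences; the ratio
# is the fraction of occurrences explained by each M2M. The word names and
# topic indices below are made up.
def _example_contributions():
    from collections import Counter
    results = {'fragment_91.0542': [3, 3, 7], 'loss_18.0106': [7]}
    contributions = {}
    for word in results:
        topics = Counter(results[word])
        total = float(sum(topics.values()))
        contributions[word] = {k: topics[k] / total for k in topics}
    # e.g. {'fragment_91.0542': {3: 0.667, 7: 0.333}, 'loss_18.0106': {7: 1.0}}
    return contributions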
def __init__(self, df, vocab, K, alpha, beta, random_state=None,
             previous_model=None, sparse=False):
    """
    Initialises the collapsed Gibbs sampling for LDA

    Arguments:
    - df: the dataframe of counts of vocabularies x documents
    - vocab: the vocabulary of words
    - K: no. of topics
    - alpha: symmetric prior on document-topic assignment
    - beta: symmetric prior on word-topic assignment
    - random_state: optional RandomState for reproducible sampling
    - previous_model: previous LDA run, if any
    - sparse: whether df is a sparse matrix rather than a dataframe
    """
    print "CGS LDA initialising"
    self.sparse = sparse
    if not self.sparse:
        self.df = df.replace(np.nan, 0)
    else:
        self.df = df

    self.alpha = alpha
    self.beta = beta

    self.D = df.shape[0]  # total no of docs
    self.N = df.shape[1]  # total no of words
    self.vocab = vocab
    assert(len(self.vocab) == self.N)

    # set total no of topics
    self.cv = False
    self.previous_model = previous_model
    if self.previous_model is not None:

        # if some old topics were fixed
        if hasattr(self.previous_model, 'selected_topics'):

            # no. of new topics
            self.K = K

            # no. of previously selected topics
            self.previous_K = len(self.previous_model.selected_topics)

            # Get the previous ckn and ck values from the training stage.
            # During gibbs update in this testing stage, assignment of word
            # to the first previous_K topics will use the previous fixed
            # topic-word distributions -- as specified by previous_ckn and previous_ck
            self.previous_ckn = self.previous_model.selected_ckn
            self.previous_ck = self.previous_model.selected_ck
            self.previous_vocab = self.previous_model.selected_vocab
            assert(len(self.previous_ck) == self.previous_K)
            assert(self.previous_ckn.shape[0] == len(self.previous_ck))
            assert(self.previous_ckn.shape[1] == len(self.previous_vocab))

            # make previous_ckn have the right number of columns
            N_prev_words = len(self.previous_vocab)
            N_diff = self.N - N_prev_words
            temp = np.zeros((self.previous_K, N_diff), int32)
            self.previous_ckn = np.hstack((self.previous_ckn, temp))  # size is previous_K x N

            # make previous_ckn have the right number of rows
            temp = np.zeros((self.K, self.N), int32)
            self.previous_ckn = np.vstack((self.previous_ckn, temp))  # size is (previous_K+K) x N

            # make previous_ck have the right length
            temp = np.zeros(self.K, int32)
            self.previous_ck = np.hstack((self.previous_ck, temp))  # length is (previous_K+K)

            # total no. of topics = old + new topics
            self.K = self.K + self.previous_K
            print "Total no. of topics = " + str(self.K)

            # set the first previous_K elements in alpha to the previous values
            self.alpha = np.ones(self.K) * alpha
            for k in range(self.previous_K):
                self.alpha[k] = self.previous_model.selected_alpha[k]

            # set previous beta for the words that have been fixed from before
            self.beta = np.ones(self.N) * beta
            for n in range(N_prev_words):
                self.beta[n] = self.previous_model.selected_beta[n]

        else:

            # otherwise all previous topics were fixed, for cross-validation
            self.cv = True
            self.K = K
            self.previous_ckn = self.previous_model.ckn
            self.previous_ck = self.previous_model.ck
            self.previous_K = K
            self.alpha = np.ones(self.K) * alpha
            self.beta = np.ones(self.N) * beta

    else:

        # for training stage
        self.K = K
        self.previous_ckn = np.zeros((self.K, self.N), int32)
        self.previous_ck = np.zeros(self.K, int32)
        self.previous_K = 0  # no old topics
        self.alpha = np.ones(self.K) * alpha
        self.beta = np.ones(self.N) * beta

    # make the current count arrays too
    self.ckn = np.zeros((self.K, self.N), int32)
    self.ck = np.zeros(self.K, int32)
    self.cdk = np.zeros((self.D, self.K), int32)
    self.cd = np.zeros(self.D, int32)

    # make sure to get the same results from running gibbs each time
    if random_state is None:
        self.random_state = RandomState(1234567890)
    else:
        self.random_state = random_state

    # randomly assign words to topics
    self.Z = {}
    for d in range(self.D):
        if d % 10 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        document = self.df[d, :] if self.sparse else self.df.iloc[[d]]
        word_idx = utils.word_indices(document, sparse=self.sparse)
        for pos, n in enumerate(word_idx):
            k = self.random_state.randint(self.K)
            self.cdk[d, k] += 1
            self.cd[d] += 1
            self.ckn[k, n] += 1
            self.ck[k] += 1
            self.Z[(d, pos)] = k
    print

    # turn word counts in the document into a vector of word occurrences
    self.document_indices = {}
    for d in range(self.D):
        document = self.df[d, :] if self.sparse else self.df.iloc[[d]]
        word_idx = utils.word_indices(document, sparse=self.sparse)
        word_locs = []
        for pos, n in enumerate(word_idx):
            word_locs.append((pos, n))
        self.document_indices[d] = word_locs

    self.samples = []  # store the samples
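# Illustrative sketch (not part of the original module): constructing the
# sampler on a tiny dense documents x words count matrix. The enclosing class
# name is not shown in this excerpt, so 'CollapsedGibbsLda' is a placeholder;
# utils.word_indices is assumed to expand a count row into one index per token
# occurrence (e.g. counts [2, 0, 1] -> [0, 0, 2]).
def _example_init():
    import pandas as pd
    counts = pd.DataFrame([[2, 0, 1],
                           [0, 3, 1]],
                          columns=['w0', 'w1', 'w2'])  # 2 docs x 3 words
    vocab = list(counts.columns)
    sampler = CollapsedGibbsLda(counts, vocab, K=4, alpha=0.1, beta=0.01)
    return sampler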