import os
import logging
import cPickle

import numpy as np

import preprocess   # project-local preprocessing module
import math_utli    # project-local math utilities

logger = logging.getLogger(__name__)


def get_perplexity(self):
    '''
    Calculate the perplexity of held-out documents.

    Reads in the model parameters produced by the procedures above.
    For now, held-out documents have to be placed manually in a
    separate folder.

    Returns:
        perplexity : currently the only evaluation metric; others may
            be added later.
    '''
    log_lhood = np.zeros(len(self.corpus))
    e_theta = np.zeros((len(self.corpus), self._K))

    # build the held-out corpus from the held-out folder
    path = 'd:\\mycode\\ctm_python\\state_union\\heldout\\'
    heldout_filenames = os.listdir(path)
    heldout_corpus = []
    for thefile in heldout_filenames:
        with open(path + thefile, "rb") as f:
            heldout_corpus.append(f.read())
    (held_dictionary, held_corpus) = preprocess.get_dict_and_corp(heldout_corpus)
    # NOTE: this counts the distinct terms in the held-out dictionary,
    # not the total number of tokens
    total_words = len(held_dictionary)

    # load the variational parameters needed to compute e_theta
    with open('corpus_lambda_dump', 'rb') as ctm_lambda_dump:
        lambda_v_c = cPickle.load(ctm_lambda_dump)
    with open('corpus_nu_dump', 'rb') as ctm_nu_dump:
        nu_v_c = cPickle.load(ctm_nu_dump)

    # calculate e_theta from the observed data, one document at a time
    for d, doc in enumerate(self.corpus):
        obs_wordidsd = [id for id, _ in doc]
        obs_wordctsd = np.array([cnt for _, cnt in doc])
        lambda_v = lambda_v_c[d]
        nu_v = nu_v_c[d]
        # the number of observed documents is never smaller than the number
        # of held-out documents, so e_theta will not be indexed out of
        # range during held-out inference
        e_theta[d] = self.expected_theta(obs_wordidsd, obs_wordctsd,
                                         lambda_v, nu_v)

    for d, doc in enumerate(held_corpus):
        held_wordctsd = np.array([cnt for _, cnt in doc])
        # approximate inference for the held-out data: randomly pick a
        # row of e_theta
        # MEMO: this is a dirty workaround; a better way would be to infer
        # e_theta for each held-out document directly
        rand_etheta_index = np.random.randint(len(held_corpus))
        etheta = e_theta[rand_etheta_index]
        log_lhood[d] = self.log_mult_prob(held_wordctsd, etheta)

    total_lhood = np.sum(log_lhood)
    perplexity = np.exp(-total_lhood / total_words)
    print 'the perplexity is:', perplexity
    return perplexity
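
# A minimal sketch of the perplexity formula applied above, assuming only
# numpy: perplexity = exp(- total log likelihood / number of words).
# `doc_log_lhoods` and `n_words` are hypothetical stand-ins for the
# per-document log likelihoods and held-out word count computed inside
# get_perplexity; this is an illustration, not part of the model code.
def _perplexity_sketch(doc_log_lhoods, n_words):
    '''Return exp(-sum(doc_log_lhoods) / n_words); lower is better.'''
    return np.exp(-np.sum(doc_log_lhoods) / float(n_words))
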
def __init__(self, K, mu=None, cov=None):
    '''
    Arguments:
        K: number of topics.
        mu, cov: hyperparameters of the logistic normal prior on the
            topic-weight vectors theta.

    The total number of documents D is taken from the corpus read in
    here; for a fixed corpus it is the size of that corpus.
    '''
    logger.info("CTM commence.")
    logger.info("Initializing...")
    if K is None:
        raise ValueError('number of topics has to be specified.')

    # get the folder containing all the training files; the observed and
    # held-out folders have to be specified manually
    path = 'd:\\mycode\\ctm_python\\state_union\\observed\\'
    obs_filenames = os.listdir(path)
    logger.info("initializing id mapping from corpus, assuming identity")

    # read every file's contents into one list of strings
    txt_corpus = []
    for thefile in obs_filenames:
        with open(path + thefile, "rb") as f:
            txt_corpus.append(f.read())
    (dictionary, corpus) = preprocess.get_dict_and_corp(txt_corpus)
    logger.info("dictionary and corpus are generated")

    self.dictionary = dictionary
    self.corpus = corpus
    self._K = K                  # number of topics
    logger.info("There are %i topics.", self._K)
    self._W = len(dictionary)    # number of distinct terms
    self._D = len(corpus)        # number of documents

    # initialize word-id and word-count lists for the whole corpus
    self.wordids = list()
    self.wordcts = list()
    for d, doc in enumerate(self.corpus):
        self.wordids.append([id for id, _ in doc])
        self.wordcts.append(np.array([cnt for _, cnt in doc]))

    # mu: K-vector initialized to 0; cov: K*K matrix initialized to 1.
    # Together they parameterize the Gaussian of the logistic normal.
    if mu is None:
        self.mu = np.zeros(self._K)
    else:
        self.mu = mu
    if cov is None:
        self.cov = np.ones((self._K, self._K))
    else:
        self.cov = cov

    # if cov was initialized by this process it is a singular matrix,
    # so its inverse cannot be computed; fall back to cov itself
    try:
        self.inv_cov = np.linalg.inv(self.cov)
    except np.linalg.linalg.LinAlgError as err:
        if 'Singular matrix' in err.message:
            self.inv_cov = self.cov
        else:
            raise
    self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

    self.ndata = 0  # cumulative count of documents processed

    # initialize the topic distributions, i.e. self.log_beta
    self.beta = np.zeros([self._K, self._W])
    self.log_beta = np.zeros([self._K, self._W])
    for i in range(self._K):
        # seed each of the K topics with the counts of a randomly chosen doc
        doc_no = np.random.randint(self._D)
        nterms = len(self.wordcts[doc_no])
        for j in range(nterms):
            word_index = self.wordids[doc_no][j]
            self.log_beta[i][word_index] = self.wordcts[doc_no][j]

    # smooth: add 1 plus uniform noise to every entry
    for m in range(self._K):
        for n in range(self._W):
            self.log_beta[m][n] = self.log_beta[m][n] + 1.0 + np.random.ranf()

    # normalize and take logs, so each entry becomes a log probability
    log_sum = math_utli.safe_log(np.sum(self.log_beta))
    logger.info("log_beta_sum : %f", log_sum)
    for m in range(self._K):
        for n in range(self._W):
            self.log_beta[m][n] = math_utli.safe_log(self.log_beta[m][n]) - log_sum
            logger.info("log_beta %i %i - %f", m, n, self.log_beta[m][n])

    logger.info("initialization finished.")
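
# A minimal usage sketch, assuming the two methods above belong to a class
# named CTM (the class name is an assumption, not confirmed by the source)
# and that training has already produced the 'corpus_lambda_dump' and
# 'corpus_nu_dump' files that get_perplexity loads.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    model = CTM(K=20)        # K is the only required argument
    model.get_perplexity()   # scores the manually prepared held-out folder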