Example #1
	def maximization(self):
		'''
		M-step of the EM algorithm; uses scikit-learn's LedoitWolf estimator to
		perform covariance matrix shrinkage.

		Arguments:
			the sufficient statistics (model parameters) accumulated on self during the E-step
		Returns:
			None; the updated sufficient statistics are stored back on self
		'''
		logger.info("running maximization function")
		logger.info("mean maximization")
		mu = np.divide(self.mu, self.ndata)
		logger.info("covariance maximization")
		for i in range(self._K):
			for j in range(self._K):
				# normalize the accumulated second-moment statistics into the covariance estimate
				self.cov[i, j] = (1.0 / self.ndata) * (self.cov[i, j] + self.ndata * mu[i] * mu[j] - self.mu[i] * mu[j] - self.mu[j] * mu[i])
		logger.info(" performing covariance shrinkage using sklearn module")
		lw = LedoitWolf()
		cov_result = lw.fit(self.cov, assume_centered=True).covariance_
		self.inv_cov = np.linalg.inv(cov_result)
		self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

		logger.info("topic maximization")
		for i in range(self._K):
			# total count mass assigned to topic i across the vocabulary
			sum_m = np.sum(self.beta[i, :])

			if sum_m == 0:
				sum_m = -1000 * self._W
			else:
				sum_m = np.log(sum_m)

			for j in range(self._W):
				# normalize in log space: log(beta[i, j]) - log(sum_j beta[i, j])
				self.log_beta[i, j] = math_utli.safe_log(self.beta[i, j]) - sum_m

		logger.info("write model parameters to file")
		logger.info("write gaussian")
		with open('ctm_nu', 'wb') as ctm_nu_dump:
			cPickle.dump(self.nu, ctm_nu_dump)
		with open('ctm_cov', 'wb') as ctm_cov_dump:
			cPickle.dump(self.cov, ctm_cov_dump)
		with open('ctm_inv_cov', 'wb') as ctm_inv_cov_dump:
			cPickle.dump(self.inv_cov, ctm_inv_cov_dump)
		with open('ctm_log_det_inv_cov', 'wb') as ctm_log_det_inv_cov_dump:
			cPickle.dump(self.log_det_inv_cov, ctm_log_det_inv_cov_dump)
		logger.info("write topic matrix")
		with open('ctm_log_beta', 'wb') as ctm_log_beta_dump:
			cPickle.dump(self.log_beta, ctm_log_beta_dump)
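
The shrinkage step above uses scikit-learn's LedoitWolf estimator. A minimal standalone sketch of the same call pattern follows; the toy data and dimensions are made up for illustration, and note that LedoitWolf.fit normally expects a samples-by-features data matrix:

import numpy as np
from sklearn.covariance import LedoitWolf

# toy data: 50 samples from a 4-dimensional Gaussian
rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=np.zeros(4), cov=np.eye(4), size=50)

# Ledoit-Wolf shrinks the empirical covariance toward a scaled identity,
# keeping the estimate well conditioned and therefore safely invertible
lw = LedoitWolf(assume_centered=False)
shrunk_cov = lw.fit(X).covariance_
inv_cov = np.linalg.inv(shrunk_cov)
log_det_inv_cov = np.log(np.linalg.det(inv_cov))
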
Example #2
	def sample_term(self, eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd):
		'''
		Importance sampling of the likelihood based on the variational posterior.

		Arguments:
			eta : natural parameters of the logistic normal distribution
			lambda_v, nu_v : variational mean and variance of q(eta)
			obs_wordidsd, obs_wordctsd : term ids and counts of the observed document
		The mapping between eta and the mean parameter theta is Equation 3 in the paper:
				eta[i] = log(theta[i] / theta[K])
		Returns:
			the value of log p(w, eta) - log q(eta) for the sampled eta
		'''
		t1 = 0.5 * self.log_det_inv_cov
		t1 += -(0.5) * self._K * 1.837877  # 1.837877 is the natural logarithm of 2*pi
		for i in range(self._K):
			for j in range(self._K):
				t1 -= (0.5) * (eta[i] - self.mu[i]) * self.inv_cov[i, j] * (eta[j] - self.mu[j])
		# compute theta: the mean parameterization, theta[i] = exp(eta[i]) / sum_j exp(eta[j])
		sum_t = np.sum(np.exp(eta))
		theta = np.divide(np.exp(eta), sum_t)

		# compute word probabilities
		nterms = len(obs_wordidsd)
		for n in range(nterms):
			word_term = 0
			for i in range(self._K):
				# mixture probability of the n-th distinct term under the current topics
				word_term += theta[i] * np.exp(self.log_beta[i, obs_wordidsd[n]])
			t1 += obs_wordctsd[n] * math_utli.safe_log(word_term)
		# log(q(\eta | lambda, nu)): log density of the sample under the variational Gaussian
		t2 = 0
		for i in range(self._K):
			t2 += stats.norm.logpdf(eta[i], loc=lambda_v[i], scale=np.sqrt(nu_v[i]))
		return(t1 - t2)
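
The eta-to-theta mapping used by sample_term (Equation 3) is a softmax over the natural parameters. A minimal standalone sketch with an arbitrary 3-topic eta:

import numpy as np

eta = np.array([0.2, -1.0, 0.5])            # natural parameters of the logistic normal
theta = np.exp(eta) / np.sum(np.exp(eta))   # mean parameters: theta[i] = exp(eta[i]) / sum_j exp(eta[j])
assert abs(theta.sum() - 1.0) < 1e-12       # theta is a valid topic-proportion vector
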
Example #3
	def lhood_bnd(self, d, phi_v, log_phi_v, lambda_v, nu_v, zeta_v):
		'''
		Compute the likelihood bound given the variational parameters.

		Arguments:
			d : index of the current working document
			phi_v, log_phi_v, lambda_v, nu_v, zeta_v : variational parameters

		Returns:
			the likelihood bound
		'''
		logger.info("calculating likelihood bound")
		# E[log p(\eta | \mu, \Sigma)] + H(q(\eta | \lambda, \nu)
		lhood = (0.5) * self.log_det_inv_cov + 0.5 * self._K
		for i in range(self._K):
			v = - (0.5) * nu_v[i] * self.inv_cov[i, i]
			for j in range(self._K):
				v -= (0.5) * (lambda_v[i] - self.mu[i]) * self.inv_cov[i, j] * (lambda_v[j] - self.mu[j])
			v += (0.5) * math_utli.safe_log(nu_v[i])
			lhood += v

		# E[log p(z_n | \eta)] + E[log p(w_n | \beta)] + H(q(z_n | \phi_n))
		# Equation 7 in paper, calculate the upper bound
		sum_exp = np.sum(np.exp(lambda_v + 0.5 * nu_v))
		bound = (1.0 / zeta_v) * sum_exp - 1.0 + math_utli.safe_log(zeta_v)
		# the bound applies once per word token in document d
		lhood -= bound * np.sum(self.wordcts[d])

		ntermd = len(self.wordcts[d])
		for i in range(self._K):
			for j in range(ntermd):
				if phi_v[i, j] > 0:
					# j indexes the j-th distinct term of document d; map it to its vocabulary id
					word_id = self.wordids[d][j]
					lhood += self.wordcts[d][j] * phi_v[i, j] * (lambda_v[i] + self.log_beta[i][word_id] - log_phi_v[i][j])
		return lhood
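
The term subtracted from lhood above is the first-order upper bound of Equation 7 on E[log sum_i exp(eta_i)], controlled by the auxiliary parameter zeta. A small numpy sketch of just that term, with made-up lambda, nu and zeta values:

import numpy as np

lambda_v = np.array([0.1, -0.3, 0.4])   # variational means of eta
nu_v = np.array([0.5, 0.2, 0.1])        # variational variances of eta
zeta_v = 2.0                            # auxiliary variational parameter

# For a Gaussian, E[exp(eta_i)] = exp(lambda_i + nu_i / 2), so the bound is
# (1 / zeta) * sum_i exp(lambda_i + nu_i / 2) - 1 + log(zeta)
bound = (1.0 / zeta_v) * np.sum(np.exp(lambda_v + 0.5 * nu_v)) - 1.0 + np.log(zeta_v)

As a function of zeta, this bound is tightest at zeta = sum_i exp(lambda_i + nu_i / 2).
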
Example #4
	def log_mult_prob(self, held_wordctsd, e_theta):
		'''
		Log probability of a document under proportions theta and topics beta;
		used to calculate the probability of held-out data.
		'''
		val = 0
		nterms = len(held_wordctsd)
		for i in range(nterms):
			# the vocabulary here should be that of the held-out data, and log_beta
			# should be initialized for it rather than reusing the old self.log_beta
			term_prob = 0
			for k in range(self._K):
				term_prob += e_theta[k] * np.exp(self.log_beta[k, i])
			val += math_utli.safe_log(term_prob) * held_wordctsd[i]
		return val
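
For intuition, the same held-out log probability can be written in vectorized form. The arrays below are purely illustrative; beta is a (K, V) matrix of topic-word probabilities:

import numpy as np

e_theta = np.array([0.7, 0.2, 0.1])              # expected topic proportions, shape (K,)
beta = np.array([[0.5, 0.3, 0.2],
                 [0.1, 0.6, 0.3],
                 [0.3, 0.3, 0.4]])               # topic-word probabilities, shape (K, V)
held_wordids = [0, 2]                            # vocabulary ids occurring in the held-out document
held_wordcts = np.array([3, 1])                  # their counts

term_prob = np.dot(e_theta, beta[:, held_wordids])   # p(w | theta, beta) for each held-out term
val = np.sum(held_wordcts * np.log(term_prob))       # held-out document log probability
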
Example #5
		def f_lambda(self, sum_phi, phi_v, lambda_v, nu_v, zeta_v):
			term1 = term2 = term3 = 0
			# compute lambda^T * \sum_n phi_n
			term1 = np.dot(lambda_v, sum_phi)
			# compute lambda - mu
			temp1 = np.subtract(lambda_v, self.mu)
			# compute -(1/2) * (lambda - mu)^T Sigma^-1 (lambda - mu) as a quadratic form
			term2 = (-0.5) * np.dot(temp1, np.dot(self.inv_cov, temp1))
			# last term
			for i in range(self._K):
				term3 += np.exp(lambda_v[i] + 0.5 * nu_v[i])
			# need to figure out how term3 is calculated
			term3 = - ((1.0 / zeta_v) * term3 - 1.0 + math_utli.safe_log(zeta_v)) * self._K
			return (-(term1 + term2 + term3))
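
The middle term of f_lambda is the Gaussian quadratic form -(1/2)(lambda - mu)^T Sigma^-1 (lambda - mu), which has to be computed as a vector-matrix-vector product rather than an elementwise product. A standalone sketch with small illustrative values:

import numpy as np

lambda_v = np.array([0.5, -0.2])
mu = np.array([0.0, 0.1])
inv_cov = np.array([[2.0, 0.3],
                    [0.3, 1.0]])                     # Sigma^-1, symmetric

diff = lambda_v - mu
term2 = -0.5 * np.dot(diff, np.dot(inv_cov, diff))   # a scalar
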
Example #6
	def opt_nu(self, lambda_v, zeta_v):
		logger.info("calculating variational parameter NU")
		# optimize nu by Newton's method on log(nu), one coordinate at a time
		nu_v = 10 * np.ones(self._K)
		log_nu_v = np.log(nu_v)

		for i in range(self._K):
			df = 1.0  # force at least one Newton step (do-while style)
			while np.fabs(df) > 1e-10:
				nu_v[i] = np.exp(log_nu_v[i])
				if math.isnan(nu_v[i]):
					nu_v[i] = 20
					log_nu_v[i] = math_utli.safe_log(nu_v[i])
				df = -0.5 * self.inv_cov[i, i] - (0.5 * self._W / zeta_v) * np.exp(lambda_v[i] + nu_v[i] / 2) + 0.5 * (1.0 / nu_v[i])
				d2f = -0.25 * (self._W / zeta_v) * np.exp(lambda_v[i] + nu_v[i] / 2) - 0.5 * (1.0 / (nu_v[i] * nu_v[i]))
				# Newton step on log(nu): by the chain rule, g' = df * nu and g'' = d2f * nu^2 + df * nu
				log_nu_v[i] = log_nu_v[i] - (df * nu_v[i]) / (d2f * nu_v[i] * nu_v[i] + df * nu_v[i])
		nu_v = np.exp(log_nu_v)

		return nu_v
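
opt_nu runs Newton's method on log(nu) so the variance stays positive: by the chain rule, if g(log nu) = f(nu) then g' = f'(nu) * nu and g'' = f''(nu) * nu^2 + f'(nu) * nu, which is the update used above. A generic, self-contained sketch of that pattern on a made-up concave objective (f(x) = log(x) - x, maximized at x = 1):

import numpy as np

def newton_in_log_space(f_prime, f_double_prime, x0=10.0, tol=1e-10, max_iter=100):
    # maximize f over x > 0 by Newton steps on log(x)
    log_x = np.log(x0)
    for _ in range(max_iter):
        x = np.exp(log_x)
        df = f_prime(x)
        d2f = f_double_prime(x)
        # derivative of f(exp(log_x)) w.r.t. log_x is df*x; second derivative is d2f*x**2 + df*x
        log_x -= (df * x) / (d2f * x * x + df * x)
        if abs(df) < tol:
            break
    return np.exp(log_x)

nu = newton_in_log_space(lambda x: 1.0 / x - 1.0,      # f'(x)  for f(x) = log(x) - x
                         lambda x: -1.0 / (x * x))     # f''(x)
# nu converges to 1.0
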
Example #7
	def expected_theta(self, obs_wordidsd, obs_wordctsd, lambda_v, nu_v):
		''' Return the expected theta under the variational distribution.

		Args:
			self : uses the model parameters initialized before
			obs_wordidsd, obs_wordctsd : term ids and counts of the observed document
			lambda_v : variational parameter lambda
			nu_v : variational parameter nu

		Returns:
			e_theta : the expected topic proportions
		'''
		nsamples = 100
		eta = np.zeros(self._K)
		# initialize e_theta (a log-space accumulator)
		e_theta = -1.0 * np.ones(self._K)
		# draw nsamples samples of eta from q(eta)
		for n in range(nsamples):
			# sample eta from q(\eta)
			for i in range(self._K):
				eta[i] = random.gauss(0, np.sqrt(nu_v[i])) + lambda_v[i]
			# compute p(w | \eta) - q(\eta)
			log_prob = self.sample_term(eta, lambda_v, nu_v, obs_wordidsd, obs_wordctsd)
			# compute theta = exp(eta) / sum(exp(eta))
			sum_t = np.sum(np.exp(eta))
			theta = np.divide(np.exp(eta), sum_t)

			# update e_theta
			for i in range(self._K):
				e_theta[i] = math_utli.log_sum(e_theta[i], log_prob + math_utli.safe_log(theta[i]))
		# normalize e_theta and set return vector
		sum_et = -1.0
		for i in range(self._K):
			e_theta[i] -= np.log(nsamples)
			sum_et = math_utli.log_sum(sum_et, e_theta[i])
		e_theta = np.exp(np.subtract(e_theta, sum_et))
		return e_theta
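
expected_theta accumulates the importance-sampling terms in log space via math_utli.log_sum to avoid underflow. The same pattern with plain numpy, where np.logaddexp plays the role of log_sum and the sample values are made up:

import numpy as np

# log-space weights of three samples for one theta component
log_terms = np.array([-10.2, -9.7, -11.5])

acc = -np.inf                      # log(0): empty accumulator
for lt in log_terms:
    acc = np.logaddexp(acc, lt)    # acc = log(exp(acc) + exp(lt))

# acc == log(sum(exp(log_terms))); subtracting log(n) and exponentiating
# gives the Monte Carlo average back in ordinary space
average = np.exp(acc - np.log(len(log_terms)))
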
Example #8
	def __init__(self, K, mu=None, cov=None):
		'''
		Arguments:
			K: number of topics
			mu and cov: hyperparameters of the logistic normal prior on the topic
				proportion vectors theta; defaults are used when they are omitted
		The corpus size D and vocabulary size W are derived from the training files.
		'''

		logger.info("CTM commence.")
		logger.info("Initializing...")
		if K is None:
			raise ValueError('the number of topics has to be specified.')
		# get the folder containing all the training files;
		# the observed and heldout folders have to be specified manually
		obs_filenames = os.listdir('d:\\mycode\\ctm_python\\state_union\\observed\\')
		path = 'd:\\mycode\\ctm_python\\state_union\\observed\\'

		logger.info("initializing id mapping from corpus, assuming identity")
		# collect the raw contents of every observed file
		txt_corpus = []
		for thefile in obs_filenames:
			with open(path + thefile, "rb") as f:
				strings = f.read()
				txt_corpus.append(strings)
		(dictionary, corpus) = preprocess.get_dict_and_corp(txt_corpus)
		logger.info("dictionary and corpus are generated")

		self.dictionary = dictionary
		self.corpus = corpus

		self._K = K                     # number of topics
         	logger.info("There are %i topics.", self._K)
		self._W = len(dictionary)   # number of all the terms
		self._D = len(corpus)       # number of documents

		# initialize wordid and wordcount list for the whole corpus
		self.wordids = list()
		self.wordcts = list()

		for d, doc in enumerate(self.corpus):
			wordidsd = [id for id, _ in doc]
			wordctsd = np.array([cnt for _, cnt in doc])
			self.wordids.append(wordidsd)
			self.wordcts.append(wordctsd)

		# mu   : K-size vector with 0 as initial value
		# cov  : K*K matrix with 1 as initial value , together they make a Gaussian
		if mu is None:
			self.mu = np.zeros(self._K)
		else:
			self.mu = mu
		if cov is None:
			self.cov = np.ones((self._K, self._K))
		else:
			self.cov = cov

		# if cov was left at its all-ones default it is singular and cannot be
		# inverted; in that case fall back to using cov itself
		try:
			self.inv_cov = np.linalg.inv(self.cov)
		except np.linalg.LinAlgError as err:
			if 'Singular matrix' in str(err):
				self.inv_cov = self.cov
			else:
				raise

		# self.inv_cov = np.linalg.inv(self.cov)
		self.log_det_inv_cov = math_utli.safe_log(np.linalg.det(self.inv_cov))

		self.ndata = 0  # cumulate count of number of docs processed

		# initialize topic distribution, i.e. self.log_beta
		sum = 0
		self.beta = np.zeros([self._K, self._W])
		self.log_beta = np.zeros([self._K, self._W])

		for i in range(self._K):
			# initialize beta with a randomly chosen doc
			# stuff K topics randomly
			doc_no = np.random.randint(self._D)
			nterms = len(self.wordcts[doc_no])
			for j in range(nterms):
				word_index = self.wordids[doc_no][j]
				self.log_beta[i][word_index] = self.wordcts[doc_no][j] 
				# logging.info('self.log_beta[i,j]-track %f', self.log_beta[i,j]) 

		for m in range(self._K):
			for n in range(self._W):
				self.log_beta[m][n] = self.log_beta[m][n] + 1.0 + np.random.ranf()
				# logger.info("log_beta %i %i - %f", m,n, self.log_beta[m][n] )

		# normalize and smooth in log space
		sum = math_utli.safe_log(np.sum(self.log_beta))
		logger.info("log_beta_sum : %f", sum)

		# little function to normalize the entries of self.log_beta
		def element_add(x):
			return math_utli.safe_log(x) - sum
		self.log_beta = map(element_add, self.log_beta)
		for m in range(self._K):
			for n in range(self._W):
				logger.info("log_beta %i %i - %f", m,n, self.log_beta[m][n] )

		logger.info("initialization finished.")
Example #9
		def element_add(x):
			return math_utli.safe_log(x) - sum