def initialize(self, Ks, Kb):
    """Accepts the topic counts Ks and Kb (K = Ks + Kb topics total).

    Initializes all of the hidden variable arrays now that the
    dimensions of topics, vocabulary, etc. are known.
    """
    assert self.documents is not None
    assert Ks is not None
    assert Kb is not None
    K = Ks + Kb

    # require more documents than topics
    # so that the problem is not singular
    assert self.D > K

    self.K = K
    D = self.D
    W = self.W

    # "it suffices to fix alpha to uniform 1/K"
    # start alpha uniform at 1/K so the topics are evenly distributed;
    # good for small datasets
    self.alpha = np.ones((K,)) * (1.0 / K)

    # initialize the variational distribution q(beta|lambda)
    self.beta = topiclib.initialize_beta(K, W)

    document_Nds = self.num_words_per(self.documents)
    self.phi = [np.ones((document_Nds[d], K)) * (1.0 / K)
                for d in xrange(D)]

    self.gamma = np.ones((D, K)) * (1.0 / K)
    graphlib.initialize_random(self.gamma)

    self.eta = graphlib.random_normal(0, 2.0, (Ks,))
    self.sigma_squared = 0.5

    print 'eta start: {0}'.format(self.eta)

    self.is_initialized = True
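
# ---------------------------------------------------------------------
# The topiclib/graphlib helpers called above are not shown in this
# section.  Below is a minimal sketch of plausible stand-ins, under
# these assumptions: initialize_beta returns a row-normalized K x W
# topic-word matrix, initialize_random perturbs a matrix in place so
# the uniform starting point is no longer symmetric, and random_normal
# wraps numpy's Gaussian sampler.  The real library code may differ.

import numpy as np

def _sketch_initialize_beta(K, W):
    """Hypothetical stand-in for topiclib.initialize_beta: draw each of
    the K topics as a random distribution over the W vocabulary words,
    so every row sums to one."""
    beta = np.random.random((K, W))
    beta /= beta.sum(axis=1)[:, np.newaxis]
    return beta

def _sketch_initialize_random(matrix):
    """Hypothetical stand-in for graphlib.initialize_random: overwrite
    the matrix in place with random positive rows that each sum to one,
    breaking the symmetry of the uniform initialization."""
    matrix[:] = np.random.random(matrix.shape)
    matrix /= matrix.sum(axis=1)[:, np.newaxis]

def _sketch_random_normal(mean, stddev, shape):
    """Hypothetical stand-in for graphlib.random_normal: a thin wrapper
    over numpy's normal sampler."""
    return np.random.normal(mean, stddev, shape)
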
def initialize(self, Ku, Ks, Kb):
    """Accepts the topic counts Ku, Ks, and Kb
    (K = Ku + Ks + Kb topics total).

    Initializes all of the hidden variable arrays now that the
    dimensions of topics, vocabulary, etc. are known.
    """
    assert self.documents is not None
    assert Ku is not None
    assert Ks is not None
    assert Kb is not None
    K = Ku + Ks + Kb

    # require more documents than topics
    # so that the problem is not singular
    assert self.D > K

    self.K = K
    self.Ku = Ku
    self.Ks = Ks
    self.Kb = Kb
    # combined dimensionalities: comments use Kc = Ku + Ks topics,
    # labeled documents use Kl = Ks + Kb topics
    self.Kc = self.Ku + self.Ks
    self.Kl = self.Ks + self.Kb
    W = self.W

    # initialize the variational distribution q(beta|lambda)
    self.beta = topiclib.initialize_beta(K, W)

    # "it suffices to fix alpha to uniform 1/K"
    # start each alpha uniform at 1/K so the topics are evenly
    # distributed; good for small datasets
    self.alphaU = np.ones((Ku,)) * (1.0 / Ku)
    self.alphaS = np.ones((Ks,)) * (1.0 / Ks)
    self.alphaB = np.ones((Kb,)) * (1.0 / Kb)
    # todo: not using this yet
    #self.alphaD = ...

    def uniform_phi(Nds, size):
        D = len(Nds)
        return [np.ones((Nds[d], size)) * (1.0 / size)
                for d in xrange(D)]

    document_Nds = self.num_words_per(self.documents)
    self.phiD = uniform_phi(document_Nds, self.Ku)
    comment_Nds = self.num_words_per(self.comments)
    self.phiC = uniform_phi(comment_Nds, self.Kc)
    labeled_Nds = self.num_words_per(self.labeled)
    self.phiL = uniform_phi(labeled_Nds, self.Kl)
    background_Nds = self.num_words_per(self.background)
    self.phiB = uniform_phi(background_Nds, self.Kb)

    self.num_document_words = sum(document_Nds)
    self.num_comment_words = sum(comment_Nds)
    self.num_labeled_words = sum(labeled_Nds)
    self.num_background_words = sum(background_Nds)

    # scale factors so that each corpus can be weighted as if it had
    # as many words as the largest corpus
    biggest = float(max(self.num_document_words,
                        self.num_comment_words,
                        self.num_labeled_words,
                        self.num_background_words))
    self.document_multiplier = biggest / self.num_document_words
    self.comment_multiplier = biggest / self.num_comment_words
    self.labeled_multiplier = biggest / self.num_labeled_words
    self.background_multiplier = biggest / self.num_background_words

    self.gammaD = np.ones((self.D, self.Ku)) * (1.0 / self.Ku)
    self.gammaC = np.ones((self.D, self.Kc)) * (1.0 / self.Kc)
    self.gammaL = np.ones((self.L, self.Kl)) * (1.0 / self.Kl)
    self.gammaB = np.ones((self.B, self.Kb)) * (1.0 / self.Kb)
    graphlib.initialize_random(self.gammaD)
    graphlib.initialize_random(self.gammaC)
    graphlib.initialize_random(self.gammaL)
    graphlib.initialize_random(self.gammaB)

    self.eta = graphlib.random_normal(0, 2.0, (Ks,))
    self.sigma_squared = 0.5

    print 'eta start: {0}'.format(self.eta)

    self.is_initialized = True
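
# ---------------------------------------------------------------------
# Why the four *_multiplier values exist: the corpora can have very
# different sizes, and each multiplier scales a corpus up to the weight
# of the largest one, so that n_words * multiplier is the same for all
# four.  A minimal sketch of the arithmetic; the corpus names and word
# counts below are made up for illustration only.

def _sketch_corpus_multipliers():
    num_words = {'document': 40000, 'comment': 250000,
                 'labeled': 8000, 'background': 120000}
    biggest = float(max(num_words.values()))
    multipliers = dict((name, biggest / n)
                       for name, n in num_words.items())
    # every corpus now carries the same total weight:
    # n * multipliers[name] == biggest for each name
    for name, n in sorted(num_words.items()):
        print '%-10s %8d words  x %7.2f = %.0f' % (
            name, n, multipliers[name], n * multipliers[name])
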