Example no. 1
0
    def initialize(self, Ks, Kb):
        """Accepts K number of topics in document.
            Initializes all of the hidden variable arrays now that it knows dimensions
            of topics, vocabulary, etc.
        """
        assert self.documents is not None
        assert Ks is not None
        assert Kb is not None

        K = Ks + Kb

        # give at least more documents than topics
        # so that it's not singular
        assert self.D > K

        self.K = K

        D = self.D
        W = self.W

        # "it suffices to fix alpha to uniform 1/K"
        # initialize to ones so that the topics are more evenly distributed
        # good for small datasets
        self.alpha = np.ones((K,)) * (1.0 / K)

        # Initialize the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(K, W)

        document_Nds = self.num_words_per(self.documents)
        self.phi = [(np.ones((document_Nds[d], K))*(1.0/K)) for d in xrange(D)]

        self.gamma = np.ones((D, K)) * (1.0 / K)
        graphlib.initialize_random(self.gamma)

        self.eta = graphlib.random_normal(0, 2.0, (Ks,))
        self.sigma_squared = 0.5

        print 'eta start: {0}'.format(self.eta)

        self.is_initialized = True
Example no. 2
0
    def initialize(self, Ku, Ks, Kb):
        """Accepts K number of topics in document.
            Initializes all of the hidden variable arrays now that it knows dimensions
            of topics, vocabulary, etc.
        """
        assert self.documents is not None
        assert Ku is not None
        assert Ks is not None
        assert Kb is not None

        K = Ku + Ks + Kb

        # give at least more documents than topics
        # so that it's not singular
        assert self.D > K

        self.K = K
        self.Ku = Ku
        self.Ks = Ks
        self.Kb = Kb

        self.Kc = self.Ku + self.Ks
        self.Kl = self.Ks + self.Kb

        W = self.W

        # Initialize the variational distribution q(beta|lambda)
        self.beta = topiclib.initialize_beta(K, W)

        # "it suffices to fix alpha to uniform 1/K"
        # initialize to ones so that the topics are more evenly distributed
        # good for small datasets
        self.alphaU = np.ones((Ku,)) * (1.0 / Ku)
        self.alphaS = np.ones((Ks,)) * (1.0 / Ks)
        self.alphaB = np.ones((Kb,)) * (1.0 / Kb)

        # todo: not using this yet
        #self.alphaD = ...
        
        def uniform_phi(Nds, size):
            D = len(Nds)
            return [(np.ones((Nds[d], size)) * (1.0 / size)) for d in xrange(D)]

        document_Nds = self.num_words_per(self.documents)
        self.phiD = uniform_phi(document_Nds, self.Ku)
        comment_Nds = self.num_words_per(self.comments)
        self.phiC = uniform_phi(comment_Nds, self.Kc)
        labeled_Nds = self.num_words_per(self.labeled)
        self.phiL = uniform_phi(labeled_Nds, self.Kl)
        background_Nds = self.num_words_per(self.background)
        self.phiB = uniform_phi(background_Nds, self.Kb)

        self.num_document_words = sum(document_Nds)
        self.num_comment_words = sum(comment_Nds)
        self.num_labeled_words = sum(labeled_Nds)
        self.num_background_words = sum(background_Nds)

        biggest = float(max(self.num_document_words, self.num_comment_words,
                      self.num_labeled_words, self.num_background_words))
        self.document_multiplier = biggest / self.num_document_words
        self.comment_multiplier = biggest / self.num_comment_words
        self.labeled_multiplier = biggest / self.num_labeled_words
        self.background_multiplier = biggest / self.num_background_words

        self.gammaD = np.ones((self.D, self.Ku)) * (1.0 / self.Ku)
        self.gammaC = np.ones((self.D, self.Kc)) * (1.0 / self.Kc)
        self.gammaL = np.ones((self.L, self.Kl)) * (1.0 / self.Kl)
        self.gammaB = np.ones((self.B, self.Kb)) * (1.0 / self.Kb)
        graphlib.initialize_random(self.gammaD)
        graphlib.initialize_random(self.gammaC)
        graphlib.initialize_random(self.gammaL)
        graphlib.initialize_random(self.gammaB)

        self.eta = graphlib.random_normal(0, 2.0, (Ks,))
        self.sigma_squared = 0.5

        print 'eta start: {0}'.format(self.eta)

        self.is_initialized = True