Example #1
    def _test_lda_importance_sampling(self, testing_df, fold_idx, 
                                      is_num_samples, is_iters, 
                                      training_gibbs, use_posterior_alpha=True):

        print "Run testing importance sampling " + str(testing_df.shape)
        topics = training_gibbs.topic_word_

        if use_posterior_alpha:
            # use posterior alpha as the topic prior during importance sampling
            topic_prior = training_gibbs.posterior_alpha[:, None]
        else:
            # use prior alpha as the topic prior during importance sampling
            topic_prior = np.ones((self.K, 1))
            topic_prior = topic_prior / np.sum(topic_prior)            
            topic_prior = topic_prior * self.K * self.alpha

        print 'topic_prior = ' + str(topic_prior)
        marg = 0
        n_words = 0
        for d in range(testing_df.shape[0]):
            # estimate the log marginal likelihood of each held-out document
            # by importance sampling, then accumulate over all documents
            document = testing_df.iloc[[d]]
            words = utils.word_indices(document)
            doc_marg = ldae_is_variants(words, topics, topic_prior,
                                        num_samples=is_num_samples, variant=3,
                                        variant_iters=is_iters)
            print "\td = " + str(d) + " doc_marg=" + str(doc_marg)
            sys.stdout.flush()
            marg += doc_marg
            n_words += len(words)

        # per-word perplexity: exp of the negative average per-word log likelihood
        perp = np.exp(-(marg / n_words))

        print "lda is testing log evidence fold " + str(fold_idx) + " = " + str(marg)
        print "lda is testing perplexity fold " + str(fold_idx) + " = " + str(perp)
        print
        return marg, perp
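For reference, the perplexity reported above is just the exponent of the negative average per-word log likelihood over the whole test fold. Below is a minimal standalone sketch of that calculation; the names (perplexity_from_log_evidence, log_evidences, doc_lengths) are illustrative and not part of the original code:

import numpy as np

def perplexity_from_log_evidence(log_evidences, doc_lengths):
    # log_evidences: per-document log marginal likelihoods (the doc_marg values)
    # doc_lengths: number of word occurrences in each document
    total_log_evidence = float(np.sum(log_evidences))
    total_words = float(np.sum(doc_lengths))
    # per-word perplexity = exp(-average log likelihood per word)
    return np.exp(-total_log_evidence / total_words)

# two documents of 100 words each with log evidences -550 and -620
# give exp(1170 / 200), roughly 347
print(perplexity_from_log_evidence([-550.0, -620.0], [100, 100]))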
Example #2
    def get_motif_contributions(self, parent_peak_id):

        # work out the contributions of the different M2Ms to this parent peak
        row_idx = self.ms1["peakID"] == parent_peak_id
        pos = np.nonzero(row_idx.values)[0]
        d = pos.item() # row index of the parent peak; assumes the peak ID is unique
        motifs_of_interest = np.nonzero(self.doc_topic[d])[0].tolist()

        document = self.df.iloc[[d]]
        word_idx = utils.word_indices(document)
        results = {}
        for pos in range(len(word_idx)):
            n = word_idx[pos]
            k = self.model.Z[(d, pos)] # topic (M2M) assigned to this word occurrence

            # IMPORTANT: consider only the validated M2Ms; a word might also be
            # generated by other M2Ms that are not in our list!
            if k in motifs_of_interest:
                word = self.vocab[n]
                if word in results:
                    results[word].append(k)
                else:
                    results[word] = [k]

        contributions = {}
        for word in results:
            # for each word, compute the fraction of its occurrences assigned to each motif
            topics = Counter(results[word])
            total = float(sum(topics.values()))
            ratio = {key: (topics[key] / total) for key in topics}
            contributions[word] = ratio

        return contributions
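To make the normalisation in the final loop concrete, here is a small hedged example of turning one word's topic assignments into contribution ratios; the assignment values below are invented for illustration:

from collections import Counter

# topic (M2M) assignments collected for one word across its occurrences
assignments = [3, 3, 7]           # motif 3 generated the word twice, motif 7 once
topics = Counter(assignments)     # Counter({3: 2, 7: 1})
total = float(sum(topics.values()))
ratio = {key: topics[key] / total for key in topics}
print(ratio)                      # {3: 0.666..., 7: 0.333...}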
Example #3
    def __init__(self, df, vocab, K, alpha, beta, random_state=None, previous_model=None, sparse=False):
        """
        Initialises the collapsed Gibbs sampling for LDA

        Arguments:
        - df: the dataframe of counts of vocabularies x documents
        - K: no. of topics
        - alpha: symmetric prior on document-topic assignment
        - beta: symmetric prior on word-topic assignment
        - previous_model: previous LDA run, if any
        """

        print "CGS LDA initialising"
        self.sparse = sparse
        if not self.sparse:
            self.df = df.replace(np.nan, 0)
        else:
            self.df = df

        self.alpha = alpha
        self.beta = beta

        self.D = df.shape[0]    # total no of docs
        self.N = df.shape[1]    # total no of words
        self.vocab = vocab
        assert(len(self.vocab)==self.N)

        # set total no of topics
        self.cv = False
        self.previous_model = previous_model
        if self.previous_model is not None:

            # if some old topics were fixed
            if hasattr(self.previous_model, 'selected_topics'):

                # no. of new topics
                self.K = K

                # no. of previously selected topics
                self.previous_K = len(self.previous_model.selected_topics)

                # Get the previous ckn and ck values from the training stage.
                # During gibbs update in this testing stage, assignment of word
                # to the first previous_K topics will use the previous fixed
                # topic-word distributions -- as specified by previous_ckn and previous_ck
                self.previous_ckn = self.previous_model.selected_ckn
                self.previous_ck = self.previous_model.selected_ck
                self.previous_vocab = self.previous_model.selected_vocab
                assert(len(self.previous_ck)==self.previous_K)
                assert(self.previous_ckn.shape[0]==len(self.previous_ck))
                assert(self.previous_ckn.shape[1]==len(self.previous_vocab))

                # make previous_ckn have the right number of columns
                N_prev_words = len(self.previous_vocab)
                N_diff = self.N - N_prev_words
                temp = np.zeros((self.previous_K, N_diff), int32)
                self.previous_ckn = np.hstack((self.previous_ckn, temp)) # size is previous_K x N

                # make previous_ckn have the right number of rows
                temp = np.zeros((self.K, self.N), int32)
                self.previous_ckn = np.vstack((self.previous_ckn, temp)) # size is (previous_K+K) x N

                # make previous_ck have the right length
                temp = np.zeros(self.K, int32)
                self.previous_ck = np.hstack((self.previous_ck, temp)) # length is (previous_K+K)

                # total no. of topics = old + new topics
                self.K = self.K + self.previous_K
                print "Total no. of topics = " + str(self.K)

                # set the first previous-K elements in alpha to the previous value
                self.alpha = np.ones(self.K) * alpha
                for k in range(self.previous_K):
                    self.alpha[k] = self.previous_model.selected_alpha[k]

                # set previous beta for the words that have been fixed from before
                self.beta = np.ones(self.N) * beta
                for n in range(N_prev_words):
                    self.beta[n] = self.previous_model.selected_beta[n]

            else:

                # otherwise all previous topics were fixed, for cross-validation
                self.cv = True
                self.K = K
                self.previous_ckn = self.previous_model.ckn
                self.previous_ck = self.previous_model.ck
                self.previous_K = K
                self.alpha = np.ones(self.K) * alpha
                self.beta = np.ones(self.N) * beta
        else:

            # for training stage
            self.K = K
            self.previous_ckn = np.zeros((self.K, self.N), int32)
            self.previous_ck = np.zeros(self.K, int32)
            self.previous_K = 0 # no old topics
            self.alpha = np.ones(self.K) * alpha
            self.beta = np.ones(self.N) * beta


        # allocate the count matrices for the current run:
        # ckn = topic-word counts (K x N), ck = total words per topic,
        # cdk = document-topic counts (D x K), cd = total words per document
        self.ckn = np.zeros((self.K, self.N), int32)
        self.ck = np.zeros(self.K, int32)
        self.cdk = np.zeros((self.D, self.K), int32)
        self.cd = np.zeros(self.D, int32)

        # use a fixed seed by default so that repeated Gibbs runs give the same results
        if random_state is None:
            self.random_state = RandomState(1234567890)
        else:
            self.random_state = random_state

        # randomly assign words to topics
        self.Z = {}
        for d in range(self.D):
            if d % 10 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            document = self.df[d, :] if self.sparse else self.df.iloc[[d]]
            word_idx = utils.word_indices(document, sparse=sparse)
            for pos, n in enumerate(word_idx):
                k = self.random_state.randint(self.K)
                self.cdk[d, k] += 1
                self.cd[d] += 1
                self.ckn[k, n] += 1
                self.ck[k] += 1
                self.Z[(d, pos)] = k
        print

        # turn the word counts in each document into a list of (position, word index) occurrences
        self.document_indices = {}
        for d in range(self.D):
            document = self.df[d, :] if self.sparse else self.df.iloc[[d]]
            word_idx = utils.word_indices(document, sparse=self.sparse)
            word_locs = []
            for pos, n in enumerate(word_idx):
                word_locs.append((pos, n))
            self.document_indices[d] = word_locs

        self.samples = [] # store the samples
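The count matrices set up above are what a collapsed Gibbs sweep later updates word by word. As a rough illustration (not the class's own sampling code), a single resampling step for one word occurrence could look like the sketch below, assuming symmetric scalar alpha and beta and ignoring the previous_ckn / previous_ck bookkeeping:

import numpy as np

def resample_topic(d, n, k_old, ckn, ck, cdk, cd, alpha, beta, random_state):
    K, N = ckn.shape
    # remove the word's current assignment from all counts
    ckn[k_old, n] -= 1; ck[k_old] -= 1
    cdk[d, k_old] -= 1; cd[d] -= 1
    # collapsed conditional p(z_dn = k | everything else)
    word_term = (ckn[:, n] + beta) / (ck + N * beta)   # how much topic k likes word n
    doc_term = (cdk[d] + alpha) / (cd[d] + K * alpha)  # how much doc d uses topic k
    p = word_term * doc_term
    p = p / p.sum()
    k_new = random_state.multinomial(1, p).argmax()
    # add the word back under its newly sampled topic
    ckn[k_new, n] += 1; ck[k_new] += 1
    cdk[d, k_new] += 1; cd[d] += 1
    return k_new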