def infer(self, input, doc):
    """Run topic inference over tokenized text and write the result into doc.

    The document is initialized with the model's topic count and alpha,
    filled with randomly initialized topic assignments for the terms the
    model knows (ids different from OOV), and then passed to the sampler
    that matches its type.

    Args:
        input: tokenized text. For an LDADoc it is iterated token by token;
            for an SLDADoc it is iterated sentence by sentence, and each
            sentence is iterated token by token.
        doc: LDADoc or SLDADoc instance that receives the results. Any other
            type is reported through the logger and left untouched.
    """
    fix_random_seed()

    # SLDADoc is tested first because the original LDA branch explicitly
    # excludes SLDADoc instances.
    treat_as_slda = isinstance(doc, SLDADoc)
    if not (treat_as_slda or isinstance(doc, LDADoc)):
        logger.error("Wrong Doc Type!")
        return

    model = self.__model
    doc.init(model.num_topics())
    doc.set_alpha(model.alpha())

    if treat_as_slda:
        for sentence in input:
            term_ids = [
                tid for tid in map(model.term_id, sentence) if tid != OOV
            ]
            # One random initial topic per sentence, drawn after its ids
            # have been collected; a sentence is added even if it is empty.
            doc.add_sentence(Sentence(rand_k(model.num_topics()), term_ids))
        self.slda_infer(doc, 20, 50)
        return

    for word in input:
        term = model.term_id(word)
        if term == OOV:
            continue
        # One random initial topic per in-vocabulary token.
        doc.add_token(Token(rand_k(model.num_topics()), term))
    self.lda_infer(doc, 20, 50)
Exemple #2
0
    def __doc_proposal(self, doc, token):
        """Propose a topic for token using the document-level proposal.

        A dart is thrown over the range [0, doc.size() + alpha_sum). When it
        lands below doc.size(), the candidate topic is copied from the token
        (or sentence) stored at that index of the document; otherwise the
        candidate is drawn with rand_k over all topics. A candidate that
        differs from the current topic is accepted or rejected by comparing
        a fresh rand() draw against the transition probability.

        Args:
            doc: LDADoc (paired with a Token) or SLDADoc (paired with a
                Sentence).
            token: the Token or Sentence whose topic is being resampled.

        Returns:
            The candidate topic when it equals the current topic or is
            accepted, the current topic when the candidate is rejected, and
            None when doc/token are not one of the two supported pairs.
        """
        if isinstance(doc, LDADoc) and isinstance(token, Token):
            current = token.topic
            throw = rand() * (doc.size() + self.__model.alpha_sum())
            candidate = (doc.token(int(throw)).topic
                         if throw < doc.size()
                         else rand_k(self.__model.num_topics()))
            if candidate == current:
                return candidate

            q_current = self.__doc_proposal_distribution(doc, current)
            q_candidate = self.__doc_proposal_distribution(doc, candidate)
            p_current = self.__proportional_function(doc, token, current)
            p_candidate = self.__proportional_function(doc, token, candidate)
            acceptance = float((p_candidate * q_current) /
                               (p_current * q_candidate))
            # Accept the candidate when the uniform draw falls below the
            # transition probability; otherwise keep the current topic.
            return candidate if rand() < acceptance else current

        if isinstance(doc, SLDADoc) and isinstance(token, Sentence):
            sentence = token
            current = sentence.topic
            throw = rand() * (doc.size() + self.__model.alpha_sum())
            candidate = (doc.sent(int(throw)).topic
                         if throw < doc.size()
                         else rand_k(self.__model.num_topics()))
            if candidate == current:
                return candidate

            p_current = self.__proportional_function(doc, sentence, current)
            p_candidate = self.__proportional_function(
                doc, sentence, candidate)
            q_current = self.__doc_proposal_distribution(doc, current)
            q_candidate = self.__doc_proposal_distribution(doc, candidate)
            acceptance = float((p_candidate * q_current) /
                               (p_current * q_candidate))
            # Same accept/reject rule as the token-level branch.
            return candidate if rand() < acceptance else current

        # Unsupported doc/token pairing: no topic is proposed.
        return None
Exemple #3
0
 def generate(self):
     """Draw one sample index from the alias table.

     A column is picked with rand_k over self.size(), then a draw from
     rand() is compared against the probability stored for that column to
     choose between the column itself and its alias entry.

     Returns:
         The sampled index: either the picked column or its alias.
     """
     column = rand_k(self.size())
     # Keep the draw as a float. The previous int(rand()) truncated it, so
     # (assuming rand() yields values in [0, 1) -- TODO confirm) it was
     # always 0, the comparison below could never be true for a
     # non-negative probability, and the alias entry was always returned.
     coin = rand()
     # NOTE(review): the comparison direction is kept from the original;
     # confirm against how the table is built whether the stored value is
     # the chance of keeping the column or of taking its alias.
     return column if coin > self.__prob[column] else self.__alias[column]