    def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
        # Infer a topic distribution for every message in the thread.
        raw_topics = [
            lda.get_document_topics(
                dictionary.doc2bow(IU.tokenize_document(id2msg[id_])),
                minimum_probability=0)
            for id_ in msg_ids
        ]
        topic_vects = np.array([[v for _, v in topics]
                                for topics in raw_topics])
        mean_topic_vect = np.mean(topic_vects, axis=0)
        # KL divergence from the mean topic distribution to each message's one.
        diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]

        return np.mean(diffs)

    def frequent_terms(self, interactions, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

        # concatenate the messages attached to this graph's nodes
        message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        tokens = IU.tokenize_document(concated_msg)
        freqs = Counter(tokens)
        terms = [t for t, _ in freqs.most_common(top_k)]
        print('frequent_terms', terms)
        return terms

    def tfidf_terms(self, interactions, dictionary, top_k=10):
        text = '\n'.join(
            ['{} {}'.format(m['subject'], m['body']) for m in interactions])
        # load the pre-fitted TF-IDF transformer (path is specific to the original setup)
        with open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'rb') as f:
            tfidf_vec = pkl.load(f)
        counts = dictionary.doc2bow(IU.tokenize_document(text))
        # expand the sparse bag-of-words into a dense count vector over the vocabulary
        raw_vect = np.zeros(len(dictionary.keys()))
        for word, cnt in counts:
            raw_vect[word] = cnt

        vect = tfidf_vec.transform([raw_vect])
        vect = np.asarray(vect.todense()).flatten()

        tfidf_terms = [dictionary[i] for i in np.argsort(vect)[::-1][:top_k]]
        print('tfidf_terms', tfidf_terms)
        return tfidf_terms
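These methods are excerpted from a class whose instances carry a networkx graph in self.g, and the snippets rely on several imports that are not shown. A minimal sketch of the assumed module header (the IU alias for the tokenization helper is kept from the snippets; its import path is hypothetical):

# Assumed imports for the snippets above -- a sketch, not the original header.
import pickle as pkl
from collections import Counter

import numpy as np
import scipy.stats

# import interaction_utils as IU   # hypothetical path for the tokenize_document() helper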
Example No. 4
    def frequent_terms(self, interactions, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(
                m['subject'], m['body']
            )

        # concatenate the messages attached to this graph's nodes
        message_ids = [self.g.node[n]['message_id']
                       for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        tokens = IU.tokenize_document(concated_msg)
        freqs = Counter(tokens)
        terms = [t for t, _ in freqs.most_common(top_k)]
        print('frequent_terms', terms)
        return terms
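frequent_terms concatenates the subject and body of every message attached to the graph's nodes, tokenizes the result, and returns the top_k most common tokens. A minimal usage sketch, with made-up interaction records and a hypothetical instance mg whose graph nodes carry matching message_id attributes:

# Hypothetical usage of frequent_terms(); mg and the records are illustrative only.
interactions = [
    {'message_id': 1, 'subject': 'release plan', 'body': 'ship the release next week'},
    {'message_id': 2, 'subject': 're: release plan', 'body': 'the release notes are ready'},
]
# mg.g is assumed to be a graph whose nodes reference these message ids.
terms = mg.frequent_terms(interactions, top_k=5)
# With this toy data, 'release' would likely top the list (subject to tokenization).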
Example No. 5
    def topics(self, interactions, dictionary, lda, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(
                m['subject'], m['body']
            )

        # topic_dist
        message_ids = [self.g.node[n]['message_id']
                       for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        bow = dictionary.doc2bow(IU.tokenize_document(concated_msg))
        # infer the thread's topic distribution (the iterations kwarg suggests a Mallet-style wrapper)
        topic_dist = lda.__getitem__(bow, iterations=100)
        print("topic inference done")
        # topic_dist = lda.get_document_topics(
        #     bow,
        #     minimum_probability=0
        # )
        topic_dist = np.asarray([v for _, v in topic_dist])

        # zero out topics with negligible probability (< 0.05)
        topic_dist[topic_dist < 0.05] = 0

        # per-topic word weight matrix (one row per topic)
        if not hasattr(lda, 'wordtopics'):
            lda.load_word_topics()
        beta = lda.wordtopics
        # beta = lda.state.get_lambda()

        # normalize each topic's word distribution and weight it by its probability
        weighted_terms = (
            beta / beta.sum(axis=1)[:, None] * topic_dist[:, None]
        ).sum(axis=0)

        bestn = np.argsort(weighted_terms)[::-1][:top_k]

        topic_terms = [lda.id2word[i] for i in bestn]

        top_topics = np.nonzero(topic_dist)  # np.argsort(topic_dist)[::-1][:3]
        print('top_topics', top_topics)
        # topic_divergence = self._topic_divergence(message_ids, id2msg,
        #                                           dictionary, lda)
        return {  # 'topic_dist': topic_dist,
            'topic_terms': topic_terms,
            'top_topics': top_topics
            # 'topic_divergence': topic_divergence
        }
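The weighting step above can be read as: normalize each row of beta (one word distribution per topic), scale it by that topic's inferred probability, and sum over topics, giving an expected word distribution for the whole thread. A small self-contained numpy sketch of the same arithmetic, with made-up numbers:

import numpy as np

# Toy word-topic counts: 3 topics over a vocabulary of 4 words.
beta = np.array([[8., 1., 1., 0.],
                 [0., 5., 5., 0.],
                 [1., 1., 1., 7.]])
topic_dist = np.array([0.7, 0.0, 0.3])   # second topic already masked to 0

# Same expression as in topics(): normalize rows, weight by topic probability, sum.
weighted_terms = (beta / beta.sum(axis=1)[:, None] * topic_dist[:, None]).sum(axis=0)
bestn = np.argsort(weighted_terms)[::-1]   # heaviest word ids first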
Example No. 6
    def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
        # Infer a topic distribution for every message in the thread.
        raw_topics = [
            lda.get_document_topics(
                dictionary.doc2bow(
                    IU.tokenize_document(id2msg[id_])
                ),
                minimum_probability=0
            )
            for id_ in msg_ids
        ]
        topic_vects = np.array([[v for _, v in topics]
                                for topics in raw_topics])
        mean_topic_vect = np.mean(topic_vects, axis=0)
        # KL divergence from the mean topic distribution to each message's one.
        diffs = [scipy.stats.entropy(mean_topic_vect, v)
                 for v in topic_vects]

        return np.mean(diffs)
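_topic_divergence measures how far individual messages drift from the thread's average topic mixture: it computes the KL divergence from the mean topic vector to each per-message topic vector and averages the results. The same computation on toy distributions (not real model output):

import numpy as np
import scipy.stats

# Toy per-message topic distributions; each row sums to 1.
topic_vects = np.array([[0.6, 0.3, 0.1],
                        [0.5, 0.4, 0.1],
                        [0.1, 0.2, 0.7]])
mean_topic_vect = topic_vects.mean(axis=0)

# scipy.stats.entropy(p, q) with two arguments is the KL divergence D(p || q).
diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]
divergence = np.mean(diffs)   # grows as messages disagree topically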
Example No. 7
    def tfidf_terms(self, interactions, dictionary, top_k=10):
        text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
                          for m in interactions])
        # load the pre-fitted TF-IDF transformer (path is specific to the original setup)
        with open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'rb') as f:
            tfidf_vec = pkl.load(f)
        counts = dictionary.doc2bow(
            IU.tokenize_document(text)
        )
        # expand the sparse bag-of-words into a dense count vector over the vocabulary
        raw_vect = np.zeros(len(dictionary.keys()))
        for word, cnt in counts:
            raw_vect[word] = cnt

        vect = tfidf_vec.transform([raw_vect])
        vect = np.asarray(vect.todense()).flatten()

        tfidf_terms = [dictionary[i]
                       for i in np.argsort(vect)[::-1][:top_k]]
        print('tfidf_terms', tfidf_terms)
        return tfidf_terms
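tfidf_terms loads a pre-fitted TF-IDF transformer from a fixed path and applies it to a dense count vector built from the gensim dictionary; the pickled object exposes a transform() that returns a sparse matrix, which matches scikit-learn's TfidfTransformer, though that class is an assumption here. A hedged sketch of how such a pickle could be produced (toy corpus, placeholder path):

# Sketch of fitting and pickling a transformer compatible with tfidf_terms().
# The sklearn class, the toy corpus and the output path are assumptions.
import pickle as pkl
import numpy as np
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfTransformer

docs = [['release', 'plan'], ['release', 'notes', 'ready']]   # toy tokenized corpus
dictionary = Dictionary(docs)

# Dense count matrix aligned with the dictionary's word ids, one row per document.
counts = np.zeros((len(docs), len(dictionary)))
for row, doc in enumerate(docs):
    for word_id, cnt in dictionary.doc2bow(doc):
        counts[row, word_id] = cnt

tfidf_vec = TfidfTransformer().fit(counts)
with open('tfidf.pkl', 'wb') as f:   # placeholder path
    pkl.dump(tfidf_vec, f)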
    def topics(self, interactions, dictionary, lda, top_k=10):
        id2msg = {}
        for m in interactions:
            id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

        # topic_dist
        message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
        concated_msg = ' '.join([id2msg[mid] for mid in message_ids])
        bow = dictionary.doc2bow(IU.tokenize_document(concated_msg))
        # infer the thread's topic distribution (the iterations kwarg suggests a Mallet-style wrapper)
        topic_dist = lda.__getitem__(bow, iterations=100)
        print("topic inference done")
        # topic_dist = lda.get_document_topics(
        #     bow,
        #     minimum_probability=0
        # )
        topic_dist = np.asarray([v for _, v in topic_dist])

        # zero out topics with negligible probability (< 0.05)
        topic_dist[topic_dist < 0.05] = 0

        # per-topic word weight matrix (one row per topic)
        if not hasattr(lda, 'wordtopics'):
            lda.load_word_topics()
        beta = lda.wordtopics
        # beta = lda.state.get_lambda()

        # normalize each topic's word distribution and weight it by its probability
        weighted_terms = (beta / beta.sum(axis=1)[:, None] *
                          topic_dist[:, None]).sum(axis=0)

        bestn = np.argsort(weighted_terms)[::-1][:top_k]

        topic_terms = [lda.id2word[i] for i in bestn]

        top_topics = np.nonzero(topic_dist)  # np.argsort(topic_dist)[::-1][:3]
        print('top_topics', top_topics)
        # topic_divergence = self._topic_divergence(message_ids, id2msg,
        #                                           dictionary, lda)
        return {  # 'topic_dist': topic_dist,
            'topic_terms': topic_terms,
            'top_topics': top_topics
            # 'topic_divergence': topic_divergence
        }