# Imports needed by this section. `IU` is the project's text-processing
# helper (provides `tokenize_document`); its import path is
# project-specific, e.g.:
# import interactions_util as IU
from collections import Counter

import cPickle as pkl  # plain `pickle` on Python 3
import numpy as np
import scipy.stats


def _topic_divergence(self, msg_ids, id2msg, dictionary, lda):
    """Mean KL divergence from the average topic distribution of the
    messages to each individual message's topic distribution."""
    raw_topics = [
        lda.get_document_topics(
            dictionary.doc2bow(IU.tokenize_document(id2msg[id_])),
            minimum_probability=0
        )
        for id_ in msg_ids
    ]
    topic_vects = np.array([[v for _, v in topics] for topics in raw_topics])
    mean_topic_vect = np.mean(topic_vects, axis=0)
    # scipy.stats.entropy(p, q) with two arguments is the KL divergence
    # KL(p || q)
    diffs = [scipy.stats.entropy(mean_topic_vect, v) for v in topic_vects]
    return np.mean(diffs)
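# Illustration (toy numbers, not from the pipeline): for two messages with
# topic vectors [0.9, 0.1] and [0.5, 0.5], the mean is [0.7, 0.3] and the
# value returned above is the average of KL([0.7, 0.3] || [0.9, 0.1]) and
# KL([0.7, 0.3] || [0.5, 0.5]), i.e. how far, on average, individual
# messages drift from the thread's mean topic mix.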
def frequent_terms(self, interactions, top_k=10):
    """Return the `top_k` most frequent terms in the graph's messages."""
    id2msg = {}
    for m in interactions:
        id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

    message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
    concatenated_msg = ' '.join([id2msg[mid] for mid in message_ids])
    tokens = IU.tokenize_document(concatenated_msg)
    freqs = Counter(tokens)
    terms = [t for t, _ in freqs.most_common(top_k)]
    print 'frequent_terms', terms
    return terms
def tfidf_terms(self, interactions, dictionary, top_k=10):
    """Return the `top_k` terms of the interactions ranked by tf-idf."""
    text = '\n'.join(['{} {}'.format(m['subject'], m['body'])
                      for m in interactions])
    # pre-fitted tf-idf transformer (hard-coded path)
    with open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'rb') as f:
        tfidf_vec = pkl.load(f)
    counts = dictionary.doc2bow(IU.tokenize_document(text))
    raw_vect = np.zeros(len(dictionary))
    for word, cnt in counts:
        raw_vect[word] = cnt
    vect = tfidf_vec.transform([raw_vect])
    vect = np.asarray(vect.todense()).flatten()
    tfidf_terms = [dictionary[i] for i in np.argsort(vect)[::-1][:top_k]]
    print 'tfidf_terms', tfidf_terms
    return tfidf_terms
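# Note: the pickle above is assumed to hold a fitted
# sklearn.feature_extraction.text.TfidfTransformer (anything with a
# `transform` accepting a document-term count matrix would do). A sketch of
# how such a file could have been produced (names hypothetical):
#
#     from sklearn.feature_extraction.text import TfidfTransformer
#     counts_matrix = ...  # n_docs x n_terms bag-of-words counts
#     tfidf = TfidfTransformer().fit(counts_matrix)
#     with open('/cs/home/hxiao/code/lst/tmp/tfidf.pkl', 'wb') as f:
#         pkl.dump(tfidf, f)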
def topics(self, interactions, dictionary, lda, top_k=10):
    """Infer the topic distribution of the graph's messages and return
    the top weighted terms and the indices of the non-trivial topics."""
    id2msg = {}
    for m in interactions:
        id2msg[m['message_id']] = u"{} {}".format(m['subject'], m['body'])

    # topic_dist
    message_ids = [self.g.node[n]['message_id'] for n in self.g.nodes()]
    concatenated_msg = ' '.join([id2msg[mid] for mid in message_ids])
    bow = dictionary.doc2bow(IU.tokenize_document(concatenated_msg))
    # relies on an LDA implementation whose __getitem__ accepts an
    # `iterations` keyword (e.g. a patched gensim model)
    topic_dist = lda.__getitem__(bow, iterations=100)
    print 'topic inference done'
    # topic_dist = lda.get_document_topics(
    #     bow,
    #     minimum_probability=0
    # )
    topic_dist = np.asarray([v for _, v in topic_dist])
    # mask out trivial topics
    topic_dist[topic_dist < 0.05] = 0

    # topic_terms
    if not hasattr(lda, 'wordtopics'):
        lda.load_word_topics()
    beta = lda.wordtopics
    # beta = lda.state.get_lambda()

    # normalize each topic's word counts to a distribution, then average
    # the distributions weighted by the document's topic weights
    weighted_terms = (
        beta / beta.sum(axis=1)[:, None] * topic_dist[:, None]
    ).sum(axis=0)
    bestn = np.argsort(weighted_terms)[::-1][:top_k]
    topic_terms = [lda.id2word[i] for i in bestn]

    # indices of the topics that survived the mask
    top_topics = np.nonzero(topic_dist)[0]  # np.argsort(topic_dist)[::-1][:3]
    print 'top_topics', top_topics

    # topic_divergence = self._topic_divergence(message_ids, id2msg,
    #                                           dictionary, lda)
    return {
        # 'topic_dist': topic_dist,
        'topic_terms': topic_terms,
        'top_topics': top_topics,
        # 'topic_divergence': topic_divergence
    }
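# Illustration of the term weighting in `topics` (toy numbers, not from the
# pipeline): with beta = [[2., 1., 1.], [1., 1., 2.]] (2 topics x 3 words)
# and topic_dist = [0.75, 0.25], each row of beta is first normalized to a
# word distribution ([0.5, 0.25, 0.25] and [0.25, 0.25, 0.5]), then the rows
# are averaged with weights 0.75 and 0.25, giving per-word scores
# [0.4375, 0.25, 0.3125]; argsort picks word 0 as the top term.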
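# ---------------------------------------------------------------------------
# Usage sketch (hypothetical -- `GraphStat`, `g`, the model paths and the
# shape of `interactions` are assumptions, not part of this module):
#
#     from gensim.corpora import Dictionary
#     from gensim.models import LdaModel
#
#     dictionary = Dictionary.load('dictionary.gsm')
#     lda = LdaModel.load('model.lda')
#     interactions = [...]  # dicts with 'message_id', 'subject', 'body'
#     stat = GraphStat(g)   # instance of the enclosing class, graph `g`
#     stat.frequent_terms(interactions, top_k=10)
#     stat.tfidf_terms(interactions, dictionary, top_k=10)
#     stat.topics(interactions, dictionary, lda, top_k=10)
# ---------------------------------------------------------------------------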