def main():
    """CLI entry point: dispatch on the first command-line argument.

    Commands:
      show      - print a summary of the pickled blog corpus
      eval      - evaluate topics from a JSON result file (argv[2]) with
                  both 'tf' and 'tfidf' scoring, emitting LaTeX tables
      stem2word - rebuild the stem -> word map
    """
    cmd = sys.argv[1]
    if cmd == 'show':
        show_summary(load_pkl('blogs.pkl'))
    elif cmd == 'eval':
        fpath = sys.argv[2]
        metrics_tf = eval_topics(fpath, top_k=2, method='tf')
        metrics_tfidf = eval_topics(fpath, top_k=2, method='tfidf')
        print('tf metrics', metrics_tf)
        print_metrics_as_table(metrics_tf, 'metrics-tf.tex')
        print('tfidf metrics', metrics_tfidf)
        print_metrics_as_table(metrics_tfidf, 'metrics-tfidf.tex')
    elif cmd == 'stem2word':
        calc_stem_map()
def eval_topics(fpath, method='tf', top_k=2, num_words_in_topic=10):
    '''Evaluate topics by:
    1. plotting the word cloud
    2. calculating the diagnostic and coherence metrics
    '''
    with open(fpath, encoding='utf8') as f:
        result = json.load(f)

    # Lazily load the stem -> word map shared via the module global.
    global STEM2WORD
    if STEM2WORD is None:
        STEM2WORD = load_pkl('stem2word.pkl')

    def _2w(stem):
        # Translate a stem to its most representative surface word;
        # fall back to the stem itself when unknown.
        if stem in STEM2WORD:
            return STEM2WORD[stem][0][0]
        return stem

    topics_formatted = {}
    for group, topics2 in result.items():
        topics = topics2[method]
        formatted = []
        for topic in topics[:top_k]:
            topic_name = _2w(topic['topic'])
            # Top (num_words_in_topic - 1) keywords; the topic word itself
            # is added afterwards, bringing the total to num_words_in_topic.
            words = {
                _2w(kw[0]): kw[1]
                for kw in topic['keywords'][:(num_words_in_topic - 1)]
            }
            if method == 'tf':
                words[topic_name] = topic['score']
            else:
                try:
                    words[topic_name] = topic['keywords'][0][
                        1] * 2  # fake frequency for display
                except IndexError:
                    words[topic_name] = 1
            formatted.append((topic_name, words))
        topics_formatted[group] = formatted

    print(topics_formatted)
    plot_topics(topics_formatted, method=method)
    return calc_coherence_all(topics_formatted, method=method)
def load_glove(ndim=100):
    """Load GloVe word embeddings of dimension *ndim*.

    Tries the pickled cache 'glove{ndim}.pkl' first; if that fails,
    parses the raw text file 'embeddings/glove.6B.{ndim}d.txt' and
    rebuilds the cache. The result is memoized in the module-level
    GLOVE global: a dict mapping word -> np.ndarray of the vector.
    """
    global GLOVE
    if GLOVE is None:
        print('loading glove embeddings...')
        try:
            GLOVE = load_pkl('glove{}.pkl'.format(ndim))
        # Was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; narrow to Exception (cache missing/corrupt).
        except Exception:
            print('failed')
            GLOVE = {}
            fname = 'embeddings/glove.6B.{}d.txt'.format(ndim)
            print('load from file', fname)
            # GloVe release files are UTF-8; be explicit so this does not
            # depend on the platform's default encoding.
            with open(fname, encoding='utf8') as f:
                for line in f:
                    arr = line.strip().split()
                    # `v` instead of `f`: the original comprehension
                    # variable shadowed the open file handle.
                    GLOVE[arr[0].strip()] = np.array(
                        [float(v) for v in arr[1:]])
            save_pkl(GLOVE, 'glove{}.pkl'.format(ndim))
    return GLOVE
def calc_stem_map():
    '''Map word stem back to the most representative word
    so we can display valid English words in the word cloud,
    and also for calculating the coherence score.

    Returns (and pickles to 'stem2word.pkl') a dict mapping each stem
    to the 10 most common (lemma, count) pairs that produced it.
    '''
    print('building map from stem to words ...')
    docs = load_pkl('tokenised_docs.pkl')
    # defaultdict(Counter) replaces the needlessly convoluted
    # defaultdict(lambda *_, **__: Counter()) — the factory is called
    # with no arguments, so the plain class is the idiomatic spelling.
    stem2word = defaultdict(Counter)

    def _helper(w):
        # Tally the lemmatised, lower-cased surface form under its stem.
        s = stem_word(w)
        stem2word[s][lemmatizer.lemmatize(w.lower())] += 1

    print('calculating map...')
    foreach3d(_helper, docs)
    # Keep only the top-10 surface forms per stem; a plain dict is what
    # gets pickled (no default factory to carry along).
    out = {k: cnt.most_common(10) for k, cnt in stem2word.items()}
    save_pkl(out, 'stem2word.pkl')
    return out
def main():
    """Ad-hoc inspection: count occurrences of a few specific tokens
    among the named entities in the pickled intermediate data."""
    docs, ne = as2.load_pkl('intermediate_data.pkl')
    targets = ('lol', 'f**k', 'Im', 'choru')
    ne = Counter(
        (w, t) for w, t in as2.flatten3d(ne) if w in targets)
    print(ne.most_common(20))
    return
def calc_topic_size(words):
    """Return the total corpus frequency of *words*.

    The corpus word-count table is built once from the pickled
    intermediate data and cached in the WORD_COUNT module global.
    """
    global WORD_COUNT
    if WORD_COUNT is None:
        docs, _ = load_pkl('intermediate_data.pkl')
        WORD_COUNT = Counter(w for w, t in flatten3d(docs))
    total = 0
    for w in words:
        total += WORD_COUNT[w]
    return total