Beispiel #1
0
def main():
    '''Command-line dispatcher.

    Usage:
        show              -- print a summary of the pickled blog corpus
        eval <result.json> -- evaluate topics (tf and tfidf) from a result file
        stem2word         -- build and pickle the stem -> word map
    '''
    usage = 'usage: {} show | eval <result.json> | stem2word'.format(sys.argv[0])
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of an opaque IndexError.
        sys.exit(usage)
    cmd = sys.argv[1]
    if cmd == 'show':
        show_summary(load_pkl('blogs.pkl'))
    elif cmd == 'eval':
        if len(sys.argv) < 3:
            sys.exit('usage: {} eval <result.json>'.format(sys.argv[0]))
        fpath = sys.argv[2]
        metrics_tf = eval_topics(fpath, top_k=2, method='tf')
        metrics_tfidf = eval_topics(fpath, top_k=2, method='tfidf')
        print('tf metrics', metrics_tf)
        print_metrics_as_table(metrics_tf, 'metrics-tf.tex')
        print('tfidf metrics', metrics_tfidf)
        print_metrics_as_table(metrics_tfidf, 'metrics-tfidf.tex')
    elif cmd == 'stem2word':
        calc_stem_map()
    else:
        # Previously an unknown command was silently ignored; tell the user.
        print(usage, file=sys.stderr)
Beispiel #2
0
def eval_topics(fpath, method='tf', top_k=2, num_words_in_topic=10):
    '''Evaluate the topics stored in a JSON result file.

    For each group, takes the first ``top_k`` topics of the requested
    ``method`` ('tf' or 'tfidf'), maps stems back to display words,
    builds a word->weight dict per topic, then plots the word clouds
    and returns the coherence metrics.
    '''
    with open(fpath, encoding='utf8') as fin:
        result = json.load(fin)

    # Lazily load the stem -> representative-word map once per process.
    global STEM2WORD
    if STEM2WORD is None:
        STEM2WORD = load_pkl('stem2word.pkl')

    def _display_word(stem):
        entry = STEM2WORD.get(stem)
        # entry is a list of (word, count) pairs; fall back to the raw stem.
        return entry[0][0] if entry is not None else stem

    topics_formatted = {}
    for group, per_method in result.items():
        formatted = []
        for topic in per_method[method][:top_k]:
            topic_name = _display_word(topic['topic'])
            # Reserve one slot for the topic word itself.
            words = {
                _display_word(kw[0]): kw[1]
                for kw in topic['keywords'][:(num_words_in_topic - 1)]
            }
            if method == 'tf':
                words[topic_name] = topic['score']
            else:
                try:
                    # fake frequency for display
                    words[topic_name] = topic['keywords'][0][1] * 2
                except IndexError:
                    words[topic_name] = 1
            formatted.append((topic_name, words))
        topics_formatted[group] = formatted

    print(topics_formatted)
    plot_topics(topics_formatted, method=method)
    return calc_coherence_all(topics_formatted, method=method)
Beispiel #3
0
def load_glove(ndim=100):
    '''Load GloVe embeddings of dimensionality ``ndim``.

    Returns a dict mapping word -> np.ndarray of shape (ndim,).  The
    result is cached in the module-level ``GLOVE`` global; after the
    first parse of the raw text file a pickle is saved so later runs
    can skip the slow text parsing step.
    '''
    global GLOVE
    if GLOVE is None:
        print('loading glove embeddings...')
        try:
            GLOVE = load_pkl('glove{}.pkl'.format(ndim))
        except Exception:
            # Pickle missing or unreadable -- rebuild from the raw text
            # file.  (Was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            print('failed')
            GLOVE = {}
            fname = 'embeddings/glove.6B.{}d.txt'.format(ndim)
            print('load from file', fname)
            # GloVe files are UTF-8; be explicit instead of relying on
            # the platform default encoding.
            with open(fname, encoding='utf8') as f:
                for line in f:
                    # First token is the word, the rest is its vector.
                    arr = line.strip().split()
                    GLOVE[arr[0]] = np.asarray(arr[1:], dtype=float)

            save_pkl(GLOVE, 'glove{}.pkl'.format(ndim))

    return GLOVE
Beispiel #4
0
def calc_stem_map():
    '''Map word stem back to the most representative word so we can display valid
    English words in the word cloud, and also for calculating the coherence score
    '''
    print('building map from stem to words ...')
    docs = load_pkl('tokenised_docs.pkl')

    # defaultdict calls its factory with no arguments, so Counter itself
    # is the idiomatic factory (was a lambda swallowing *args/**kwargs).
    stem2word = defaultdict(Counter)

    def _count(w):
        # Tally the lemmatised lowercase form under the word's stem.
        stem2word[stem_word(w)][lemmatizer.lemmatize(w.lower())] += 1

    print('calculating map...')
    foreach3d(_count, docs)

    # Keep only the 10 most frequent surface forms per stem.
    out = {stem: cnt.most_common(10) for stem, cnt in stem2word.items()}
    save_pkl(out, 'stem2word.pkl')
    return out
Beispiel #5
0
def main():
    '''Count occurrences of a few suspicious named-entity tokens.'''
    docs, named_entities = as2.load_pkl('intermediate_data.pkl')
    watch_list = ('lol', 'f**k', 'Im', 'choru')
    counts = Counter((word, tag)
                     for word, tag in as2.flatten3d(named_entities)
                     if word in watch_list)
    print(counts.most_common(20))
Beispiel #6
0
def calc_topic_size(words):
    '''Return the total corpus frequency of the given words.

    The corpus-wide word counts are computed once and cached in the
    module-level ``WORD_COUNT`` global.
    '''
    global WORD_COUNT
    if WORD_COUNT is None:
        documents, _ = load_pkl('intermediate_data.pkl')
        WORD_COUNT = Counter(token for token, _tag in flatten3d(documents))
    total = 0
    for word in words:
        total += WORD_COUNT[word]
    return total