Example #1
0
def test(args):
    """Encode a corpus with a saved LDA model and emit optional reports.

    Reads the following attributes from ``args``:

    * ``corpus`` -- passed to ``load_corpus``; the result is indexed with
      ``'vocab'`` and ``'docs'``.
    * ``load_model`` -- passed to ``load_model`` to obtain the LDA model.
    * ``output`` -- destination handed to ``generate_doc_codes``.
    * ``word_clouds`` -- if truthy, used as ``save_file`` for ``word_cloud``.
    * ``save_topics`` -- if truthy, destination for ``save_topics_prob``.
    * ``calc_distinct`` -- if truthy, the result of ``calc_pairwise_dev`` is
      printed.

    Side effects: ``corpus['docs']`` is emptied while it is converted, and
    progress messages are printed to stdout.  Returns ``None``.
    """
    corpus = load_corpus(args.corpus)
    vocab, docs = corpus['vocab'], corpus['docs']

    # Convert every document to a list of (int term id, count) pairs.  The
    # keys are snapshotted with list() because entries are popped from
    # `docs` as we go (releasing each raw document once converted); mutating
    # a dict while iterating over its live key view fails on Python 3.
    doc_bow = {}
    for key in list(docs):
        doc_bow[key] = [(int(idx), count)
                        for idx, count in docs.pop(key).items()]

    lda = load_model(args.load_model)
    generate_doc_codes(lda, doc_bow, args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital',
            'asset', 'bonus', 'shareholder', 'income', 'financial', 'net',
            'purchase', 'position', 'management', 'loss', 'salary',
            'stockholder', 'due', 'business', 'transaction', 'govern',
            'trading', 'tax', 'march', 'june',
        ]
        # Scale each row of the lambda matrix so it sums to 1 (a
        # distribution per row); word_cloud receives the transpose.
        weights = np.asarray(lda.state.get_lambda())
        weights = weights / weights.sum(axis=1, keepdims=True)
        word_cloud(weights.T, vocab, queries, save_file=args.word_clouds)

        print('Saved word clouds file to %s' % args.word_clouds)

    if args.save_topics:
        topics_prob = show_topics_prob(lda)
        save_topics_prob(topics_prob, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.calc_distinct:
        sd = calc_pairwise_dev(lda)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
Example #2
0
def train(args):
    """Train an LDA model on a corpus, holding out a validation subset.

    Reads the following attributes from ``args``:

    * ``corpus`` -- passed to ``load_corpus``; the result is indexed with
      ``'docs'`` and ``'vocab'``.
    * ``n_val`` -- number of documents sampled (without replacement) as the
      validation set.
    * ``n_topics``, ``n_iter``, ``save_model`` -- forwarded to ``train_lda``.
    * ``output`` -- if truthy, doc codes for the two subsets are written via
      ``generate_doc_codes`` to ``output + '.train'`` and ``output + '.val'``.

    Side effects: ``corpus['docs']`` is emptied while it is converted, the
    global NumPy random state is seeded with 0, and progress messages are
    printed to stdout.  Returns ``None``.
    """
    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']

    # Snapshot the keys first: entries are popped from `docs` while the
    # bag-of-words lists are built (releasing each raw document once
    # converted), and `doc_keys` is needed later to label the output.
    doc_keys = list(docs)
    doc_bow = [
        [(int(idx), count) for idx, count in docs.pop(key).items()]
        for key in doc_keys
    ]
    # Invert the vocabulary mapping so it is keyed by integer id
    # (presumably word -> id on disk; confirm against load_corpus).
    vocab_dict = {int(idx): word for word, idx in vocab_dict.items()}

    n_samples = len(doc_bow)
    np.random.seed(0)  # fixed seed so the split is reproducible
    val_idx = np.random.choice(n_samples, args.n_val, replace=False).tolist()
    held_out = set(val_idx)
    # Training indices in ascending order (deterministic, unlike relying on
    # the iteration order of a set difference).
    train_idx = [i for i in range(n_samples) if i not in held_out]

    # Split with plain list indexing.  Wrapping `doc_bow` in np.array() is
    # unsafe: documents have different lengths (ragged input is rejected by
    # recent NumPy), and equally sized documents would be merged into one
    # numeric array, turning the (id, count) tuples into lists.
    dbow_train = [doc_bow[i] for i in train_idx]
    dbow_val = [doc_bow[i] for i in val_idx]
    del doc_bow

    start = timeit.default_timer()
    lda = train_lda(dbow_train, vocab_dict, args.n_topics, args.n_iter,
                    args.save_model)
    print('runtime: %ss' % (timeit.default_timer() - start))

    if args.output:
        train_file = args.output + '.train'
        val_file = args.output + '.val'
        train_keys = [doc_keys[i] for i in train_idx]
        val_keys = [doc_keys[i] for i in val_idx]
        generate_doc_codes(lda, dict(zip(train_keys, dbow_train)), train_file)
        generate_doc_codes(lda, dict(zip(val_keys, dbow_val)), val_file)
        print('Saved doc codes file to %s and %s' % (train_file, val_file))