def test(args):
    corpus = load_corpus(args.corpus)
    vocab, docs = corpus['vocab'], corpus['docs']
    # Convert each doc from an {index: count} dict into a gensim-style
    # bag-of-words list of (word_id, count) tuples, freeing entries as we go.
    doc_bow = {}
    for k in docs.keys():
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow[k] = bows
        del docs[k]

    lda = load_model(args.load_model)
    generate_doc_codes(lda, doc_bow, args.output)
    print 'Saved doc codes file to %s' % args.output

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
                   'share', 'award', 'risk', 'security', 'bank', 'company', 'service',
                   'grant', 'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus',
                   'shareholder', 'income', 'financial', 'net', 'purchase', 'position',
                   'management', 'loss', 'salary', 'stockholder', 'due', 'business',
                   'transaction', 'govern', 'trading', 'tax', 'march', 'june']
        # Alternative query set: the same terms but with number words
        # ('three', 'four', ..., 'twenty') in place of the month names.
        weights = lda.state.get_lambda()
        # Row-normalize the topic-word matrix into per-topic distributions.
        weights = np.apply_along_axis(lambda x: x / x.sum(), 1, weights)
        # weights = unitmatrix(weights, axis=1) # alternative: unit-norm rows
        word_cloud(weights.T, vocab, queries, save_file=args.word_clouds)
        print 'Saved word clouds file to %s' % args.word_clouds

    if args.save_topics:
        topics_prob = show_topics_prob(lda)
        save_topics_prob(topics_prob, args.save_topics)
        # topics = show_topics(lda)
        # write_file(topics, args.save_topics)
        print 'Saved topics file to %s' % args.save_topics

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(lda)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(lda)
        print 'Average squared deviation of pairwise topic angles from 90 degrees: %s' % sd
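
# The distinctness check above calls the helper calc_pairwise_dev, defined
# elsewhere in this module. Below is a minimal sketch of what such a measure
# might compute, assuming a gensim LdaModel (implied by lda.state.get_lambda()
# above): the mean squared deviation of each pairwise topic angle from 90
# degrees, so 0 means all topic vectors are mutually orthogonal. This body is
# an illustrative reconstruction, not the repo's actual implementation.
def pairwise_dev_sketch(lda):
    import math         # math and numpy are presumably already imported
    import numpy as np  # at the top of this module
    weights = lda.state.get_lambda()
    # unit-normalize each topic row so dot products are cosine similarities
    weights = weights / np.linalg.norm(weights, axis=1, keepdims=True)
    n_topics = weights.shape[0]
    dev, count = 0., 0
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            cos = np.clip(np.dot(weights[i], weights[j]), -1., 1.)
            dev += (math.acos(cos) - math.pi / 2) ** 2
            count += 1
    return dev / count
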
def train(args):
    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    # Convert each doc from an {index: count} dict into a gensim-style
    # bag-of-words list of (word_id, count) tuples, freeing entries as we go.
    doc_bow = []
    doc_keys = docs.keys()
    for k in doc_keys:
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow.append(bows)
        del docs[k]
    # Invert the vocab mapping to {word_id: word} for use as id2word.
    vocab_dict = dict([(int(y), x) for x, y in vocab_dict.iteritems()])

    # Reproducible train/validation split: hold out n_val docs for validation.
    n_samples = len(doc_bow)
    doc_bow = np.array(doc_bow)
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    dbow_train = doc_bow[train_idx].tolist()
    dbow_val = doc_bow[val_idx].tolist()
    del doc_bow

    start = timeit.default_timer()
    lda = train_lda(dbow_train, vocab_dict, args.n_topics, args.n_iter, args.save_model)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        doc_keys = np.array(doc_keys)
        generate_doc_codes(lda, dict(zip(doc_keys[train_idx].tolist(), dbow_train)), args.output + '.train')
        generate_doc_codes(lda, dict(zip(doc_keys[val_idx].tolist(), dbow_val)), args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train', args.output + '.val')
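
# For context, a minimal sketch of what train_lda might wrap, assuming gensim
# (lda.state.get_lambda() in test() implies a gensim LdaModel). The function
# name, parameter handling, and defaults here are assumptions for
# illustration, not the repo's actual API.
def train_lda_sketch(dbow_train, vocab_dict, n_topics, n_iter, save_model=None):
    from gensim.models import LdaModel
    # dbow_train: list of documents, each a list of (word_id, count) tuples;
    # vocab_dict: {word_id: word} mapping, passed as gensim's id2word.
    lda = LdaModel(corpus=dbow_train, id2word=vocab_dict,
                   num_topics=n_topics, iterations=n_iter)
    if save_model:
        lda.save(save_model)  # reload later with LdaModel.load(save_model)
    return lda
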