def test(args):
    """Encode a corpus with a saved autoencoder and write optional reports.

    Every document in the corpus at ``args.input`` is vectorised, passed
    through the model loaded from ``args.load_model``, and the resulting
    codes are dumped as JSON to ``args.output`` keyed by document id.

    Optional outputs, each enabled by a truthy attribute on ``args``:

    * ``save_topics``     - print the topics and save their strengths there.
    * ``word_clouds``     - save word clouds for a fixed list of finance terms.
    * ``sample_words``    - save similar words for a fixed list of queries.
    * ``translate_words`` - print word-analogy results for two fixed queries.
    * ``calc_distinct``   - print the value returned by ``calc_pairwise_dev``.

    Only single-argument ``print(...)`` calls are used so the function
    parses and prints identically on Python 2 and Python 3.

    NOTE(review): another ``test`` definition appears later in this file and
    would shadow this one - confirm which of the two is intended.
    """
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    # Snapshot the keys first: entries are deleted while we walk them, which
    # raises RuntimeError on a live Python 3 key view.
    doc_keys = list(docs)
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        # Drop the raw document as soon as it has been vectorised.
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=50)
        print_topics(topics_strength)
        save_chinese_topics_strength(topics_strength, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital',
            'asset', 'bonus', 'shareholder', 'income', 'financial', 'net',
            'purchase', 'position', 'management', 'loss', 'salary',
            'stockholder', 'due', 'business', 'transaction', 'govern',
            'trading', 'tax', 'march', 'april', 'june', 'july',
        ]
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law',
                   'hockey', 'comput', 'space']
        words = []
        for each in queries:
            # Query terms missing from the vocabulary are skipped.
            if each in vocab:
                words.append(
                    get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
def test(args):
    """Encode a corpus with a saved autoencoder and write optional reports.

    Every document in the corpus at ``args.input`` is vectorised, passed
    through the model loaded from ``args.load_model``, and the resulting
    codes are dumped as JSON to ``args.output`` keyed by document id.

    Optional outputs, each enabled by a truthy attribute on ``args``:

    * ``save_topics``     - save topic strengths to that path.
    * ``word_clouds``     - save word clouds for a fixed list of finance terms.
    * ``sample_words``    - interactive prompt: read queries from stdin and
      save the similar words for each query to that path. The prompt ends
      at end-of-input (e.g. Ctrl-D), after which the remaining sections run.
    * ``translate_words`` - print word-analogy results for two fixed queries.
    * ``calc_distinct``   - print the value returned by ``calc_pairwise_dev``.
    """
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    # Snapshot the keys first: entries are deleted while we walk them.
    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        # Drop the raw document as soon as it has been vectorised.
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = [
            'interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock',
            'share', 'award', 'risk', 'security', 'bank', 'company',
            'service', 'grant', 'agreement', 'proxy', 'loan', 'capital',
            'asset', 'bonus', 'shareholder', 'income', 'financial', 'net',
            'purchase', 'position', 'management', 'loss', 'salary',
            'stockholder', 'due', 'business', 'transaction', 'govern',
            'trading', 'tax', 'march', 'april', 'june', 'july',
        ]
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        # Build the stop-word set once instead of once per typed word.
        stop_words = set(stopwords.words('english'))
        while True:
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            if not query:
                # readline() returns '' only at end-of-input; without this
                # exit the prompt would spin forever once stdin is closed.
                break
            # Remove punctuation except hyphens.
            query = re.sub(r'[^\w\s-]', ' ', query)
            query_words = [
                word for word in query.lower().split()
                if word not in stop_words
            ]
            words = []
            for each in query_words:
                # Skip words the model has never seen rather than raising
                # KeyError on the vocabulary lookup.
                if each in vocab:
                    words.append(
                        get_similar_words(ae, vocab[each], revocab, topn=11))
            # Each query overwrites the previous results file.
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)