def plot(self, query, nbest=15):
    """Scatter-plot a 2-D PCA projection of word vectors.

    ``query`` is either a single word or a ``', '``-separated list of words.
    A single word is plotted together with the first ``nbest`` results of
    ``w2v.search`` for it; a list is plotted exactly as given.

    Words for which ``w2v.get_word_index`` returns -1 are treated as missing
    from the vocabulary. They are reported on stdout and left out of the
    plot. If no word is left, ``'not in vocabulary'`` is printed and the
    method returns without drawing.

    Side effects: prints the word list and the PCA explained-variance ratio,
    then opens a matplotlib window via ``plt.show()``. Returns ``None``.
    """
    if ', ' not in query:
        words = [query] + w2v.search(self.data, query)[:nbest]
    else:
        words = query.split(', ')
    print(', '.join(words))

    mat = w2v.get_vectors(self.data)
    indexed = [(w, w2v.get_word_index(self.data, w)) for w in words]

    # A -1 index must never reach ``mat[...]``: negative indexing would
    # silently pick the last row and plot it under the wrong label. The old
    # ``== [-1]`` check only caught the single-unknown-word case.
    missing = [w for w, idx in indexed if idx == -1]
    known = [(w, idx) for w, idx in indexed if idx != -1]
    if not known:
        print('not in vocabulary')
        return
    if missing:
        print('not in vocabulary: ' + ', '.join(missing))
    words = [w for w, _ in known]
    word_indexes = [idx for _, idx in known]

    # do PCA: reduce the selected vectors to two components
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
    xs = X[:, 0]
    ys = X[:, 1]

    # draw: one point per word, labelled with the (decoded) word
    plt.figure(figsize=(12, 8))
    plt.scatter(xs, ys, marker='o')
    for w, x, y in zip(words, xs, ys):
        plt.annotate(
            w.decode('utf-8', 'ignore'),
            xy=(x, y), xytext=(3, 3),
            textcoords='offset points', ha='left', va='top',
            **self.TEXT_KW)
    plt.show()
# Interactive word2vec explorer.
#
# Loads the model named by ``filename`` and then reads queries forever:
#   * 'nbest=N'         -> change how many search results are kept
#   * 'word'            -> the word plus its first ``nbest`` search results
#   * 'w1, w2, ...'     -> exactly the listed words
# The selected word vectors are projected onto two PCA components.
print('loading')
data = w2v.load(filename)
print('loaded')

nbest = 15
while True:
    query = raw_input('query: ')

    if query.startswith('nbest='):
        # A malformed number used to raise ValueError and end the session;
        # report it and keep the previous setting instead.
        try:
            nbest = int(query[6:])
        except ValueError:
            print('invalid nbest value: ' + query[6:])
        continue

    if ', ' not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(', ')
    print(', '.join(words))

    mat = w2v.get_vectors(data)
    indexed = [(w, w2v.get_word_index(data, w)) for w in words]

    # Index -1 marks a word missing from the vocabulary. It must not be used
    # to index ``mat``: a negative index would silently select the last row.
    # The old ``== [-1]`` check only caught the single-unknown-word case.
    missing = [w for w, idx in indexed if idx == -1]
    known = [(w, idx) for w, idx in indexed if idx != -1]
    if not known:
        print('not in vocabulary')
        continue
    if missing:
        print('not in vocabulary: ' + ', '.join(missing))
    words = [w for w, _ in known]
    word_indexes = [idx for _, idx in known]

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
    xs = X[:, 0]
    ys = X[:, 1]

    # draw
    # NOTE(review): no plotting code is visible after this point; xs/ys are
    # computed but not used within this fragment.
# Interactive word2vec explorer.
#
# Loads the model named by ``filename`` and then reads queries forever:
#   * "nbest=N"         -> change how many search results are kept
#   * "word"            -> the word plus its first ``nbest`` search results
#   * "w1, w2, ..."     -> exactly the listed words
# The selected word vectors are projected onto two PCA components.
print("loading")
data = w2v.load(filename)
print("loaded")

nbest = 15
while True:
    query = raw_input("query: ")

    if query.startswith("nbest="):
        # A malformed number used to raise ValueError and end the session;
        # report it and keep the previous setting instead.
        try:
            nbest = int(query[6:])
        except ValueError:
            print("invalid nbest value: " + query[6:])
        continue

    if ", " not in query:
        words = [query] + w2v.search(data, query)[:nbest]
    else:
        words = query.split(", ")
    print(", ".join(words))

    mat = w2v.get_vectors(data)
    indexed = [(w, w2v.get_word_index(data, w)) for w in words]

    # Index -1 marks a word missing from the vocabulary. It must not be used
    # to index ``mat``: a negative index would silently select the last row.
    # The old ``== [-1]`` check only caught the single-unknown-word case.
    missing = [w for w, idx in indexed if idx == -1]
    known = [(w, idx) for w, idx in indexed if idx != -1]
    if not known:
        print("not in vocabulary")
        continue
    if missing:
        print("not in vocabulary: " + ", ".join(missing))
    words = [w for w, _ in known]
    word_indexes = [idx for _, idx in known]

    # do PCA
    X = mat[word_indexes]
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.explained_variance_ratio_)
    X = pca.transform(X)
    xs = X[:, 0]
    ys = X[:, 1]

    # draw
    # NOTE(review): no plotting code is visible after this point; xs/ys are
    # computed but not used within this fragment.
# Smoke tests for the w2v bindings against the "tiny_corpus.bin" model.
# ``test(actual, expected)`` and ``invec`` are defined elsewhere in the file.

# Word count reported by w2v itself; the loaded data must agree with it.
num_words = w2v.get_num_words_from_model()

# --- default load --------------------------------------------------------
data = w2v.load("tiny_corpus.bin")
test(w2v.get_num_words(data), num_words)
test(w2v.search(data, "1")[:5], ['a', 'A', 'g', 'G', '6'])

# --- load without normalization -------------------------------------------
# presumably keeps the raw vectors, so they can be compared with ``invec``
# -- TODO confirm against the w2v implementation.
data2 = w2v.load_without_normalize("tiny_corpus.bin")
words = [
    w2v.get_word(data2, i)
    for i in range(w2v.get_num_words(data2))
]
test(words[1], 'd')
# The vector for 'd' must be the same whether looked up by word or by row.
test(w2v.get_vector(data2, 'd'), invec)
test(list(w2v.get_vectors(data2)[1]), invec)

# --- vector arithmetic: 'A' + 'b' - 'a' -------------------------------------
test(
    w2v.find_sub(data, ['A', 'b'], ['a'])[:5],
    ['B', '2', 'H', 'h', '8'],
)

# Run the tests implemented on the C++ side (kept there to ease development).
w2v.test()