def get_training_data(vectors_filename, vocab_filename, vocab_thresh,
                      vector_dim=300):
    """Build the input and target matrices for bigram composition.

    Every label in the bigram vocabulary is split on ``'_'``; the vectors of
    its first two parts are concatenated into one row of ``B``, and the
    vector stored under the full label becomes the matching row of ``Y``.

    Args:
        vectors_filename: Path handed to ``word_vecs.load``.
        vocab_filename: Path handed to ``load_bigrams``.
        vocab_thresh: Threshold forwarded unchanged to ``load_bigrams``.
        vector_dim: Length of a single embedding vector. Defaults to 300,
            the value that used to be hard-coded here.

    Returns:
        A pair ``(B, Y)``: ``B`` is a plain ``numpy.ndarray`` of shape
        ``(N, 2 * vector_dim)`` and ``Y`` is a ``numpy.matrix`` of shape
        ``(N, vector_dim)``, where ``N`` is the number of bigram labels.
        The ndarray/matrix asymmetry is kept on purpose so existing callers
        receive the same types as before.
    """
    bigrams_vocab = load_bigrams(vocab_filename, vocab_thresh)
    word_vectors = word_vecs.load(vectors_filename)

    n = vector_dim
    N = len(bigrams_vocab)

    # Inputs: one row per bigram, [first-word vector | second-word vector].
    print('Constructing matrix B row-wise')
    B = np.zeros(shape=(N, 2 * n))
    for idx, label in enumerate(bigrams_vocab):
        parts = label.split('_')  # split once instead of once per word
        B[idx] = np.concatenate((word_vectors[parts[0]],
                                 word_vectors[parts[1]]))

    # Targets: the vector learned for the bigram token itself. A single
    # column of Y is the y-vector for one output dimension.
    print('Constructing matrix Y row-wise')
    Y = np.zeros(shape=(N, n))
    for idx, label in enumerate(bigrams_vocab):
        Y[idx] = word_vectors[label]
    Y = np.matrix(Y)

    return B, Y
def train(vectors_filename, vocab_filename, vocab_thresh, vector_dim=50):
    """Fit the linear bigram-composition matrix and store it in global ``A``.

    ``B`` holds one row per bigram (the two word vectors concatenated) and
    ``Y`` holds the vector stored under the bigram label itself. ``A`` is the
    pseudoinverse least-squares solution of ``B * A^T ~= Y``, so ``A`` maps a
    concatenated word-vector pair to an approximation of the bigram vector.

    Args:
        vectors_filename: Path handed to ``word_vecs.load``.
        vocab_filename: Path handed to ``load_bigrams``.
        vocab_thresh: Threshold forwarded unchanged to ``load_bigrams``.
        vector_dim: Length of a single embedding vector. Defaults to 50,
            the value that used to be hard-coded here.

    Returns:
        None. The result is published through the module-level global ``A``,
        an ndarray of shape ``(vector_dim, 2 * vector_dim)``.
    """
    global A  # the fitted matrix is shared with the rest of the module

    bigrams_vocab = load_bigrams(vocab_filename, vocab_thresh)
    word_vectors = word_vecs.load(vectors_filename)

    n = vector_dim
    N = len(bigrams_vocab)

    # Inputs: one row per bigram, [first-word vector | second-word vector].
    print('Constructing matrix B row-wise')
    B = np.zeros(shape=(N, 2 * n))
    for idx, label in enumerate(bigrams_vocab):
        parts = label.split('_')  # split once instead of once per word
        B[idx] = np.concatenate((word_vectors[parts[0]],
                                 word_vectors[parts[1]]))

    print('Computing pseudoinverse')
    B_inv = np.linalg.pinv(B)

    # Targets: the vector learned for the bigram token itself.
    print('Constructing matrix Y row-wise')
    Y = np.zeros(shape=(N, n))
    for idx, label in enumerate(bigrams_vocab):
        Y[idx] = word_vectors[label]

    # Row i of A is (B_inv . Y[:, i])^T. Doing all columns in one product
    # replaces the former loop of n matrix-vector products; B and Y never
    # leave this function, so plain ndarrays are enough (no np.matrix).
    print('Recovering linear approximation matrix A row-wise')
    A = np.array(np.dot(B_inv, Y).T, order='C')
def train(vectors_filename, vocab_filename, vocab_thresh, vector_dim=50):
    """Learn the linear map from word-vector pairs to bigram vectors.

    Builds ``B`` (rows are the concatenated vectors of the two parts of each
    ``'_'``-separated bigram label) and ``Y`` (rows are the vectors stored
    under the bigram labels), then sets the module-level global ``A`` to the
    pseudoinverse least-squares solution of ``B * A^T ~= Y``.

    Args:
        vectors_filename: Path handed to ``word_vecs.load``.
        vocab_filename: Path handed to ``load_bigrams``.
        vocab_thresh: Threshold forwarded unchanged to ``load_bigrams``.
        vector_dim: Length of a single embedding vector. Defaults to 50,
            the value that used to be hard-coded here.

    Returns:
        None. The fitted ``(vector_dim, 2 * vector_dim)`` ndarray is written
        to the module-level global ``A``.
    """
    global A  # the fitted matrix is shared with the rest of the module

    bigrams_vocab = load_bigrams(vocab_filename, vocab_thresh)
    word_vectors = word_vecs.load(vectors_filename)

    n = vector_dim
    N = len(bigrams_vocab)

    # Design matrix: [first-word vector | second-word vector] per bigram.
    print('Constructing matrix B row-wise')
    B = np.zeros(shape=(N, 2 * n))
    for idx, label in enumerate(bigrams_vocab):
        parts = label.split('_')  # split once instead of once per word
        B[idx] = np.concatenate((word_vectors[parts[0]],
                                 word_vectors[parts[1]]))

    print('Computing pseudoinverse')
    B_inv = np.linalg.pinv(B)

    # Target matrix: the vector stored for the bigram token itself.
    print('Constructing matrix Y row-wise')
    Y = np.zeros(shape=(N, n))
    for idx, label in enumerate(bigrams_vocab):
        Y[idx] = word_vectors[label]

    # Row i of A is (B_inv . Y[:, i])^T, so one matrix product followed by a
    # transpose yields every row at once. B and Y are local, which is why
    # the np.matrix wrappers are no longer needed.
    print('Recovering linear approximation matrix A row-wise')
    A = np.array(np.dot(B_inv, Y).T, order='C')
def get_training_data(vectors_filename, vocab_filename, vocab_thresh,
                      vector_dim=300):
    """Assemble the ``(B, Y)`` training pair for bigram composition.

    For each ``'_'``-separated bigram label, the row of ``B`` is the vector
    of the first part followed by the vector of the second part, and the
    row of ``Y`` is the vector stored under the whole label.

    Args:
        vectors_filename: Path handed to ``word_vecs.load``.
        vocab_filename: Path handed to ``load_bigrams``.
        vocab_thresh: Threshold forwarded unchanged to ``load_bigrams``.
        vector_dim: Length of a single embedding vector. Defaults to 300,
            the value that used to be hard-coded here.

    Returns:
        A pair ``(B, Y)``: ``B`` is a plain ``numpy.ndarray`` of shape
        ``(N, 2 * vector_dim)`` and ``Y`` is a ``numpy.matrix`` of shape
        ``(N, vector_dim)``, with ``N`` the number of bigram labels. ``B`` is
        deliberately left as an ndarray and ``Y`` as a matrix so callers get
        the same types as before.
    """
    bigrams_vocab = load_bigrams(vocab_filename, vocab_thresh)
    word_vectors = word_vecs.load(vectors_filename)

    n = vector_dim
    N = len(bigrams_vocab)

    # Inputs: one row per bigram, [first-word vector | second-word vector].
    print('Constructing matrix B row-wise')
    B = np.zeros(shape=(N, 2 * n))
    for idx, label in enumerate(bigrams_vocab):
        parts = label.split('_')  # split once instead of once per word
        B[idx] = np.concatenate((word_vectors[parts[0]],
                                 word_vectors[parts[1]]))

    # Targets: the bigram's own vector; one column of Y is one y-vector.
    print('Constructing matrix Y row-wise')
    Y = np.zeros(shape=(N, n))
    for idx, label in enumerate(bigrams_vocab):
        Y[idx] = word_vectors[label]
    Y = np.matrix(Y)

    return B, Y
#!/usr/bin/env python
# Scratch script: load the word vectors, turn them into a label list plus a
# matrix, and create a 2-component PCA object. Fitting, transforming and
# plotting are currently disabled and kept below as comments.
import numpy as np
import load_word_vecs as word_vecs
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
import test_bigrams as tb
import matplotlib
import pylab as pl
import train_neural_network as tnn

if __name__ == '__main__':
    # Load the vectors file and unpack it in a single step.
    labels, wordVecsMatrix = word_vecs.get_matrix(
        word_vecs.load("../data/vectors.txt"))

    # The PCA object is only constructed here; it is not fitted yet.
    pca = PCA(n_components=2)

    # --- disabled: PCA fit/transform and plotting ---
    # pca.fit(wordVecsMatrix)
    # reduced_X = pca.transform(wordVecsMatrix)
    # print "Running PCA"
    # pca = PCA(n_components=2)
    # pca.fit(wordVecsMatrix)
    # reduced_X = pca.transform(wordVecsMatrix)
    # fig = pl.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # plot full data
#!/usr/bin/env python
# Script: run Barnes-Hut t-SNE (bh_tsne) over the word-vector matrix.
# Saving the resulting coordinates is currently disabled.
import numpy as np
import load_word_vecs as word_vecs
import bh_tsne.bhtsne as tsne

if __name__ == '__main__':
    # Load the raw vectors, then split them into labels and a matrix.
    vecs = word_vecs.load("../data/vectors.txt")
    labels, wordVecsMatrix = word_vecs.get_matrix(vecs)

    # Runs t-SNE on wordVecsMatrix (change this if only a subset of the
    # bigrams should be looked at).
    points = tsne.bh_tsne(wordVecsMatrix)

    # --- disabled: persist the coordinates ---
    # np.save("../data/tsne_coordinates", points)
#!/usr/bin/env python
# Script: load the GloVe vectors file through load_word_vecs.
import numpy as np
import load_word_vecs as word_vecs

if __name__ == '__main__':
    # Path is relative to the directory the script is run from.
    vectors_path = "../../glove/vectors.txt"
    vecs = word_vecs.load(vectors_path)
#!/usr/bin/env python
# Scratch script: load the word vectors, turn them into a label list plus a
# matrix, and create a 2-component PCA object. Fitting, transforming and the
# scatter plot are currently disabled and kept below as comments.
import numpy as np
import load_word_vecs as word_vecs
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
import test_bigrams as tb
import matplotlib
import pylab as pl
import train_neural_network as tnn

if __name__ == '__main__':
    # Load the vectors file and unpack it in a single step.
    labels, wordVecsMatrix = word_vecs.get_matrix(
        word_vecs.load("../data/vectors.txt"))

    # The PCA object is only constructed here; it is not fitted yet.
    pca = PCA(n_components=2)

    # --- disabled: PCA fit/transform and 3-D scatter plot ---
    # pca.fit(wordVecsMatrix)
    # reduced_X = pca.transform(wordVecsMatrix)
    # print "Running PCA"
    # pca = PCA(n_components=2)
    # pca.fit(wordVecsMatrix)
    # reduced_X = pca.transform(wordVecsMatrix)
    # fig = pl.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # plot full data
    # ax.scatter(reduced_X[:, 0], reduced_X[:, 1], reduced_X[:, 2])
    # plt.show()