def testSNLIExample():
    """Test the LSTM pipeline on an example taken from the SNLI dataset."""
    start = time.time()
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")
    dataStats = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                "test_dataStats.json"
    dataJSONFile = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                   "snli_1.0_test.jsonl"
    premiseTensor, hypothesisTensor = table.convertDataToEmbeddingTensors(
        dataJSONFile, dataStats)

    symPremise = T.ftensor3("inputPremise")
    symHypothesis = T.ftensor3("inputHypothesis")

    # Use the first three examples from the test set
    premiseSent = premiseTensor[:, 0:3, :]
    hypothesisSent = hypothesisTensor[:, 0:3, :]

    network = LSTMP2H(numTimestepsPremise=57, numTimestepsHypothesis=30,
                      dimInput=10,
                      embedData="/Users/mihaileric/Documents/Research/"
                                "LSTM-NLI/data/glove.6B.50d.txt.gz")
    network.printLSTMP2HParams()

    predictFunc = network.predictFunc(symPremise, symHypothesis)
    labels = network.predict(premiseSent, hypothesisSent, predictFunc)
    for l in labels:
        print "Label: %s" % (l)

    print "Time for evaluation: %f" % (time.time() - start)
def testSentToIdxMat():
    """Test conversion of individual sentences to index matrices."""
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")

    testSent1 = "The cat is blue"
    idxMat1 = table.convertSentToIdxMatrix(testSent1)
    print idxMat1

    testSent2 = "More dogs are happy"
    idxMat2 = table.convertSentToIdxMatrix(testSent2)
    print idxMat2
def testConvertToIdxMatrices():
    """Test conversion of data to embedding idx matrices."""
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")
    dataStats = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                "dev_dataStats.json"
    dataJSONFile = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/" \
                   "snli_1.0_dev.jsonl"
    premiseIdxMatrix, hypothesisIdxMatrix = table.convertDataToIdxMatrices(
        dataJSONFile, dataStats)
def testConvertIdxMatToIdxTensor():
    """Test conversion from idxMat to idxTensor."""
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")

    idxMat = np.array([[[3], [5], [-1]]])
    idxTensor = table.convertIdxMatToIdxTensor(idxMat)
    print idxTensor

    # All-padding (-1) input
    idxMat2 = np.zeros((2, 3, 4))
    idxMat2.fill(-1)
    idxTensor2 = table.convertIdxMatToIdxTensor(idxMat2)
    print idxTensor2
def __init__(self, embedData, logPath, trainData, trainDataStats, valData,
             valDataStats, testData, testDataStats, numTimestepsPremise,
             numTimestepsHypothesis):
    self.logger = Logger(log_path=logPath)

    # All layers in model
    self.layers = []

    self.trainData = trainData
    self.trainDataStats = trainDataStats
    self.valData = valData
    self.valDataStats = valDataStats
    self.testData = testData
    self.testDataStats = testDataStats
    self.numTimestepsPremise = numTimestepsPremise
    self.numTimestepsHypothesis = numTimestepsHypothesis

    self.embeddingTable = EmbeddingTable(embedData)
    # Dimension of word embeddings at input
    self.dimEmbedding = self.embeddingTable.dimEmbeddings

    # Will store the numerical values of the model parameters
    self.numericalParams = {}
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.

    :param exp_name:
    :param embed_data:
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data:
    :param test_data_stats:
    :param log_path:
    :param batch_size:
    :param num_epochs:
    :param unroll_steps:
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after
                      concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies
                      if num_dense > 1
    :param penalty: Penalty to use for regularization
    :param reg_coeff: Regularization coefficient to use for each layer of the
                      network; may want to support different coefficients for
                      different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    train_prem, train_hyp = generate_data(train_data, train_data_stats,
                                          "left", "right", table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data, val_data_stats,
                                      "left", "right", table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test for overfitting capabilities of model
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Add 'num_dense' dense layers with tanh; top layer is softmax
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in, num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in, num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    # Will have shape (batch_size, 3)
    network_output = get_output(l_output, {l_in_prem: x_p, l_in_hyp: x_h})
    f_dense_output = theano.function([x_p, x_h], network_output,
                                     on_unused_input='warn')

    # Compute cost with regularization penalty
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    reg_cost = reg_coeff * regularize_layer_params_weighted(layer_dict,
                                                            p_metric)
    cost = T.mean(T.nnet.categorical_crossentropy(network_output,
                                                  target_values).mean()) \
           + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10

    # minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                # prem_batch = val_prem[minibatch]
                # hyp_batch = val_hyp[minibatch]
                # labels_batch = val_labels[minibatch]
                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)
                stats.recordCost(total_num_ex, cost_val)

                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")
    except KeyboardInterrupt:
        pass
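# Example invocation (a sketch only): the paths and hyperparameter values
# below are illustrative placeholders, not settings taken from any actual
# experiment in this repository. Adjust them to your local data locations.
if __name__ == "__main__":
    main(exp_name="sum_embeddings_baseline",
         embed_data="data/glove.6B.50d.txt.gz",
         train_data="data/snli_1.0_train.jsonl",
         train_data_stats="data/train_dataStats.json",
         val_data="data/snli_1.0_dev.jsonl",
         val_data_stats="data/dev_dataStats.json",
         test_data="data/snli_1.0_test.jsonl",
         test_data_stats="data/test_dataStats.json",
         log_path="log/",
         batch_size=32,
         num_epochs=10,
         unroll_steps=30,
         learn_rate=0.001,
         num_dense=2,
         dense_dim=128,
         penalty="l2",
         reg_coeff=1e-4)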
import numpy as np
import sys
import time

# Otherwise PyCharm complains
sys.path.append("/Users/mihaileric/Documents/Research/keras")

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

from model.embeddings import EmbeddingTable

# TODO: Implement softmax in keras as final output layer (activation?)

dataPath = "/Users/mihaileric/Documents/Research/LSTM-NLI/data/"
embeddingTable = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")


def buildAndEvaluateModel():
    start = time.time()
    testSent = "The cat is blue"
    idxMat = embeddingTable.convertSentToIdxMatrix(testSent)

    maxFeatures = 5  # Will iterate over data to compute vocabulary size
    inputLength = 7  # Will compute the max length of sentences

    Xtrain = np.random.choice(np.arange(0, maxFeatures), (5, 5))
    Xtest = np.random.choice(np.arange(0, maxFeatures), (5, 5))
    Ytrain = np.random.choice(np.arange(0, 3), 5)
    Ytest = np.random.choice(np.arange(0, 3), 5)

    model = Sequential()
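    # NOTE: the model definition below this point is only a sketch of one
    # plausible completion (Embedding -> LSTM -> softmax over the 3 NLI
    # labels); it is not the author's final architecture. The layer
    # signatures assume a Keras version where Embedding/LSTM/Dense take
    # their output dimension as the first positional argument.
    model.add(Embedding(maxFeatures, embeddingTable.dimEmbeddings))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(3))
    model.add(Activation("softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam")

    # Training/evaluation would follow here once Ytrain/Ytest are converted
    # to one-hot vectors (e.g. via keras.utils.np_utils.to_categorical).
    print "Time to build model: %f" % (time.time() - start)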
def testIdxListToEmbedList():
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")
    idxList = [[1], [4], [8]]
    print table.convertIdxMatrixToEmbeddingList(idxList)
def testEmbeddings():
    table = EmbeddingTable(dataPath + "glove.6B.50d.txt.gz")
    print table.getEmbeddingFromWord("cat")
    print table.getEmbeddingFromWord("dog")
    # Out-of-vocabulary word
    print table.getEmbeddingFromWord("asssad")
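# Minimal driver (a sketch) for running the embedding-table tests above when
# this module is executed directly; tests that need the local SNLI data files
# (e.g. testConvertToIdxMatrices, testSNLIExample) can be added as needed.
if __name__ == "__main__":
    testEmbeddings()
    testSentToIdxMat()
    testIdxListToEmbedList()
    testConvertIdxMatToIdxTensor()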