def testStats():
    logger = Logger(log_path=logPath)
    stats = Stats(logger)
    stats.recordAcc(10, 0.3, "train")
    stats.recordAcc(20, 0.1, "train")
    stats.recordAcc(10, 1.3, "dev")
    stats.recordAcc(40, 0.344, "test")
    print(stats.acc)
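# The Stats class exercised above is defined elsewhere in the repo. Below is a
# hypothetical minimal sketch of just the interface testStats relies on
# (recordAcc and an `acc` attribute); the name StatsSketch and the storage
# layout are assumptions, not the project's actual implementation.
class StatsSketch(object):
    def __init__(self, logger):
        self.logger = logger
        # Maps dataset name ("train"/"dev"/"test") to a list of
        # (num_examples_seen, accuracy) pairs.
        self.acc = {}

    def recordAcc(self, num_ex, acc, dataset):
        self.acc.setdefault(dataset, []).append((num_ex, acc))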
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training the model.

    :param exp_name:
    :param embed_data:
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data:
    :param test_data_stats:
    :param log_path:
    :param batch_size:
    :param num_epochs:
    :param unroll_steps:
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after the concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization
    :param reg_coeff: Regularization coefficient to use for each layer of the network;
                      may want to support different coefficients for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    train_prem, train_hyp = generate_data(train_data, train_data_stats, "left",
                                          "right", table, seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data, val_data_stats, "left",
                                      "right", table, seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test for overfitting capabilities of model
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size,
                                  output_size=dim_embeddings, W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size,
                                 output_size=dim_embeddings, W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat

    # Add 'num_dense' dense layers with tanh nonlinearities; top layer is softmax
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS,
                                      nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in, num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    # Will have shape (batch_size, 3)
    network_output = get_output(l_output, {l_in_prem: x_p, l_in_hyp: x_h})
    f_dense_output = theano.function([x_p, x_h], network_output,
                                     on_unused_input='warn')

    # Compute cost
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    else:
        raise ValueError("Unsupported penalty: {0}".format(penalty))

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
    reg_cost = reg_coeff * regularize_layer_params_weighted(layer_dict, p_metric)

    cost = T.mean(T.nnet.categorical_crossentropy(network_output,
                                                  target_values)) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs
    stats = Stats(exp_name)
    acc_num = 10

    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]
                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)
                stats.recordCost(total_num_ex, cost_val)

                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp, train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")
    except KeyboardInterrupt:
        pass
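# SumEmbeddingLayer is a custom layer referenced in main() but defined
# elsewhere in the repo. A plausible sketch, assuming it simply sums the word
# embeddings over the time axis to produce a fixed-size sentence vector (the
# name SumEmbeddingLayerSketch and this behavior are assumptions, not the
# project's actual implementation):
import theano.tensor as T
from lasagne.layers import Layer

class SumEmbeddingLayerSketch(Layer):
    def get_output_shape_for(self, input_shape):
        # (batch_size, seq_len, dim_embeddings) -> (batch_size, dim_embeddings)
        return (input_shape[0], input_shape[2])

    def get_output_for(self, input, **kwargs):
        # Sum over the time (word) axis
        return T.sum(input, axis=1)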
def train(self, numEpochs=1, batchSize=5, learnRateVal=0.1,
          numExamplesToTrain=-1, gradMax=3., L2regularization=0.0,
          dropoutRate=0.0, sentenceAttention=False, wordwiseAttention=False):
    """
    Takes care of training the model, including propagation of errors and
    updating of parameters.
    """
    expName = "Epochs_{0}_LRate_{1}_L2Reg_{2}_dropout_{3}_sentAttn_{4}_" \
              "wordAttn_{5}".format(str(numEpochs), str(learnRateVal),
                                    str(L2regularization), str(dropoutRate),
                                    str(sentenceAttention),
                                    str(wordwiseAttention))
    self.configs.update(locals())

    trainPremiseIdxMat, trainHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
        self.trainData, self.trainDataStats)
    trainGoldLabel = convertLabelsToMat(self.trainData)

    valPremiseIdxMat, valHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
        self.valData, self.valDataStats)
    valGoldLabel = convertLabelsToMat(self.valData)

    # If you want to train on less than the full dataset
    if numExamplesToTrain > 0:
        valPremiseIdxMat = valPremiseIdxMat[:, range(numExamplesToTrain), :]
        valHypothesisIdxMat = valHypothesisIdxMat[:, range(numExamplesToTrain), :]
        valGoldLabel = valGoldLabel[range(numExamplesToTrain)]

    # Whether zero-padded on left or right
    pad = "right"

    # Get full premise/hypothesis tensors
    # batchPremiseTensor, batchHypothesisTensor, batchLabels = \
    #     convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
    #                                self.numTimestepsHypothesis, "right", self.embeddingTable,
    #                                valGoldLabel, range(len(valGoldLabel)))
    #sharedValPremise = theano.shared(batchPremiseTensor)
    #sharedValHypothesis = theano.shared(batchHypothesisTensor)
    #sharedValLabels = theano.shared(batchLabels)

    inputPremise = T.ftensor3(name="inputPremise")
    inputHypothesis = T.ftensor3(name="inputHypothesis")
    yTarget = T.fmatrix(name="yTarget")
    learnRate = T.scalar(name="learnRate", dtype='float32')

    fGradSharedHypothesis, fGradSharedPremise, fUpdatePremise, \
        fUpdateHypothesis, costFn, _, _ = self.trainFunc(
            inputPremise, inputHypothesis, yTarget, learnRate, gradMax,
            L2regularization, dropoutRate, sentenceAttention,
            wordwiseAttention, batchSize)

    totalExamples = 0
    stats = Stats(self.logger, expName)

    # Training
    self.logger.Log("Model configs: {0}".format(self.configs))
    self.logger.Log("Starting training with {0} epochs, {1} batchSize,"
                    " {2} learning rate, {3} L2regularization coefficient,"
                    " and {4} dropout rate".format(numEpochs, batchSize,
                                                   learnRateVal,
                                                   L2regularization,
                                                   dropoutRate))

    predictFunc = self.predictFunc(inputPremise, inputHypothesis, dropoutRate)

    for epoch in xrange(numEpochs):
        self.logger.Log("Epoch number: %d" % (epoch))

        if numExamplesToTrain > 0:
            minibatches = getMinibatchesIdx(numExamplesToTrain, batchSize)
        else:
            minibatches = getMinibatchesIdx(len(trainGoldLabel), batchSize)

        numExamples = 0
        for _, minibatch in minibatches:
            self.dropoutMode.set_value(1.0)
            numExamples += len(minibatch)
            totalExamples += len(minibatch)
            self.logger.Log("Processed {0} examples in current epoch".format(
                str(numExamples)))

            batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise,
                                           valHypothesisIdxMat, self.numTimestepsHypothesis,
                                           pad, self.embeddingTable,
                                           valGoldLabel, minibatch)

            gradHypothesisOut = fGradSharedHypothesis(batchPremiseTensor,
                                                      batchHypothesisTensor,
                                                      batchLabels)
            gradPremiseOut = fGradSharedPremise(batchPremiseTensor,
                                                batchHypothesisTensor,
                                                batchLabels)
            fUpdatePremise(learnRateVal)
            fUpdateHypothesis(learnRateVal)

            predictLabels = self.predict(batchPremiseTensor,
                                         batchHypothesisTensor, predictFunc)
            #self.logger.Log("Labels in epoch {0}: {1}".format(epoch, str(predictLabels)))

            cost = costFn(batchPremiseTensor, batchHypothesisTensor, batchLabels)
            stats.recordCost(totalExamples, cost)

            # Note: Big time sink happens here
            if totalExamples % 100 == 0:
                # TODO: Don't compute accuracy of dev set
                self.dropoutMode.set_value(0.0)
                devAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                                   valHypothesisIdxMat,
                                                   valGoldLabel, predictFunc)
                stats.recordAcc(totalExamples, devAccuracy, "dev")

    stats.recordFinalTrainingTime(totalExamples)

    # Save model to disk
    self.logger.Log("Saving model...")
    self.extractParams()
    configString = "batch={0},epoch={1},learnRate={2},dimHidden={3},dimInput={4}".format(
        str(batchSize), str(numEpochs), str(learnRateVal),
        str(self.dimHidden), str(self.dimInput))
    self.saveModel(currDir + "/savedmodels/basicLSTM_" + configString + ".npz")
    self.logger.Log("Model saved!")

    # Set dropout to 0. again for testing
    self.dropoutMode.set_value(0.0)

    # Train Accuracy
    # trainAccuracy = self.computeAccuracy(trainPremiseIdxMat,
    #                                      trainHypothesisIdxMat, trainGoldLabel, predictFunc)
    # self.logger.Log("Final training accuracy: {0}".format(trainAccuracy))

    # Val Accuracy
    valAccuracy = self.computeAccuracy(valPremiseIdxMat, valHypothesisIdxMat,
                                       valGoldLabel, predictFunc)
    # TODO: change -1 for training acc to actual value when I enable train computation
    stats.recordFinalStats(totalExamples, -1, valAccuracy)
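# Both training loops iterate over getMinibatchesIdx(n, batchSize), which is
# defined elsewhere in the repo. A sketch, assuming it mirrors the
# get_minibatches_idx helper from the Theano LSTM tutorial and returns a list
# of (batch_index, example_indices) pairs; the name and the shuffle flag are
# assumptions:
import numpy as np

def getMinibatchesIdxSketch(n, minibatchSize, shuffle=False):
    idxList = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idxList)

    minibatches = []
    for start in range(0, n, minibatchSize):
        minibatches.append(idxList[start:start + minibatchSize])

    # Pair each minibatch with its index so callers can write
    # `for _, minibatch in minibatches:` as the training loops above do.
    return zip(range(len(minibatches)), minibatches)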