def test():
    energies_var = T.tensor4('energies', dtype=theano.config.floatX)
    targets_var = T.imatrix('targets')
    masks_var = T.matrix('masks', dtype=theano.config.floatX)
    layer_input = lasagne.layers.InputLayer([2, 2, 3, 3], input_var=energies_var)
    out = lasagne.layers.get_output(layer_input)
    loss = crf_loss(out, targets_var, masks_var)
    prediction, acc = crf_accuracy(energies_var, targets_var)

    fn = theano.function([energies_var, targets_var, masks_var], [loss, prediction, acc])

    energies = np.array([[[[10, 15, 20], [5, 10, 15], [3, 2, 0]], [[5, 10, 1], [5, 10, 1], [5, 10, 1]]],
                         [[[5, 6, 7], [2, 3, 4], [2, 1, 0]], [[0, 0, 0], [0, 0, 0], [0, 0, 0]]]],
                        dtype=np.float32)
    targets = np.array([[0, 1], [0, 2]], dtype=np.int32)
    masks = np.array([[1, 1], [1, 0]], dtype=np.float32)

    l, p, a = fn(energies, targets, masks)
    print l
    print p
    print a
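# --- Illustrative sketch (not part of the original scripts) ---
# The compiled CRF functions above expect energies of shape
# [batch, length, num_labels, num_labels] plus int32 targets and float masks of
# shape [batch, length] (float32 assuming floatX=float32). A minimal helper to
# check those contracts before a call; the helper name is made up for illustration.
import numpy as np

def check_crf_inputs(energies, targets, masks):
    batch, length, n_prev, n_cur = energies.shape
    assert n_prev == n_cur, "last two axes must both have size num_labels"
    assert targets.shape == (batch, length) and targets.dtype == np.int32
    assert masks.shape == (batch, length) and masks.dtype == np.float32
    return batch, length, n_cur
# --- end sketch ---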
def loss_from_embedding(char_emb, word_emb, deterministic=False, return_all=True):
    # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
    energies = Lyrs.get_output(bilstm_bilstm_crf,
                               inputs={char_in_layer: char_emb, word_in_layer: word_emb},
                               deterministic=deterministic)
    loss = crf_loss(energies, target_var, mask_var).mean()
    if return_all:
        predict, corr = crf_accuracy(energies, target_var)
        # count correct predictions only on non-padded tokens
        corr = (corr * mask_var).sum(dtype=theano.config.floatX)
        return loss, predict, corr
    else:
        return loss
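# --- Illustrative sketch (not part of the original scripts) ---
# loss_from_embedding sums per-token correctness only where the mask is 1, so
# padded positions never count. The same computation in plain NumPy, with made-up
# per-token correctness values:
import numpy as np

corr = np.array([[1, 0, 1], [1, 1, 1]], dtype=np.float32)   # hypothetical 0/1 correctness per token
mask = np.array([[1, 1, 1], [1, 1, 0]], dtype=np.float32)   # last token of sentence 2 is padding
num_tokens = mask.sum()
accuracy = (corr * mask).sum() / num_tokens
print accuracy   # 0.8: four correct out of five real tokens
# --- end sketch ---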
def main(): parser = argparse.ArgumentParser( description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument( '--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument( '--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument( '--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args = parser.parse_args() def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = lasagne.layers.EmbeddingLayer( layer_input, input_size=alphabet_size, output_size=embedd_dim, W=embedd_table, name='embedding') return layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') return layer_input def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) return layer_char_input logger = utils.get_logger("BiLSTM-CNN-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes num_filters = args.num_filters gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ embedd_table, 
label_alphabet, \ C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path, use_character=True) num_labels = label_alphabet.size() - 1 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # construct input and mask layers layer_incoming1 = construct_char_input_layer() layer_incoming2 = construct_input_layer() layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bi-rnn-cnn num_units = args.num_units bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf) energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True) loss_train = crf_loss(energies_train, target_var, mask_var).mean() loss_eval = crf_loss(energies_eval, target_var, mask_var).mean() # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params( bi_lstm_cnn_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = crf_accuracy(energies_train, target_var) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval]) # Finally, launch the training loop. logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." 
\ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 50 best_loss = 1e+12 best_acc = 0.0 best_epoch_loss = 0 best_epoch_acc = 0 best_loss_test_err = 0. best_loss_test_corr = 0. best_acc_test_err = 0. best_acc_test_corr = 0. stop_count = 0 lr = learning_rate patience = args.patience for epoch in range(1, num_epochs + 1): print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate) logger.info('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)) train_err = 0.0 train_corr = 0.0 train_total = 0 train_inst = 0 start_time = time.time() num_back = 0 train_batches = 0 for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train, batch_size=batch_size, shuffle=True): inputs, targets, masks, char_inputs = batch err, corr, num = train_fn(inputs, targets, masks, char_inputs) train_err += err * inputs.shape[0] train_corr += corr train_total += num train_inst += inputs.shape[0] train_batches += 1 time_ave = (time.time() - start_time) / train_batches time_left = (num_batches - train_batches) * time_ave # update log #sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / train_inst, train_corr * 100 / train_total, time_left) #sys.stdout.write(log_info) num_back = len(log_info) logger.info(log_info) # update training log after each epoch assert train_inst == num_data # sys.stdout.write("\b" * num_back) # print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % ( # min(train_batches * batch_size, num_data), num_data, # train_err / num_data, train_corr * 100 / train_total, time.time() - start_time) logger.info('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (min(train_batches * batch_size, num_data), num_data, train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)) #evaluate performance on dev data dev_err = 0.0 dev_corr = 0.0 dev_total = 0 dev_inst = 0 for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) dev_err += err * inputs.shape[0] dev_corr += corr dev_total += num dev_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp3/dev%d' % epoch, label_alphabet, is_flattened=False) # print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total) logger.info('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)) logger.info( 'dev_err: %.4f, best_loss: %.4f, best_acc: %.4f, dev_corr: %.4f, dev_total: %.4f, (dev_corr/dev_total): %.4f' % (dev_err, best_loss, best_acc, dev_corr, dev_total, dev_corr / dev_total)) if best_loss < dev_err and best_acc > dev_corr / dev_total: stop_count += 1 else: update_loss = False update_acc = False stop_count = 0 if best_loss > dev_err: update_loss = True best_loss = dev_err best_epoch_loss = epoch if best_acc < dev_corr / dev_total: update_acc = True best_acc = dev_corr / dev_total best_epoch_acc = epoch # # evaluate on test data when better performance detected # test_err = 0.0 # test_corr = 0.0 # test_total = 0 # test_inst = 0 # for batch in 
utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, # batch_size=batch_size): # inputs, targets, masks, char_inputs = batch # err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) # test_err += err * inputs.shape[0] # test_corr += corr # test_total += num # test_inst += inputs.shape[0] # if output_predict: # utils.output_predictions(predictions, targets, masks, 'tmp3/test%d' % epoch, label_alphabet, # is_flattened=False) # # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # # test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total) # logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)) # if update_loss: # best_loss_test_err = test_err # best_loss_test_corr = test_corr # if update_acc: # best_acc_test_err = test_err # best_acc_test_corr = test_corr logger.info('stop_count: %.4f' % (stop_count)) # stop if dev acc decrease 3 time straightly. if stop_count == patience: break # re-compile a function with new learning rate for training if update_algo != 'adadelta': lr = learning_rate / (1.0 + epoch * decay_rate) updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum) train_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # evaluate on test data when better performance detected test_err = 0.0 test_corr = 0.0 test_total = 0 test_inst = 0 for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) test_err += err * inputs.shape[0] test_corr += corr test_total += num test_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp4/test%d' % epoch, label_alphabet, is_flattened=False) # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total) logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)) if update_loss: best_loss_test_err = test_err best_loss_test_corr = test_corr if update_acc: best_acc_test_err = test_err best_acc_test_corr = test_corr # print best performance on test data. logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss) logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc) # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total) logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)) # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( # best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total) logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total))
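# --- Illustrative sketch (not part of the original scripts) ---
# The training loop above anneals the learning rate as
#   lr = learning_rate / (1.0 + epoch * decay_rate)
# and then recompiles train_fn with the new rate (skipped for adadelta). The
# schedule in isolation, with the script defaults learning_rate=0.1, decay_rate=0.1:
def decayed_lr(learning_rate, decay_rate, epoch):
    return learning_rate / (1.0 + epoch * decay_rate)

for epoch in range(1, 6):
    print epoch, decayed_lr(0.1, 0.1, epoch)
# epoch 1 -> 0.0909..., epoch 5 -> 0.0666...
# --- end sketch ---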
def main(): parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument('--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument('--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument('--test') # "data/POS-penn/wsj/split1/wsj1.test.original" args = parser.parse_args() def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size, output_size=embedd_dim, W=embedd_table, name='embedding') return layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') return layer_input def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) return layer_char_input logger = utils.get_logger("BiLSTM-CNN-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes num_filters = args.num_filters gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ embedd_table, 
label_alphabet, \ C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path, use_character=True) num_labels = label_alphabet.size() - 1 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # construct input and mask layers layer_incoming1 = construct_char_input_layer() layer_incoming2 = construct_input_layer() layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bi-rnn-cnn num_units = args.num_units bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf) energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True) loss_train = crf_loss(energies_train, target_var, mask_var).mean() loss_eval = crf_loss(energies_eval, target_var, mask_var).mean() # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = crf_accuracy(energies_train, target_var) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval]) # Finally, launch the training loop. logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." 
\ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 1000 best_loss = 1e+12 best_acc = 0.0 best_epoch_loss = 0 best_epoch_acc = 0 best_loss_test_err = 0. best_loss_test_corr = 0. best_acc_test_err = 0. best_acc_test_corr = 0. stop_count = 0 lr = learning_rate patience = args.patience for epoch in range(1, num_epochs + 1): print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate) train_err = 0.0 train_corr = 0.0 train_total = 0 train_inst = 0 start_time = time.time() num_back = 0 train_batches = 0 for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train, batch_size=batch_size, shuffle=True): inputs, targets, masks, char_inputs = batch err, corr, num = train_fn(inputs, targets, masks, char_inputs) train_err += err * inputs.shape[0] train_corr += corr train_total += num train_inst += inputs.shape[0] train_batches += 1 time_ave = (time.time() - start_time) / train_batches time_left = (num_batches - train_batches) * time_ave # update log sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / train_inst, train_corr * 100 / train_total, time_left) sys.stdout.write(log_info) num_back = len(log_info) # update training log after each epoch assert train_inst == num_data sys.stdout.write("\b" * num_back) print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / num_data, train_corr * 100 / train_total, time.time() - start_time) # evaluate performance on dev data dev_err = 0.0 dev_corr = 0.0 dev_total = 0 dev_inst = 0 for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) dev_err += err * inputs.shape[0] dev_corr += corr dev_total += num dev_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet, is_flattened=False) print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total) if best_loss < dev_err and best_acc > dev_corr / dev_total: stop_count += 1 else: update_loss = False update_acc = False stop_count = 0 if best_loss > dev_err: update_loss = True best_loss = dev_err best_epoch_loss = epoch if best_acc < dev_corr / dev_total: update_acc = True best_acc = dev_corr / dev_total best_epoch_acc = epoch # evaluate on test data when better performance detected test_err = 0.0 test_corr = 0.0 test_total = 0 test_inst = 0 for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) test_err += err * inputs.shape[0] test_corr += corr test_total += num test_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet, is_flattened=False) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total) if update_loss: best_loss_test_err = test_err best_loss_test_corr = test_corr if update_acc: 
best_acc_test_err = test_err best_acc_test_corr = test_corr # stop if dev acc decrease 3 time straightly. if stop_count == patience: break # re-compile a function with new learning rate for training if update_algo != 'adadelta': lr = learning_rate / (1.0 + epoch * decay_rate) updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum) train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # print best performance on test data. logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total) logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
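# --- Illustrative sketch (not part of the original scripts) ---
# The early-stopping rule used above: the patience counter grows only when an
# epoch improves neither the best dev loss nor the best dev accuracy, and is
# reset otherwise; training stops once it reaches `patience`. The same rule
# replayed on made-up dev numbers:
def should_stop(dev_errs, dev_accs, patience):
    best_loss, best_acc, stop_count = 1e+12, 0.0, 0
    for epoch, (dev_err, dev_acc) in enumerate(zip(dev_errs, dev_accs), 1):
        if best_loss < dev_err and best_acc > dev_acc:
            stop_count += 1
        else:
            stop_count = 0
            best_loss = min(best_loss, dev_err)
            best_acc = max(best_acc, dev_acc)
        if stop_count == patience:
            return epoch  # epoch at which training would stop
    return None

print should_stop([5.0, 4.0, 4.5, 4.6, 4.7, 4.8, 4.9],
                  [0.90, 0.92, 0.91, 0.91, 0.91, 0.91, 0.91], patience=5)  # -> 7
# --- end sketch ---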
def main(): parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', default='true', action='store_true', help='Output predictions to temp files') parser.add_argument('--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument('--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument('--test') # "data/POS-penn/wsj/split1/wsj1.test.original" parser.add_argument("--model") # model name args = parser.parse_args() def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size, output_size=embedd_dim, W=embedd_table, name='embedding') return layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') return layer_input def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) return layer_char_input logger = utils.get_logger("BiLSTM-CNN-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes num_filters = args.num_filters gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout modelname = args.model # 读取数据训练集,dev集,测试集 
X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ embedd_table, label_alphabet, word_alphabet, \ C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path, use_character=True) print 'label_alphabet' for i in range(label_alphabet.size()): print i print label_alphabet.get_instance(i) # print Y_test, Y_test.shape; sys.exit(1) my_size = data_processor.MAX_LENGTH_TRAIN my_size = data_processor.MY_MAX_LENGTH print "\tMY_SIZE", my_size, data_processor.MAX_LENGTH_TRAIN # my_size = data_processor.MAX_LENGTH_DEV print "\tMYSIZE", my_size num_labels = label_alphabet.size() - 1 # 构建网络 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # 构建输入层 # construct input and mask layers logger.info("construct input and mask layers...") layer_incoming1 = construct_char_input_layer() layer_incoming2 = construct_input_layer() layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bi-rnn-cnn logger.info("construct bi-rnn-cnn...") num_units = args.num_units bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf) energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True) loss_train = crf_loss(energies_train, target_var, mask_var).mean() # print loss_train; sys.exit(1) loss_eval = crf_loss(energies_eval, target_var, mask_var).mean() # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = crf_accuracy(energies_train, target_var) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. 
batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval]) # Finally, launch the training loop. logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 1000 # num_epochs = 1 best_loss = 1e+12 best_acc = 0.0 best_epoch_loss = 0 best_epoch_acc = 0 best_loss_test_err = 0. best_loss_test_corr = 0. best_acc_test_err = 0. best_acc_test_corr = 0. stop_count = 0 lr = learning_rate patience = args.patience for epoch in range(1, num_epochs + 1): print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate) train_err = 0.0 train_corr = 0.0 train_total = 0 train_inst = 0 start_time = time.time() num_back = 0 train_batches = 0 for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train, batch_size=batch_size, shuffle=True): inputs, targets, masks, char_inputs = batch err, corr, num = train_fn(inputs, targets, masks, char_inputs) train_err += err * inputs.shape[0] train_corr += corr train_total += num train_inst += inputs.shape[0] train_batches += 1 time_ave = (time.time() - start_time) / train_batches time_left = (num_batches - train_batches) * time_ave # update log sys.stdout.write("\b" * num_back) log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / train_inst, train_corr * 100 / train_total, time_left) sys.stdout.write(log_info) num_back = len(log_info) # update training log after each epoch assert train_inst == num_data sys.stdout.write("\b" * num_back) print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err / num_data, train_corr * 100 / train_total, time.time() - start_time) # evaluate performance on dev data dev_err = 0.0 dev_corr = 0.0 dev_total = 0 dev_inst = 0 test_err_sentence = 0 my_f1 = {} my_prs = [] my_trs = [] for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) # print "-->HERE COMES THE PREDS",predictions # print predictions.shape,type(predictions); for i in xrange(batch_size): try: input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])] except IndexError: continue if len(input_clear) == 0: continue try: dev_size = input_clear.index(None) except ValueError: dev_size = my_size # print dev_size my_trs += list(targets[i, :dev_size]) my_prs += list(predictions[i, :dev_size]) # for j in xrange(len(targets[i,:dev_size])): # pr = predictions[i,j] # tr = 
targets[i,j] # my_f1[(pr,tr)] = my_f1.get((pr,tr),0)+1 # input_clear = [word_alphabet.get_instance(y) for y in list(inputs[0,:])] # print input_clear # my_f1 = f1_score(my_trs,my_prs,average="macro") # print [label_alphabet.get_instance(y+1) for y in list(targets[0,:])] # print targets[0,:],predictions[0,:],inputs.shape[0],my_f1; sys.exit(1) # print err,inputs.shape[0] dev_err += err * inputs.shape[0] dev_corr += corr dev_total += num dev_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet, is_flattened=False) dev_f1 = f1_score(my_trs, my_prs, average="macro") classify_report = metrics.classification_report(my_trs, my_prs) print 'dev classify_report' print classify_report print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%, f1: %.4f' % ( dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total, dev_f1) # CHANGE THIS IF NECESSARY # MODEL SELECTION ON DEV CRITERION, SE useF1 = True useLoss = False criterion = dev_f1 if useF1 else dev_corr / dev_total if best_loss < dev_err and best_acc > criterion: stop_count += 1 else: update_loss = False update_acc = False stop_count = 0 if best_loss > dev_err: update_loss = True best_loss = dev_err best_epoch_loss = epoch if best_acc < criterion: update_acc = True best_acc = criterion best_epoch_acc = epoch else: if useLoss == False: continue # evaluate on test data when better performance detected test_err = 0.0 test_corr = 0.0 test_total = 0 test_inst = 0 test_err_sentence = 0 test_sentences = 0 print "#SAVING MODEL" np.savez(modelname, *lasagne.layers.get_all_param_values(bi_lstm_cnn_crf)) test_prs = [] test_trs = [] for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) # print "-->HERE COMES THE PREDS",predictions # print predictions.shape,type(predictions); for i in xrange(batch_size): try: input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])] except IndexError: continue if len(input_clear) == 0: continue try: test_size = input_clear.index(None) except ValueError: test_size = my_size # print dev_size test_trs += list(targets[i, :test_size]) test_prs += list(predictions[i, :test_size]) # print predictions # SE # print "AAA",inputs[0],len(inputs[0]),inputs[0][0] input_clear = [word_alphabet.get_instance(y) for x in inputs for y in list(x)] # predictions,dir(label_alphabet),label_alphabet.get_instance(4) # SE target_clear = [label_alphabet.get_instance(y + 1) for x in targets for y in list(x)] target_clear_pred = [label_alphabet.get_instance(y + 1) for x in predictions for y in list(x)] # SE # print my_size # comment this out # my_size = 652 # print input_clear; sys.exit(1) # print input_clear for ii in range(batch_size): Z = input_clear[ii * my_size:(ii + 1) * my_size] if len(Z) == 0: continue try: size = Z.index(None) except ValueError: size = my_size # print size itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size] EMPTY = "EMPTY" EMPTY = "EMPTY_EMPTY" otruth = filter(lambda z: z != EMPTY, target_clear[ii * my_size:(ii + 1) * my_size][:size]) opred = filter(lambda z: z != EMPTY, target_clear_pred[ii * my_size:(ii + 1) * my_size][:size]) if otruth == opred: test_err_sentence += 1 # print "CORRECT",itruth,otruth,opred print "#CORRECT %%%", len(itruth), len(opred), len(otruth) printout(itruth, opred, otruth) print else: print "#WRONG %%%" # 
,itruth,otruth,opred printout(itruth, opred, otruth) print test_sentences += 1 test_err += err * inputs.shape[0] test_corr += corr test_total += num test_inst += inputs.shape[0] if output_predict: utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet, is_flattened=False) test_f1 = f1_score(test_trs, test_prs, average="macro") test_classify_report = metrics.classification_report(test_trs, test_prs) print 'label_alphabet' for i in range(label_alphabet.size()): print i print label_alphabet.get_instance(i) print 'Epoch %d test classify_report' % epoch print test_classify_report print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%% f1: %.4f' % ( test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total, test_f1), test_err_sentence * 1.0 / test_sentences, test_err_sentence, test_sentences if update_loss: best_loss_test_err = test_err best_loss_test_corr = test_corr if update_acc: best_acc_test_err = test_err best_acc_test_corr = test_corr # stop if dev acc decrease 3 time straightly. if stop_count == patience: break # re-compile a function with new learning rate for training if update_algo != 'adadelta': lr = learning_rate / (1.0 + epoch * decay_rate) updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum) train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # print best performance on test data. logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total) logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
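# --- Illustrative sketch (not part of the original scripts) ---
# Checkpointing as done in these scripts: parameters are dumped with np.savez
# (arrays end up as arr_0, arr_1, ...) and restored with set_all_param_values.
# `network` stands for an already-built Lasagne output layer such as bi_lstm_cnn_crf.
import numpy as np
import lasagne

def save_model(network, path):
    np.savez(path, *lasagne.layers.get_all_param_values(network))

def load_model(network, path):
    with np.load(path) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(network, param_values)
# --- end sketch ---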
def main(): parser = argparse.ArgumentParser( description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument( '--train') # "data/POS-penn/wsj/split1/wsj1.train.original" parser.add_argument( '--dev') # "data/POS-penn/wsj/split1/wsj1.dev.original" parser.add_argument( '--test') # "data/POS-penn/wsj/split1/wsj1.test.original" parser.add_argument('--realtest') parser.add_argument('--mymodel') args = parser.parse_args() def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = lasagne.layers.EmbeddingLayer( layer_input, input_size=alphabet_size, output_size=embedd_dim, W=embedd_table, name='embedding') return layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') return layer_input def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) return layer_char_input logger = utils.get_logger("BiLSTM-CNN-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test real_test_path = args.realtest update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes num_filters = args.num_filters gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout 
mymodel = args.mymodel print "Model is", mymodel, test_path, real_test_path X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, _X_real_test, _Y_real_test, _mask_real_test, \ embedd_table, label_alphabet, word_alphabet, \ C_train, C_dev, C_test, _C_real_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, test_path, oov=oov, fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path, use_character=True) _X_train, _Y_train, _mask_train, _X_dev, _Y_dev, _mask_dev, _X_test, _Y_test, _mask_test, X_real_test, Y_real_test, mask_real_test, \ _embedd_table, _label_alphabet, _word_alphabet, \ _C_train, _C_dev, _C_test, C_real_test, _char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, real_test_path, oov=oov,fine_tune=fine_tune, embedding=embedding, embedding_path=embedding_path,use_character=True) #print _C_train.shape,_C_dev.shape,_C_test.shape,C_real_test.shape; #sys.exit(1) my_size = data_processor.MAX_LENGTH_TRAIN my_size = data_processor.MY_MAX_LENGTH #my_size = data_processor.MAX_LENGTH_DEV print "\tMYSIZE", my_size, C_real_test.shape, C_test.shape, C_train.shape num_labels = label_alphabet.size() - 1 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # construct input and mask layers layer_incoming1 = construct_char_input_layer() layer_incoming2 = construct_input_layer() layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bi-rnn-cnn num_units = args.num_units bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) # bi_lstm_cnn_crf = None logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) # compute loss num_tokens = mask_var.sum(dtype=theano.config.floatX) # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels] energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf) energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True) loss_train = crf_loss(energies_train, target_var, mask_var).mean() loss_eval = crf_loss(energies_eval, target_var, mask_var).mean() # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params( bi_lstm_cnn_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty _, corr_train = crf_accuracy(energies_train, target_var) corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX) prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var) corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX) # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. 
batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval]) my_prediction_eval = my_crf_accuracy(energies_eval) my_eval_fn = theano.function([input_var, mask_var, char_input_var], [my_prediction_eval]) # Finally, launch the training loop. logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 1000 best_loss = 1e+12 best_acc = 0.0 best_epoch_loss = 0 best_epoch_acc = 0 best_loss_test_err = 0. best_loss_test_corr = 0. best_acc_test_err = 0. best_acc_test_corr = 0. stop_count = 0 lr = learning_rate patience = args.patience print "#LOADING MODEL" #np.savez("model.npz",*lasagne.layers.get_all_param_values(bi_lstm_cnn_crf)) # just load the data, see here: # https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py #try: mymodel = sys.argv[1] #except IndexError: # mymodel = "models.npz" with np.load(mymodel) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(bi_lstm_cnn_crf, param_values) correct = 0 total = 0 print dir(bi_lstm_cnn_crf) #print bi_lstm_cnn_crf.predict([1,2,3,4]) #sys.exit(1) print X_real_test.shape, Y_real_test.shape, C_real_test.shape, mask_real_test.shape # that's a stupid hack #C_real_test = C_real_test[:len(X_real_test)] #print X_real_test[0:1] #print my_eval_fn(X_real_test[0:1],mask_real_test[0:1],C_real_test[0:1]) #sys.exit(1) for batch in utils.iterate_minibatches(X_real_test, Y_real_test, masks=mask_real_test, char_inputs=C_real_test, batch_size=batch_size): inputs, targets, masks, char_inputs = batch #print inputs,targets,masks,char_inputs; sys.exit(1) err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) #print predictions # SE input_clear = [ word_alphabet.get_instance(y) for x in inputs for y in list(x) ] # predictions,dir(label_alphabet),label_alphabet.get_instance(4) # SE target_clear = [ label_alphabet.get_instance(y + 1) for x in targets for y in list(x) ] target_clear_pred = [ label_alphabet.get_instance(y + 1) for x in predictions for y in list(x) ] # SE #print my_size # comment this out #my_size = 557 #print input_clear; sys.exit(1) for ii in range(batch_size): Z = input_clear[ii * my_size:(ii + 1) * my_size] if len(Z) == 0: continue try: size = Z.index(None) except ValueError: size = my_size #print size itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size] otruth = filter( lambda z: z != "EMPTY", target_clear[ii * my_size:(ii + 1) * my_size][:size]) opred = filter( lambda z: z != "EMPTY", target_clear_pred[ii * my_size:(ii + 1) * my_size][:size]) total += len(opred) correct += len( filter(lambda x: x == True, [otruth[jj] == opred[jj] for jj in 
xrange(len(opred))])) if otruth == opred: #test_err_sentence += 1 #print "CORRECT",itruth,otruth,opred #print "#CORRECT %%%" printout(itruth, opred, otruth) print else: #print "#WRONG %%%" #itruth,otruth,opred printout(itruth, opred, otruth) print print correct, total, correct * 1.0 / total
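# --- Illustrative sketch (not part of the original scripts) ---
# The final evaluation above compares predicted and gold label strings position by
# position after dropping the padding label ("EMPTY"), then reports token accuracy.
# The same bookkeeping on made-up label sequences:
EMPTY = "EMPTY"

def token_accuracy(gold_seqs, pred_seqs):
    correct, total = 0, 0
    for gold, pred in zip(gold_seqs, pred_seqs):
        gold = [y for y in gold if y != EMPTY]
        pred = [y for y in pred if y != EMPTY]
        total += len(pred)
        correct += sum(1 for g, p in zip(gold, pred) if g == p)
    return correct, total, correct * 1.0 / max(total, 1)  # guard against empty input

print token_accuracy([["B-NP", "I-NP", "O", EMPTY]], [["B-NP", "O", "O", EMPTY]])  # (2, 3, 0.666...)
# --- end sketch ---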