Example #1
0
    def __init__(self, name, keep_growing=True):
        """Create an alphabet (instance <-> index mapping).

        :param name: identifier for this alphabet.
        :param keep_growing: when True, unseen instances may be added later.
        """
        self.__name = name
        self.keep_growing = keep_growing

        # Bidirectional mapping between instances and integer indices.
        self.instances = []
        self.instance2index = {}

        # Index 0 is reserved as the default entry; real entries start at 1.
        self.default_index = 0
        self.next_index = 1

        self.logger = utils.get_logger('Alphabet')
Example #2
0
    def __init__(self, name, keep_growing=True):
        """Initialize an alphabet that maps instances to integer indices.

        :param name: identifier for this alphabet.
        :param keep_growing: when True, unseen instances may be added later.
        """
        self.__name = name
        self.keep_growing = keep_growing

        # Forward and reverse lookup structures.
        self.instance2index = {}
        self.instances = []

        # Slot 0 is reserved as the default; subsequent entries start at 1.
        self.default_index = 0
        self.next_index = 1

        self.logger = utils.get_logger('Alphabet')
Example #3
0
    def __init__(self, name, keep_growing=True):
        """Initialize an alphabet that also tracks per-index frequencies.

        :param name: identifier for this alphabet.
        :param keep_growing: when True, unseen instances may be added later.
        """
        self.__name = name
        self.keep_growing = keep_growing

        # Lookup structures: list position <-> instance id.
        self.instance2index = {}
        self.instances = []

        # Frequency counter per index; position 0 seeds <unknown> with 0.
        self.vocab_freqs = [0]

        # Index 0 is reserved for <unknown>; real entries begin at 1.
        self.default_index = 0
        self.next_index = 1

        self.logger = utils.get_logger('Alphabet')
Example #4
0
def main():
    """Train and evaluate a bi-directional LSTM-CNN-CRF sequence labeler.

    Parses command-line hyper-parameters, loads the sequence-labeling
    dataset (with character inputs), builds the Theano/Lasagne network,
    runs a training loop with dev-set early stopping, and finally
    evaluates on the test set.

    NOTE(review): this is Python 2 code (print statements; integer
    division used for ``num_batches``) -- confirm the target interpreter.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune the word embeddings')
    parser.add_argument('--embedding',
                        choices=['word2vec', 'glove', 'senna', 'random'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict',
                        default=None,
                        help='path for embedding dict')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters',
                        type=int,
                        default=20,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma',
                        type=float,
                        default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--peepholes',
                        action='store_true',
                        help='Peepholes for LSTM')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help='Embedding for oov word',
                        required=True)
    parser.add_argument('--update',
                        choices=['sgd', 'momentum', 'nesterov', 'adadelta'],
                        help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular',
                        choices=['none', 'l2'],
                        help='regularization for training',
                        required=True)
    parser.add_argument('--dropout',
                        action='store_true',
                        help='Apply dropout layers')
    parser.add_argument('--patience',
                        type=int,
                        default=5,
                        help='Patience for early stopping')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        """Build the word-level input.

        With ``--fine_tune`` the layer takes integer word ids and embeds
        them with a trainable table; otherwise precomputed embedding
        vectors are fed straight into an InputLayer.
        Reads ``fine_tune``, ``max_length``, ``input_var``,
        ``alphabet_size``, ``embedd_dim``, ``embedd_table`` from the
        enclosing scope (set below, after arguments are parsed).
        """
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var,
                                                    name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(
                layer_input,
                input_size=alphabet_size,
                output_size=embedd_dim,
                W=embedd_table,
                name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length,
                                                           embedd_dim),
                                                    input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        """Build the character-level input pipeline.

        Reshapes (batch, sent_len, char_len) ids to 2-D, embeds the
        characters, then dimshuffles to put the embedding dimension on
        axis 1 (channel-first layout for the CNN that consumes it).
        Reads ``max_sent_length``, ``max_char_length``, ``char_input_var``,
        ``char_alphabet_size``, ``char_embedd_dim``, ``char_embedd_table``
        from the enclosing scope.
        """
        layer_char_input = lasagne.layers.InputLayer(shape=(None,
                                                            max_sent_length,
                                                            max_char_length),
                                                     input_var=char_input_var,
                                                     name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                          pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    # Load all splits plus embedding tables and alphabets in one call.
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, \
    C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                              test_path, oov=oov,
                                                                                              fine_tune=fine_tune,
                                                                                              embedding=embedding,
                                                                                              embedding_path=embedding_path,
                                                                                              use_character=True)
    # presumably index 0 of the label alphabet is a reserved default,
    # hence the -1 -- verify against the Alphabet implementation.
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        # Fine tuning: inputs are integer word ids; embeddings are learned.
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        # Frozen embeddings: inputs are precomputed embedding vectors.
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    # Word and character inputs must describe the same sentences.
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var,
                                           name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units

    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1,
                                           layer_incoming2,
                                           num_units,
                                           num_labels,
                                           mask=layer_mask,
                                           grad_clipping=grad_clipping,
                                           peepholes=peepholes,
                                           num_filters=num_filters,
                                           dropout=dropout)

    logger.info("Network structure: hidden=%d, filter=%d" %
                (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf,
                                              deterministic=True)

    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Token-level correctness, masked so padding does not count.
    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    # adadelta manages its own effective step size, so start from 1.0.
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train,
                                   params,
                                   update_algo,
                                   learning_rate,
                                   momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_train, corr_train, num_tokens],
        updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    # NOTE(review): Python 2 integer division assumed here; under Python 3
    # this would be a float.
    num_batches = num_data / batch_size
    num_epochs = 50
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr,
                                                                    decay_rate)
        logger.info('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' %
                    (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train,
                                               Y_train,
                                               masks=mask_train,
                                               char_inputs=C_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            # Losses are averaged per batch, so re-weight by batch size.
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            #sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data, train_err
                / train_inst, train_corr * 100 / train_total, time_left)
            #sys.stdout.write(log_info)
            num_back = len(log_info)
            logger.info(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        # sys.stdout.write("\b" * num_back)
        # print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
        #     min(train_batches * batch_size, num_data), num_data,
        #     train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)
        logger.info('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' %
                    (min(train_batches * batch_size,
                         num_data), num_data, train_err / num_data,
                     train_corr * 100 / train_total, time.time() - start_time))

        #evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev,
                                               Y_dev,
                                               masks=mask_dev,
                                               char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         'tmp3/dev%d' % epoch,
                                         label_alphabet,
                                         is_flattened=False)

        # print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        #     dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)
        logger.info('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                    (dev_err / dev_inst, dev_corr, dev_total,
                     dev_corr * 100 / dev_total))
        logger.info(
            'dev_err: %.4f, best_loss: %.4f, best_acc: %.4f, dev_corr: %.4f, dev_total: %.4f, (dev_corr/dev_total): %.4f'
            % (dev_err, best_loss, best_acc, dev_corr, dev_total,
               dev_corr / dev_total))

        # Early-stopping bookkeeping: count an epoch against patience only
        # when BOTH dev loss and dev accuracy got worse than the best seen.
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            # NOTE(review): update_loss/update_acc are (re)defined only in
            # this branch; the first epoch always takes it (best_loss starts
            # at 1e12), so the later reads below never see them unbound, but
            # they reflect the most recent improving epoch only.
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # # evaluate on test data when better performance detected
            # test_err = 0.0
            # test_corr = 0.0
            # test_total = 0
            # test_inst = 0
            # for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
            #                                        batch_size=batch_size):
            #     inputs, targets, masks, char_inputs = batch
            #     err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            #     test_err += err * inputs.shape[0]
            #     test_corr += corr
            #     test_total += num
            #     test_inst += inputs.shape[0]
            #     if output_predict:
            #         utils.output_predictions(predictions, targets, masks, 'tmp3/test%d' % epoch, label_alphabet,
            #                                  is_flattened=False)

            # # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            # #     test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)
            # logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            #     test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total))
            # if update_loss:
            #     best_loss_test_err = test_err
            #     best_loss_test_corr = test_corr
            # if update_acc:
            #     best_acc_test_err = test_err
            #     best_acc_test_corr = test_corr

        logger.info('stop_count: %.4f' % (stop_count))
        # stop if dev acc decrease 3 time straightly.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train,
                                           params,
                                           update_algo,
                                           lr,
                                           momentum=momentum)
            train_fn = theano.function(
                [input_var, target_var, mask_var, char_input_var],
                [loss_train, corr_train, num_tokens],
                updates=updates)

    # evaluate on test data when better performance detected
    test_err = 0.0
    test_corr = 0.0
    test_total = 0
    test_inst = 0
    for batch in utils.iterate_minibatches(X_test,
                                           Y_test,
                                           masks=mask_test,
                                           char_inputs=C_test,
                                           batch_size=batch_size):
        inputs, targets, masks, char_inputs = batch
        err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                              char_inputs)
        test_err += err * inputs.shape[0]
        test_corr += corr
        test_total += num
        test_inst += inputs.shape[0]
        if output_predict:
            # 'epoch' here is the last value from the training loop above.
            utils.output_predictions(predictions,
                                     targets,
                                     masks,
                                     'tmp4/test%d' % epoch,
                                     label_alphabet,
                                     is_flattened=False)

    # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
    #     test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)
    logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                (test_err / test_inst, test_corr, test_total,
                 test_corr * 100 / test_total))
    if update_loss:
        best_loss_test_err = test_err
        best_loss_test_corr = test_corr
    if update_acc:
        best_acc_test_err = test_err
        best_acc_test_corr = test_corr

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" %
                best_epoch_loss)
    logger.info("final best acc test performance (at epoch %d)" %
                best_epoch_acc)

    # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
    #     best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)
    logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                (best_loss_test_err / test_inst, best_loss_test_corr,
                 test_total, best_loss_test_corr * 100 / test_total))

    # print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
    #     best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
    logger.info('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                (best_acc_test_err / test_inst, best_acc_test_corr, test_total,
                 best_acc_test_corr * 100 / test_total))
Example #5
0
def main():
    """Run a dropout / expectation-linearization experiment.

    Parses hyper-parameters, loads a train/test image dataset, builds a
    CNN with Lasagne, trains with an optional expectation-linear penalty
    (weighted by ``--delta``) and optional L2 regularization, evaluating
    on the test set after every epoch and reporting the best result.

    NOTE(review): Python 2 code (print statements; integer division for
    ``num_batches``). The argparse description says "mnist" while the
    logger and loader suggest CIFAR-100 -- confirm which dataset this is.
    """
    parser = argparse.ArgumentParser(description='dropout experiments on mnist')
    parser.add_argument('--num_epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=64, help='Number of instances in each batch')
    parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.01, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for L-norm regularization')
    parser.add_argument('--delta', type=float, default=0.0, help='weight for expectation-linear regularization')
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')

    args = parser.parse_args()

    logger = utils.get_logger("CIFAR-100")
    regular = args.regular
    update_algo = args.update
    gamma = args.gamma
    delta = args.delta

    # Load the dataset
    logger.info("Loading data...")
    X_train, y_train, X_test, y_test = load_dataset_wo_val()
    num_data, _, _, _ = X_train.shape

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    logger.info("Building model and compiling functions...")
    network = build_dnn(input_var=input_var)

    # get prediction
    # Two deterministic outputs are built: one paired with the stochastic
    # (dropout) output for the expectation-linear penalty, one for eval.
    prediction_train = lasagne.layers.get_output(network)
    prediction_train_det = lasagne.layers.get_output(network, deterministic=True)
    prediction_eval = lasagne.layers.get_output(network, deterministic=True)

    # compute loss
    loss_train_org = lasagne.objectives.categorical_crossentropy(prediction_train, target_var)
    loss_train_org = loss_train_org.mean()

    # Expectation-linear penalty: squared gap between stochastic and
    # deterministic predictions, summed over classes, averaged over batch.
    loss_train_expect_linear = lasagne.objectives.squared_error(prediction_train, prediction_train_det)
    loss_train_expect_linear = loss_train_expect_linear.sum(axis=1)
    loss_train_expect_linear = loss_train_expect_linear.mean()

    loss_train = loss_train_org + delta * loss_train_expect_linear
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(network, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var)
    loss_eval = loss_eval.mean()

    # calculate number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var)
    corr_train = corr_train.sum(dtype=theano.config.floatX)

    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var)
    corr_eval = corr_eval.sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    # adadelta manages its own effective step size, so start from 1.0.
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)
    # Max-norm constraint on the named conv/dense weight matrices.
    params_constraint = utils.get_all_params_by_name(network,
                                                     name=['cnn1.W', 'cnn2.W', 'cnn3.W', 'dense1.W', 'dense2.W'])
    assert len(params_constraint) == 5
    for param in params_constraint:
        updates[param] = lasagne.updates.norm_constraint(updates[param], max_norm=4.0)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var],
                               [loss_train, loss_train_org, loss_train_expect_linear, corr_train], updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var], [loss_eval, corr_eval])

    logger.info(
        "Start training: %s with regularization: %s(%f) (#epoch: %d, #training data: %d, batch size: %d, delta: %f)..." \
        % (update_algo, regular, (0.0 if regular == 'none' else gamma), num_epochs, num_data, batch_size, delta))

    # NOTE(review): Python 2 integer division assumed here.
    num_batches = num_data / batch_size
    lr = learning_rate
    patience = args.patience
    best_test_epoch = 0
    best_test_err = 0.
    best_test_corr = 0.
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_err_org = 0.0
        train_err_linear = 0.0
        train_corr = 0.0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            inputs, targets = batch
            err, err_org, err_linear, corr = train_fn(inputs, targets)
            # Losses are per-batch means, so re-weight by batch size.
            train_err += err * inputs.shape[0]
            train_err_org += err_org * inputs.shape[0]
            train_err_linear += err_linear * inputs.shape[0]
            train_corr += corr
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            # Backspace over the previous progress line, then overwrite it.
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_err_org / train_inst, train_err_linear / train_inst,
                train_corr * 100 / train_inst, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, loss_org: %.4f, loss_linear: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_err_org / num_data, train_err_linear / num_data, train_corr * 100 / num_data,
            time.time() - start_time)

        # evaluate on test data
        test_err = 0.0
        test_corr = 0.0
        test_inst = 0
        for batch in iterate_minibatches(X_test, y_test, batch_size):
            inputs, targets = batch
            err, corr = eval_fn(inputs, targets)
            test_err += err * inputs.shape[0]
            test_corr += corr
            test_inst += inputs.shape[0]

        # Track the epoch with the highest test accuracy.
        if best_test_corr < test_corr:
            best_test_epoch = epoch
            best_test_corr = test_corr
            best_test_err = test_err

        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_inst, test_corr, test_inst, test_corr * 100 / test_inst)
        print 'best test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            best_test_err / test_inst, best_test_corr, test_inst, best_test_corr * 100 / test_inst)

        # re-compile a function with new learning rate for training
        # NOTE(review): 'patience' is parsed above but never used in this
        # loop -- there is no early stopping here; confirm that is intended.
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            params_constraint = utils.get_all_params_by_name(network,
                                                             name=['cnn1.W', 'cnn2.W', 'cnn3.W', 'dense1.W', 'dense2.W'])
            assert len(params_constraint) == 5
            for param in params_constraint:
                updates[param] = lasagne.updates.norm_constraint(updates[param], max_norm=4.0)

            train_fn = theano.function([input_var, target_var],
                                       [loss_train, loss_train_org, loss_train_expect_linear, corr_train],
                                       updates=updates)

    # print last and best performance on test data.
    logger.info("final test performance (at epoch %d)" % num_epochs)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        test_err / test_inst, test_corr, test_inst, test_corr * 100 / test_inst)
    logger.info("final best acc test performance (at epoch %d)" % best_test_epoch)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_test_err / test_inst, best_test_corr, test_inst, best_test_corr * 100 / test_inst)
Example #6
0
def main():
    """Train and evaluate a bi-directional LSTM-CNN-CRF sequence labeler.

    Parses command-line options, loads CoNLL-style train/dev/test data,
    builds a char+word BiLSTM-BiLSTM-CRF network (Theano/Lasagne), then
    runs the training loop with per-epoch evaluation on dev and test.
    The three best dev accuracies (and the test scores at those epochs)
    are tracked and reported at the end.  Optionally applies adversarial
    training on the embeddings (--adv) and reloads saved parameters
    (--reload).  Written for Python 2.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune the word embeddings')
    parser.add_argument(
        '--embedding',
        choices=['word2vec', 'glove', 'senna', 'random', 'polyglot'],
        help='Embedding for words',
        required=True)
    parser.add_argument('--embedding_dict',
                        default=None,
                        help='path for embedding dict')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters',
                        type=int,
                        default=20,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma',
                        type=float,
                        default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--peepholes',
                        action='store_true',
                        help='Peepholes for LSTM')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help='Embedding for oov word',
                        required=True)
    parser.add_argument(
        '--update',
        choices=['sgd', 'momentum', 'nesterov', 'adadelta', 'adam'],
        help='update algorithm',
        default='sgd')
    parser.add_argument('--regular',
                        choices=['none', 'l2'],
                        help='regularization for training',
                        required=True)
    parser.add_argument('--dropout',
                        action='store_true',
                        help='Apply dropout layers')
    parser.add_argument('--patience',
                        type=int,
                        default=5,
                        help='Patience for early stopping')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')
    parser.add_argument('--exp_dir')
    parser.add_argument('--adv', type=float, default=0)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--reload', default=None, help='path for reloading')

    args = parser.parse_args()
    # Seed numpy and route lasagne's RNG through it for reproducible runs.
    np.random.seed(args.seed)
    lasagne.random.set_rng(np.random)

    def construct_input_layer():
        """Build the word-level input layer.

        Returns a layer producing [batch, max_sent_length, embedd_dim]:
        when fine_tune is set, an index input fed through a trainable
        normalized embedding table; otherwise a pre-embedded float input.
        """
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var,
                                                    name='input')
            layer_embedding = Normalized_EmbeddingLayer(
                layer_input,
                input_size=alphabet_size,
                output_size=embedd_dim,
                vocab_freqs=word_freqs,
                W=embedd_table,
                name='embedding')
            raw_layer = layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length,
                                                           embedd_dim),
                                                    input_var=input_var,
                                                    name='input')
            raw_layer = layer_input

        return raw_layer  # [batch, max_sent_length, embedd_dim]

    def construct_char_input_layer():
        """Build the character-level input and embedding.

        Reshapes [batch, max_sent_length, max_char_length] character
        indices to one row per word, then embeds each character index.
        """
        layer_char_input = lasagne.layers.InputLayer(shape=(None,
                                                            max_sent_length,
                                                            max_char_length),
                                                     input_var=char_input_var,
                                                     name='char-input')
        layer_char_input = lasagne.layers.reshape(
            layer_char_input,
            (-1, [2]))  # [batch * max_sent_length, max_char_length]
        layer_char_embedding = Normalized_EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet_size,
            output_size=char_embedd_dim,
            vocab_freqs=char_freqs,
            W=char_embedd_table,
            name='char_embedding'
        )  # [n_examples, max_char_length, char_embedd_dim]

        #layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # [n_examples, char_embedd_dim, max_char_length]
        return layer_char_embedding

    logger = utils.get_logger("BiLSTM-BiLSTM-CRF")
    # Unpack frequently-used arguments into locals.
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    # Experiment layout: exp_dir/{save,eval}; the task is inferred from the
    # directory name prefix (e.g. "pos_..." -> 'pos').
    exp_dir = args.exp_dir
    if not os.path.isdir(exp_dir): os.mkdir(exp_dir)
    exp_name = exp_dir.split('/')[-1]
    exp_mode = exp_name.split('_')[0]  # 'pos' or 'ner', etc.

    save_dir = os.path.join(exp_dir, 'save')
    eval_dir = os.path.join(exp_dir, 'eval')
    if not os.path.isdir(save_dir): os.mkdir(save_dir)
    if not os.path.isdir(eval_dir): os.mkdir(eval_dir)
    eval_script = "./conlleval"

    # Column layout of the input files depends on the task.
    if exp_mode == 'pos':
        (word_col_in_data, label_col_in_data) = (0, 1)
    elif exp_mode == 'ner':
        (word_col_in_data, label_col_in_data) = (0, 3)
    elif exp_mode == 'chunk':
        (word_col_in_data, label_col_in_data) = (0, 2)
    else:
        (word_col_in_data, label_col_in_data) = (1, 3)  # assume CoNLL-U style

    # load data
    # NOTE(review): fine_tune=True is hard-coded in this call even when
    # --fine_tune was not given -- confirm this is intended.
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    (embedd_table, word_freqs), label_alphabet, \
    C_train, C_dev, C_test, (char_embedd_table, char_freqs) = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                test_path, word_col_in_data, label_col_in_data,
                label_name=exp_mode, oov=oov,
                fine_tune=True,
                embedding=embedding, embedding_path=embedding_path,
                use_character=True)
    # presumably excludes the alphabet's reserved default index 0 -- TODO confirm
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # prepare initial input layer and embeddings
    # Embeddings are computed separately and fed back into the network
    # through plain InputLayers so that perturbed embeddings can be
    # substituted in loss_from_embedding (used by adversarial training).
    char_layer = construct_char_input_layer()
    word_layer = construct_input_layer()
    char_emb = Lyrs.get_output(char_layer)
    word_emb = Lyrs.get_output(word_layer)

    # construct input and mask layers
    char_in_layer = Lyrs.InputLayer(shape=(None, max_char_length,
                                           char_embedd_dim))
    word_in_layer = Lyrs.InputLayer(shape=(None, max_length, embedd_dim))

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var,
                                           name='mask')

    # construct bilstm_bilstm_crf
    num_units = args.num_units
    num_filters = args.num_filters
    logger.info("Network structure: hidden=%d, filter=%d" %
                (num_units, num_filters))

    bilstm_bilstm_crf = build_BiLSTM_BiLSTM_CRF(char_in_layer,
                                                word_in_layer,
                                                num_units,
                                                num_labels,
                                                mask=layer_mask,
                                                grad_clipping=grad_clipping,
                                                peepholes=peepholes,
                                                num_filters=num_filters,
                                                dropout=dropout)

    # compute loss
    def loss_from_embedding(char_emb,
                            word_emb,
                            deterministic=False,
                            return_all=True):
        """Compute the mean CRF loss from given char/word embedding
        tensors; with return_all, also return predictions and the masked
        count of correctly labeled tokens."""
        # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
        energies = Lyrs.get_output(bilstm_bilstm_crf,
                                   inputs={
                                       char_in_layer: char_emb,
                                       word_in_layer: word_emb
                                   },
                                   deterministic=deterministic)
        loss = crf_loss(energies, target_var, mask_var).mean()
        if return_all:
            predict, corr = crf_accuracy(energies, target_var)
            corr = (corr * mask_var).sum(dtype=theano.config.floatX)
            return loss, predict, corr
        else:
            return loss

    loss_eval, prediction_eval, corr_eval = loss_from_embedding(
        char_emb, word_emb, deterministic=True)
    loss_train_ori, _, corr_train = loss_from_embedding(char_emb, word_emb)

    if args.adv:
        logger.info('Preparing adversarial training...')
        loss_train_adv = adversarial_loss(char_emb,
                                          word_emb,
                                          loss_from_embedding,
                                          loss_train_ori,
                                          perturb_scale=args.adv)
        loss_train = (loss_train_ori + loss_train_adv) / 2.0
    else:
        # A constant zero adversarial loss keeps the train_fn output
        # signature identical whether or not --adv is used.
        loss_train_adv = T.as_tensor_variable(
            np.asarray(0.0, dtype=theano.config.floatX))
        loss_train = loss_train_ori + loss_train_adv

    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            bilstm_bilstm_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    # adadelta manages its own effective step size, so lr is fixed at 1.0.
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = Lyrs.get_all_params(
        bilstm_bilstm_crf, trainable=True) + Lyrs.get_all_params(
            char_layer, trainable=True) + Lyrs.get_all_params(word_layer,
                                                              trainable=True)
    updates = utils.create_updates(loss_train,
                                   params,
                                   update_algo,
                                   learning_rate,
                                   momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_train_ori, loss_train_adv, corr_train, num_tokens],
        updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_eval, corr_eval, num_tokens, prediction_eval])

    # reload saved model
    # The slicing assumes word/char layers each contribute exactly one
    # parameter tensor, matching the np.savez order used below
    # (word_layer, char_layer, bilstm_bilstm_crf) -- TODO confirm.
    if args.reload is not None:
        logger.info('Reloading saved parameters from %s ...\n' % args.reload)
        with np.load(args.reload) as f:
            param_values = [f['arr_%d' % j] for j in range(len(f.files))]
        Lyrs.set_all_param_values(word_layer, param_values[0:1])
        Lyrs.set_all_param_values(char_layer, param_values[1:2])
        Lyrs.set_all_param_values(bilstm_bilstm_crf, param_values[2:])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s) ..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    # Top-3 dev accuracies; each slot remembers its epoch and the test
    # error/correct-count observed at that epoch.
    best_acc = np.array([0.0, 0.0, 0.0])
    best_epoch_acc = np.array([0, 0, 0])
    best_acc_test_err = np.array([0.0, 0.0, 0.0])
    best_acc_test_corr = np.array([0.0, 0.0, 0.0])
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print
        print 'Epoch %d (learning rate=%.7f, decay rate=%.4f): ' % (epoch, lr,
                                                                    decay_rate)
        train_err_ori = 0.0
        train_err_adv = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0

        epoch_save_dir = os.path.join(save_dir, 'epoch%d' % epoch)
        os.mkdir(epoch_save_dir)

        for batch in utils.iterate_minibatches(X_train,
                                               Y_train,
                                               masks=mask_train,
                                               char_inputs=C_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err_ori, err_adv, corr, num = train_fn(inputs, targets, masks,
                                                   char_inputs)
            train_err_ori += err_ori * inputs.shape[0]
            train_err_adv += err_adv * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            # NOTE(review): (num_batches // 10) is 0 when there are fewer
            # than 10 batches, which makes this modulo raise
            # ZeroDivisionError -- confirm datasets are always large enough.
            if train_batches % (num_batches // 10) == 0:
                log_info = 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time left: %.2fs\n' % (
                    min(train_batches * batch_size, num_data), num_data,
                    train_err_ori / train_inst, train_err_adv / train_inst,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()

                # save the parameter values
                #param_values = Lyrs.get_all_param_values(bilstm_bilstm_crf)
                #np.savez(epoch_save_dir + '/iter%d.npz' % train_batches, *param_values)

        # save the parameter values (order must match the --reload slicing
        # above: word_layer, char_layer, then the network)
        param_values = Lyrs.get_all_param_values(
            word_layer) + Lyrs.get_all_param_values(
                char_layer) + Lyrs.get_all_param_values(bilstm_bilstm_crf)
        np.savez(epoch_save_dir + '/final.npz', *param_values)

        # update training log after each epoch
        assert train_inst == num_data
        print 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err_ori / train_inst, train_err_adv / train_inst,
            train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev,
                                               Y_dev,
                                               masks=mask_dev,
                                               char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/dev%d' % epoch
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         output_file,
                                         label_alphabet,
                                         is_flattened=False)

        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total,
            dev_corr * 100 / dev_total)

        #update_loss = False
        # Track the three best dev accuracies; stop_count increments only
        # when the current dev accuracy is below all three.
        update_acc = False
        if best_acc.min() > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_acc.min() < dev_corr / dev_total:
                update_acc = True
                idx_to_update = best_acc.argmin()
                best_acc[idx_to_update] = dev_corr / dev_total
                best_epoch_acc[idx_to_update] = epoch
                # the matching test scores are recorded after the test pass

        # evaluate on test data
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        test_inst = 0
        for batch in utils.iterate_minibatches(X_test,
                                               Y_test,
                                               masks=mask_test,
                                               char_inputs=C_test,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            test_err += err * inputs.shape[0]
            test_corr += corr
            test_total += num
            test_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/test%d' % epoch
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         output_file,
                                         label_alphabet,
                                         is_flattened=False)

        # print out test result
        if stop_count > 0:
            print '(cf.',
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_inst, test_corr, test_total,
            test_corr * 100 / test_total),
        if output_predict and exp_mode in ['ner', 'chunk']:
            # Run the external conlleval script on the last prediction file
            # and pull the F1 value out of its summary line.
            stdout = subprocess.check_output([eval_script],
                                             stdin=open(output_file))
            f1_score = stdout.split("\n")[1].split()[7]  # this is string
            print ", f1:", f1_score
        else:
            print
        sys.stdout.flush()

        if update_acc:
            best_acc_test_err[idx_to_update] = test_err
            best_acc_test_corr[idx_to_update] = test_corr

        # stop once dev accuracy has stagnated for `patience` epochs in a row
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        # (adam/adadelta adapt their own step sizes and are left alone;
        # a negative decay_rate selects the alternative schedule that
        # halves the rate after every 3 consecutive non-improving epochs)
        if update_algo not in ['adam', 'adadelta']:
            if decay_rate >= 0:
                lr = learning_rate / (1.0 + epoch * decay_rate)
            else:
                if stop_count > 0 and stop_count % 3 == 0:
                    learning_rate /= 2.0
                    lr = learning_rate
            updates = utils.create_updates(loss_train,
                                           params,
                                           update_algo,
                                           lr,
                                           momentum=momentum)
            train_fn = theano.function(
                [input_var, target_var, mask_var, char_input_var],
                [loss_train_ori, loss_train_adv, corr_train, num_tokens],
                updates=updates)

    # print best performance on test data.
    for i in range(len(best_epoch_acc)):
        logger.info("final best acc test performance (at epoch %d)" %
                    best_epoch_acc[i])
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            best_acc_test_err[i] / test_inst, best_acc_test_corr[i],
            test_total, best_acc_test_corr[i] * 100 / test_total)
Example #7
0
__author__ = 'max'

import numpy as np
import theano

from alphabet import Alphabet
from lasagne_nlp.utils import utils as utils

# Special tokens and size limits used by the data-loading helpers.
root_symbol = "##ROOT##"  # sentinel token; presumably marks a sentence root -- verify against callers
root_label = "<ROOT>"  # label paired with root_symbol -- TODO confirm
word_end = "##WE##"  # presumably an end-of-word marker for character sequences -- verify
MAX_LENGTH = 120  # assumed maximum sentence length in tokens -- TODO confirm padding behavior
MAX_CHAR_LENGTH = 45  # assumed maximum word length in characters -- TODO confirm
logger = utils.get_logger("LoadData")  # module-level logger


def read_conll_sequence_labeling(path,
                                 word_alphabet,
                                 label_alphabet,
                                 word_column=1,
                                 label_column=1):
    """
    read data from file in conll format
    :param path: file path
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param word_alphabet: alphabet of words
    :param label_alphabet: alphabet of labels
    :return: sentences of words and labels, sentences of indexes of words and labels.
    """
    # NOTE(review): the function body is not present in this chunk; only
    # the signature and docstring are visible here.
Example #8
0
__author__ = 'max'

import numpy as np
import theano

from alphabet import Alphabet
from lasagne_nlp.utils import utils as utils

# Special tokens and size limits used by the data-loading helpers.
root_symbol = "##ROOT##"  # sentinel token; presumably marks a sentence root -- verify against callers
root_label = "<ROOT>"  # label paired with root_symbol -- TODO confirm
word_end = "##WE##"  # presumably an end-of-word marker for character sequences -- verify
MAX_LENGTH = 125  # assumed maximum sentence length in tokens -- TODO confirm padding behavior
MAX_CHAR_LENGTH = 45  # assumed maximum word length in characters -- TODO confirm
logger = utils.get_logger("LoadData")  # module-level logger


def read_conll_sequence_labeling(path, word_alphabet, label_alphabet, word_column=1, label_column=4):
    """
    read data from file in conll format
    :param path: file path
    :param word_column: the column index of word (start from 0)
    :param label_column: the column of label (start from 0)
    :param word_alphabet: alphabet of words
    :param label_alphabet: alphabet of labels
    :return: sentences of words and labels, sentences of indexes of words and labels.
    """

    # Per-sentence accumulators: surface words/labels and their alphabet
    # indexes (the rest of the body is truncated in this chunk).
    word_sentences = []
    label_sentences = []

    word_index_sentences = []
Example #9
0
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune the word embeddings')
    parser.add_argument('--embedding',
                        choices=['word2vec', 'glove', 'senna'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument(
        '--embedding_dict',
        default='data/word2vec/GoogleNews-vectors-negative300.bin',
        help='path for embedding dict')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma',
                        type=float,
                        default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--peepholes',
                        action='store_true',
                        help='Peepholes for LSTM')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help='Embedding for oov word',
                        required=True)
    parser.add_argument('--update',
                        choices=['sgd', 'momentum', 'nesterov'],
                        help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular',
                        choices=['none', 'l2'],
                        help='regularization for training',
                        required=True)
    parser.add_argument('--dropout',
                        action='store_true',
                        help='Apply dropout layers')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var,
                                                    name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(
                layer_input,
                input_size=alphabet_size,
                output_size=embedd_dim,
                W=embedd_table,
                name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length,
                                                           embedd_dim),
                                                    input_var=input_var,
                                                    name='input')
            return layer_input

    logger = utils.get_logger("BiLSTM")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                             test_path, oov=oov,
                                                                                             fine_tune=fine_tune,
                                                                                             embedding=embedding,
                                                                                             embedding_path=embedding_path)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var,
                                           name='mask')

    # construct bi-lstm
    num_units = args.num_units
    bi_lstm = build_BiLSTM(layer_incoming,
                           num_units,
                           mask=layer_mask,
                           grad_clipping=grad_clipping,
                           peepholes=peepholes,
                           dropout=dropout)

    # reshape bi-rnn to [batch * max_length, num_units]
    bi_lstm = lasagne.layers.reshape(bi_lstm, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(
        bi_lstm,
        num_units=num_labels,
        nonlinearity=nonlinearities.softmax,
        name='softmax')

    # get output of bi-rnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output,
                                                deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flat target_var to vector
    target_var_flatten = target_var.flatten()
    # flat mask_var to vector
    mask_var_flatten = mask_var.flatten()

    # compute loss
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use mean of loss over number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(
        prediction_train, target_var_flatten)
    loss_train = (loss_train *
                  mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    loss_eval = lasagne.objectives.categorical_crossentropy(
        prediction_eval, target_var_flatten)
    loss_eval = (loss_eval *
                 mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels
    corr_train = lasagne.objectives.categorical_accuracy(
        prediction_train, target_var_flatten)
    corr_train = (corr_train *
                  mask_var_flatten).sum(dtype=theano.config.floatX)

    corr_eval = lasagne.objectives.categorical_accuracy(
        prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train,
                                   params,
                                   update_algo,
                                   learning_rate,
                                   momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var],
                               [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function(
        [input_var, target_var, mask_var],
        [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' %
              (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train,
                                               Y_train,
                                               masks=mask_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data, train_err
                / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' %
              (min(train_batches * batch_size,
                   num_data), num_data, train_err / train_total,
               train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev,
                                               Y_dev,
                                               masks=mask_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks,
                                         'tmp/dev%d' % epoch, label_alphabet)

        print('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
              (dev_err / dev_total, dev_corr, dev_total,
               dev_corr * 100 / dev_total))

        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            for batch in utils.iterate_minibatches(X_test,
                                                   Y_test,
                                                   masks=mask_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, _ = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks)
                test_err += err * num
                test_corr += corr
                test_total += num
                if output_predict:
                    utils.output_predictions(predictions, targets, masks,
                                             'tmp/test%d' % epoch,
                                             label_alphabet)

            print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
                  (test_err / test_total, test_corr, test_total,
                   test_corr * 100 / test_total))

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # stop if dev acc decrease 3 time straightly.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train,
                                       params,
                                       update_algo,
                                       lr,
                                       momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var],
                                   [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" %
                (best_epoch_loss))
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
          (best_loss_test_err / test_total, best_loss_test_corr, test_total,
           best_loss_test_corr * 100 / test_total))
    logger.info("final best acc test performance (at epoch %d)" %
                (best_epoch_acc))
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' %
          (best_acc_test_err / test_total, best_acc_test_corr, test_total,
           best_acc_test_corr * 100 / test_total))
# ---- Example #10 ----
def main():
    """Train and evaluate a bi-directional RNN sequence labeler.

    Parses command-line options, loads train/dev/test data, builds a BiRNN
    with a softmax output layer, then runs the training loop: per-epoch
    evaluation on dev data, evaluation on test data whenever dev performance
    improves, a per-epoch learning-rate decay, and early stopping when dev
    performance fails to improve for `patience` consecutive epochs.
    """
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default='data/word2vec/GoogleNews-vectors-negative300.bin',
                        help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in RNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov'], help='update algorithm', default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training',
                        required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    def construct_input_layer():
        # With fine tuning the inputs are token indices routed through a
        # trainable embedding layer; otherwise pre-computed embedding
        # vectors are fed in directly.
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim, W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    logger = utils.get_logger("BiRNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                             test_path, oov=oov,
                                                                                             fine_tune=fine_tune,
                                                                                             embedding=embedding,
                                                                                             embedding_path=embedding_path)
    # index 0 of the alphabet is reserved (unknown/padding), hence -1
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn
    num_units = args.num_units
    bi_rnn = build_BiRNN(layer_incoming, num_units, mask=layer_mask, grad_clipping=grad_clipping,
                         dropout=dropout)

    # reshape bi-rnn to [batch * max_length, num_units]
    bi_rnn = lasagne.layers.reshape(bi_rnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_rnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
                                             name='softmax')

    # get output of bi-rnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output, deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flat target_var to vector
    target_var_flatten = target_var.flatten()
    # flat mask_var to vector
    mask_var_flatten = mask_var.flatten()

    # compute loss: the mask zeroes out padding positions, so num_loss is
    # the number of real (non-padding) labels
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use mean of loss over number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train, target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval, target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(dtype=theano.config.floatX) / num_loss

    # compute number of correct labels (masked to real positions only)
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train, target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)

    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval, target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss], updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var, mask_var], [loss_eval, corr_eval, num_loss, final_prediction])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f)..." \
        % (
        update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping))
    # '//' keeps this an int (same result as '/' on Python 2 ints, and
    # avoids a float batch count under Python 3)
    num_batches = num_data // batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5
    for epoch in range(1, num_epochs + 1):
        # print() with a single parenthesized argument is valid on both
        # Python 2 and Python 3 (the original used Python-2-only
        # print statements, a SyntaxError under Python 3)
        print('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, batch_size=batch_size, shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log in place: erase the previous line with backspaces
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_total, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        sys.stdout.write("\b" * num_back)
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total, time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet)

        print('dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_total, dev_corr, dev_total, dev_corr * 100 / dev_total))

        # count a "bad" epoch only when BOTH dev loss and dev accuracy got worse
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, batch_size=batch_size):
                inputs, targets, masks, _ = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks)
                test_err += err * num
                test_corr += corr
                test_total += num
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet)

            print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_total, test_corr, test_total, test_corr * 100 / test_total))

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # early-stop when dev performance got worse `patience` epochs in a row
        if stop_count == patience:
            break

        # re-compile a function with new (decayed) learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var], [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_total, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total))
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print('test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_total, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total))
# ---- Example #11 ----
def main():
    """Train and evaluate a bi-directional RNN sequence labeler.

    Reads options via read_args(), loads train/dev/test data, builds a BiRNN
    with a softmax output layer, then runs the training loop: per-epoch
    evaluation on dev data, evaluation on test data whenever dev performance
    improves, a per-epoch learning-rate decay, early stopping when dev
    performance fails to improve for `patience` consecutive epochs, and
    finally writes the test-set predictions to 'tmp/final_test'.
    """
    args = read_args()

    def construct_input_layer():
        # With fine tuning the inputs are token indices routed through a
        # trainable embedding layer; otherwise pre-computed embedding
        # vectors are fed in directly.
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(
                shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(
                layer_input, input_size=alphabet_size, output_size=embedd_dim,
                W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(
                shape=(None, max_length, embedd_dim), input_var=input_var,
                name='input')
            return layer_input

    logger = utils.get_logger("BiRNN")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, \
    mask_test, embedd_table, label_alphabet, _, _, _, _ = data_processor.load_dataset_sequence_labeling(
        train_path, dev_path,
        test_path, oov=oov,
        fine_tune=fine_tune,
        embedding=embedding,
        embedding_path=embedding_path)
    # index 0 of the alphabet is reserved (unknown/padding), hence -1
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape

    # construct input and mask layers
    layer_incoming = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var, name='mask')

    # construct bi-rnn
    num_units = args.num_units
    bi_rnn = build_BiRNN(layer_incoming, num_units, mask=layer_mask,
                         grad_clipping=grad_clipping, dropout=dropout)

    # reshape bi-rnn to [batch * max_length, num_units]
    bi_rnn = lasagne.layers.reshape(bi_rnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(
        bi_rnn, num_units=num_labels, nonlinearity=nonlinearities.softmax,
        name='softmax')

    # get output of bi-rnn shape=[batch * max_length, #label]
    prediction_train = lasagne.layers.get_output(layer_output)
    prediction_eval = lasagne.layers.get_output(layer_output,
                                                deterministic=True)
    final_prediction = T.argmax(prediction_eval, axis=1)

    # flat target_var to vector
    target_var_flatten = target_var.flatten()
    # flat mask_var to vector
    mask_var_flatten = mask_var.flatten()

    # compute loss: the mask zeroes out padding positions, so num_loss is
    # the number of real (non-padding) labels
    num_loss = mask_var_flatten.sum(dtype=theano.config.floatX)
    # for training, we use mean of loss over number of labels
    loss_train = lasagne.objectives.categorical_crossentropy(prediction_train,
                                                             target_var_flatten)
    loss_train = (loss_train * mask_var_flatten).sum(
        dtype=theano.config.floatX) / num_loss
    ############################################
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            layer_output, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    loss_eval = lasagne.objectives.categorical_crossentropy(prediction_eval,
                                                            target_var_flatten)
    loss_eval = (loss_eval * mask_var_flatten).sum(
        dtype=theano.config.floatX) / num_loss

    # compute number of correct labels (masked to real positions only)
    corr_train = lasagne.objectives.categorical_accuracy(prediction_train,
                                                         target_var_flatten)
    corr_train = (corr_train * mask_var_flatten).sum(dtype=theano.config.floatX)

    corr_eval = lasagne.objectives.categorical_accuracy(prediction_eval,
                                                        target_var_flatten)
    corr_eval = (corr_eval * mask_var_flatten).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(layer_output, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo,
                                   learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var],
                               [loss_train, corr_train, num_loss],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var, mask_var],
                              [loss_eval, corr_eval, num_loss,
                               final_prediction])

    # Finally, launch the training loop.
    log_start(batch_size, dropout, fine_tune, gamma, grad_clipping, logger,
              num_data, regular, update_algo)
    num_epochs = args.epochs
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = 5

    safe_mkdir('tmp')
    for epoch in range(1, num_epochs + 1):
        logger.info('Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (
            epoch, lr, decay_rate))
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        start_time = time.time()
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train,
                                               masks=mask_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, _ = batch
            err, corr, num = train_fn(inputs, targets, masks)
            # err is the mean per-label loss; weight by num to sum correctly
            train_err += err * num
            train_corr += corr
            train_total += num
            train_batches += 1

        # update training log after each epoch
        print('train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / train_total, train_corr * 100 / train_total,
            time.time() - start_time))

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, _ = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks)
            dev_err += err * num
            dev_corr += corr
            dev_total += num
            if output_predict:
                utils.output_predictions(predictions, targets, masks,
                                         'tmp/dev%d' % epoch, label_alphabet)

        log_loss('dev', dev_corr, dev_err, dev_total, logger)

        # count a "bad" epoch only when BOTH dev loss and dev accuracy got worse
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            for batch in utils.iterate_minibatches(X_test, Y_test,
                                                   masks=mask_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, _ = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks)
                test_err += err * num
                test_corr += corr
                test_total += num
                if output_predict:
                    utils.output_predictions(predictions, targets, masks,
                                             'tmp/test%d' % epoch,
                                             label_alphabet)

            log_loss('test', test_corr, test_err, test_total, logger)

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # early-stop when dev performance got worse `patience` epochs in a row
        if stop_count == patience:
            break

        # re-compile a function with new (decayed) learning rate for training
        lr = learning_rate / (1.0 + epoch * decay_rate)
        updates = utils.create_updates(loss_train, params, update_algo, lr,
                                       momentum=momentum)
        train_fn = theano.function([input_var, target_var, mask_var],
                                   [loss_train, corr_train, num_loss],
                                   updates=updates)

    # print best performance on test data.
    logger.info(
        "final best loss test performance (at epoch %d)" % best_epoch_loss)
    log_loss('best loss in test', corr=best_loss_test_corr,
             error=best_loss_test_err, total=test_total, logger=logger)
    logger.info(
        "final best acc test performance (at epoch %d)" % best_epoch_acc)
    log_loss('best accuracy in test', corr=best_acc_test_corr,
             error=best_acc_test_err, total=test_total, logger=logger)

    # Log last predictions
    # Compile a third function evaluating the final predictions only
    predict_fn = theano.function([input_var, mask_var],
                                 [final_prediction],
                                 allow_input_downcast=True)
    predictions = predict_fn(X_test, mask_test)[0]
    # BUG FIX: the original passed Y_dev here, writing test predictions
    # against dev gold labels (with test masks) — use the test labels.
    utils.output_predictions(predictions, Y_test, mask_test,
                             'tmp/final_test', label_alphabet)
# ---- Example #12 ----
def main():
    """Train and evaluate a bi-directional LSTM-CNN-CRF sequence labeler.

    Parses hyper-parameters from the command line, loads the train/dev/test
    data (word- and character-level inputs), builds the network with
    Theano/Lasagne, runs a training loop with early stopping on dev
    performance, and logs the best test results.

    Python 2 script: relies on ``print`` statements and integer-division
    semantics.  Depends on project modules ``utils``, ``data_processor``
    and the ``crf_loss`` / ``crf_accuracy`` / ``build_BiLSTM_CNN_CRF``
    helpers defined elsewhere in this file/package.
    """
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    # NOTE: the two helpers below close over names (fine_tune, input_var,
    # max_length, embedd_table, ...) that are only bound later in main(),
    # before the helpers are actually called.
    def construct_input_layer():
        # Word-level input: either an index matrix feeding a trainable
        # embedding layer (fine-tune mode), or pre-computed embedding
        # vectors fed directly as a 3-D tensor.
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim,
                                                            W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        # Character-level input: flatten (batch, sent_len, char_len) to
        # (batch * sent_len, char_len), embed each character id, then move
        # the embedding dimension in front of the character positions so a
        # 1-D convolution can slide over characters.
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    # Load train/dev/test datasets, including character-level inputs.
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, \
    C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                              test_path, oov=oov,
                                                                                              fine_tune=fine_tune,
                                                                                              embedding=embedding,
                                                                                              embedding_path=embedding_path,
                                                                                              use_character=True)
    # presumably index 0 of the alphabet is a reserved default/unknown
    # entry, hence the -1 — TODO confirm against the Alphabet class.
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    # Word- and character-level inputs must describe the same sentences.
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units

    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask,
                                           grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters,
                                           dropout=dropout)

    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels]
    # Two output expressions: stochastic (dropout on) for training,
    # deterministic for evaluation.
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)

    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Token-level correctness counts, restricted to unmasked positions.
    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    # Adadelta is scale-invariant, so force its base learning rate to 1.0.
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    # Python 2 integer division: only used for the time-left estimate.
    num_batches = num_data / batch_size
    num_epochs = 1000
    # Best-so-far trackers: one set keyed on dev loss, one on dev accuracy.
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0  # length of the last progress line, for backspace erase
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            # Loss is a per-batch mean, so weight it by the batch size.
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log: erase the previous progress line and rewrite it
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet,
                                         is_flattened=False)

        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total)

        # The stop counter only advances when BOTH dev loss and dev accuracy
        # got worse; any improvement resets it and triggers a test-set pass.
        if best_loss < dev_err and best_acc > dev_corr / dev_total:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < dev_corr / dev_total:
                update_acc = True
                best_acc = dev_corr / dev_total
                best_epoch_acc = epoch

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)
                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet,
                                             is_flattened=False)

            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
                test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total)

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # Early stopping: break after `patience` consecutive epochs in
        # which dev performance failed to improve.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        # (adadelta manages its own effective step size, so skip it there;
        # recompiling a Theano function each epoch is expensive but simple).
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                        [loss_train, corr_train, num_tokens],
                                        updates=updates)

    # print best performance on test data.
    # test_inst/test_total here hold the counts from the most recent test
    # pass; the first epoch always runs one, so they are defined.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
Example #13
0
def main():
    """Train a bi-directional LSTM-CNN-CRF sequence labeler with F1-based
    model selection.

    Variant of the standard training script: it additionally computes
    macro F1 and a per-class classification report on dev/test, performs
    sentence-level exact-match evaluation on the test set, and saves the
    model parameters (``np.savez``) whenever the dev selection criterion
    improves.

    Python 2 script (``print`` statements, ``xrange``, ``filter``
    returning a list).  Depends on project helpers ``utils``,
    ``data_processor``, ``printout``, ``crf_loss``, ``crf_accuracy`` and
    ``build_BiLSTM_CNN_CRF``.
    """
    parser = argparse.ArgumentParser(description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'random'], help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict', default=None, help='path for embedding dict')
    parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch')
    parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN')
    parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping')
    parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization')
    parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM')
    parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True)
    parser.add_argument('--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta'], help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True)
    parser.add_argument('--dropout', action='store_true', help='Apply dropout layers')
    parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
    # NOTE(review): default='true' combined with action='store_true' means
    # this option is ALWAYS truthy (the string 'true' when absent, True
    # when given) — likely a bug; the intended default is probably False.
    parser.add_argument('--output_prediction', default='true', action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    parser.add_argument("--model")  # model name

    args = parser.parse_args()

    # NOTE: the two helpers below close over names (fine_tune, input_var,
    # max_length, embedd_table, ...) that are only bound later in main(),
    # before the helpers are actually called.
    def construct_input_layer():
        # Word-level input: either an index matrix feeding a trainable
        # embedding layer (fine-tune mode), or pre-computed embedding
        # vectors fed directly as a 3-D tensor.
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(layer_input, input_size=alphabet_size,
                                                            output_size=embedd_dim,
                                                            W=embedd_table, name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        # Character-level input: flatten (batch, sent_len, char_len) to
        # (batch * sent_len, char_len), embed each character id, then move
        # the embedding dimension in front of the character positions so a
        # 1-D convolution can slide over characters.
        layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length),
                                                     input_var=char_input_var, name='char-input')
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char_input, input_size=char_alphabet_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout
    modelname = args.model
    # Load the train / dev / test datasets (this variant also returns the
    # word alphabet so raw words can be recovered for per-sentence output).
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    embedd_table, label_alphabet, word_alphabet, \
    C_train, C_dev, C_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                              test_path, oov=oov,
                                                                                              fine_tune=fine_tune,
                                                                                              embedding=embedding,
                                                                                              embedding_path=embedding_path,
                                                                                              use_character=True)
    # Debug dump of the label alphabet (index -> label string).
    print 'label_alphabet'
    for i in range(label_alphabet.size()):
        print i
        print label_alphabet.get_instance(i)

    # print Y_test, Y_test.shape; sys.exit(1)
    # NOTE(review): the first assignment is dead — my_size is immediately
    # overwritten with MY_MAX_LENGTH on the next line.
    my_size = data_processor.MAX_LENGTH_TRAIN
    my_size = data_processor.MY_MAX_LENGTH
    print "\tMY_SIZE", my_size, data_processor.MAX_LENGTH_TRAIN
    # my_size = data_processor.MAX_LENGTH_DEV
    print "\tMYSIZE", my_size
    # presumably index 0 of the alphabet is a reserved default/unknown
    # entry, hence the -1 — TODO confirm against the Alphabet class.
    num_labels = label_alphabet.size() - 1
    # Build the network
    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    # Word- and character-level inputs must describe the same sentences.
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)
    # Build the input layers
    # construct input and mask layers
    logger.info("construct input and mask layers...")
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask')

    # construct bi-rnn-cnn
    logger.info("construct bi-rnn-cnn...")
    num_units = args.num_units

    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1, layer_incoming2, num_units, num_labels, mask=layer_mask,
                                           grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters,
                                           dropout=dropout)

    logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels]
    # Two output expressions: stochastic (dropout on) for training,
    # deterministic for evaluation.
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf, deterministic=True)

    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    # print loss_train; sys.exit(1)
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Token-level correctness counts, restricted to unmasked positions.
    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    # Adadelta is scale-invariant, so force its base learning rate to 1.0.
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function([input_var, target_var, mask_var, char_input_var], [loss_train, corr_train, num_tokens],
                               updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                              [loss_eval, corr_eval, num_tokens, prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    # Python 2 integer division: only used for the time-left estimate.
    num_batches = num_data / batch_size
    num_epochs = 1000
    # num_epochs = 1
    # Best-so-far trackers: one set keyed on dev loss, one on the dev
    # selection criterion (macro F1 or accuracy, see useF1 below).
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):

        print 'Epoch %d (learning rate=%.4f, decay rate=%.4f): ' % (epoch, lr, decay_rate)
        train_err = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0  # length of the last progress line, for backspace erase
        train_batches = 0
        for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train,
                                               batch_size=batch_size, shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err, corr, num = train_fn(inputs, targets, masks, char_inputs)
            # Loss is a per-batch mean, so weight it by the batch size.
            train_err += err * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log: erase the previous progress line and rewrite it
            sys.stdout.write("\b" * num_back)
            log_info = 'train: %d/%d loss: %.4f, acc: %.2f%%, time left (estimated): %.2fs' % (
                min(train_batches * batch_size, num_data), num_data,
                train_err / train_inst, train_corr * 100 / train_total, time_left)
            sys.stdout.write(log_info)
            num_back = len(log_info)
        # update training log after each epoch
        assert train_inst == num_data
        sys.stdout.write("\b" * num_back)
        print 'train: %d/%d loss: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err / num_data, train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        test_err_sentence = 0
        my_f1 = {}
        # Flat per-token prediction/target lists for sklearn metrics.
        my_prs = []
        my_trs = []
        for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)

            # print "-->HERE COMES THE PREDS",predictions
            # print predictions.shape,type(predictions);
            # Collect per-token labels up to each sentence's real length
            # (first None in the decoded words marks padding).  The last
            # batch may have fewer than batch_size rows; the IndexError
            # guard skips the out-of-range indices.
            for i in xrange(batch_size):
                try:
                    input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])]
                except IndexError:
                    continue
                if len(input_clear) == 0: continue
                try:
                    dev_size = input_clear.index(None)
                except ValueError:
                    dev_size = my_size
                    # print dev_size
                my_trs += list(targets[i, :dev_size])
                my_prs += list(predictions[i, :dev_size])
                # for j in xrange(len(targets[i,:dev_size])):
            #  pr = predictions[i,j]
            #  tr = targets[i,j]
            #  my_f1[(pr,tr)] = my_f1.get((pr,tr),0)+1
            # input_clear = [word_alphabet.get_instance(y) for y in list(inputs[0,:])]
            # print input_clear
            # my_f1 = f1_score(my_trs,my_prs,average="macro")
            # print [label_alphabet.get_instance(y+1) for y in list(targets[0,:])]
            # print targets[0,:],predictions[0,:],inputs.shape[0],my_f1; sys.exit(1)
            # print err,inputs.shape[0]

            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                utils.output_predictions(predictions, targets, masks, 'tmp/dev%d' % epoch, label_alphabet,
                                         is_flattened=False)

        dev_f1 = f1_score(my_trs, my_prs, average="macro")
        classify_report = metrics.classification_report(my_trs, my_prs)
        print 'dev classify_report'
        print classify_report
        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%, f1: %.4f' % (
            dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total, dev_f1)

        # CHANGE THIS IF NECESSARY
        # MODEL SELECTION ON DEV CRITERION, SE
        useF1 = True
        useLoss = False
        criterion = dev_f1 if useF1 else dev_corr / dev_total

        # Stop counter only advances when BOTH dev loss and the selection
        # criterion got worse; any improvement resets it.
        if best_loss < dev_err and best_acc > criterion:
            stop_count += 1
        else:
            update_loss = False
            update_acc = False
            stop_count = 0
            if best_loss > dev_err:
                update_loss = True
                best_loss = dev_err
                best_epoch_loss = epoch
            if best_acc < criterion:
                update_acc = True
                best_acc = criterion
                best_epoch_acc = epoch
            else:
                # When not selecting on loss, skip the (expensive) test
                # evaluation and checkpoint unless the criterion improved.
                if useLoss == False:
                    continue

            # evaluate on test data when better performance detected
            test_err = 0.0
            test_corr = 0.0
            test_total = 0
            test_inst = 0
            test_err_sentence = 0  # count of exactly-matched sentences
            test_sentences = 0
            # Checkpoint the current (improved) parameters.
            print "#SAVING MODEL"
            np.savez(modelname, *lasagne.layers.get_all_param_values(bi_lstm_cnn_crf))
            test_prs = []
            test_trs = []
            for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test,
                                                   batch_size=batch_size):
                inputs, targets, masks, char_inputs = batch
                err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs)

                # print "-->HERE COMES THE PREDS",predictions
                # print predictions.shape,type(predictions);
                # Same per-token collection as on dev (see comments there).
                for i in xrange(batch_size):
                    try:
                        input_clear = [word_alphabet.get_instance(y) for y in list(inputs[i, :])]
                    except IndexError:
                        continue
                    if len(input_clear) == 0: continue
                    try:
                        test_size = input_clear.index(None)
                    except ValueError:
                        test_size = my_size
                        # print dev_size
                    test_trs += list(targets[i, :test_size])
                    test_prs += list(predictions[i, :test_size])

                # print predictions # SE
                # print "AAA",inputs[0],len(inputs[0]),inputs[0][0]
                # Flatten the whole batch into word / gold-label /
                # predicted-label strings for sentence-level comparison.
                # The +1 offset presumably maps label ids back to alphabet
                # indices (index 0 reserved) — TODO confirm.
                input_clear = [word_alphabet.get_instance(y) for x in inputs for y in
                               list(x)]  # predictions,dir(label_alphabet),label_alphabet.get_instance(4) # SE
                target_clear = [label_alphabet.get_instance(y + 1) for x in targets for y in list(x)]
                target_clear_pred = [label_alphabet.get_instance(y + 1) for x in predictions for y in list(x)]  # SE

                # print my_size
                # comment this out
                # my_size = 652
                # print input_clear; sys.exit(1)
                # print input_clear
                # Sentence-level exact-match evaluation: slice the flat
                # lists back into per-sentence spans of my_size tokens.
                for ii in range(batch_size):
                    Z = input_clear[ii * my_size:(ii + 1) * my_size]
                    if len(Z) == 0: continue
                    try:
                        size = Z.index(None)
                    except ValueError:
                        size = my_size
                        # print size
                    itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size]
                    # NOTE(review): the first assignment is dead — EMPTY is
                    # immediately overwritten on the next line.
                    EMPTY = "EMPTY"
                    EMPTY = "EMPTY_EMPTY"
                    otruth = filter(lambda z: z != EMPTY, target_clear[ii * my_size:(ii + 1) * my_size][:size])
                    opred = filter(lambda z: z != EMPTY, target_clear_pred[ii * my_size:(ii + 1) * my_size][:size])
                    if otruth == opred:
                        test_err_sentence += 1
                        # print "CORRECT",itruth,otruth,opred
                        print "#CORRECT %%%", len(itruth), len(opred), len(otruth)
                        printout(itruth, opred, otruth)
                        print
                    else:
                        print "#WRONG %%%"  # ,itruth,otruth,opred
                        printout(itruth, opred, otruth)
                        print
                    test_sentences += 1

                test_err += err * inputs.shape[0]
                test_corr += corr
                test_total += num
                test_inst += inputs.shape[0]
                if output_predict:
                    utils.output_predictions(predictions, targets, masks, 'tmp/test%d' % epoch, label_alphabet,
                                             is_flattened=False)

            test_f1 = f1_score(test_trs, test_prs, average="macro")
            test_classify_report = metrics.classification_report(test_trs, test_prs)
            print 'label_alphabet'
            for i in range(label_alphabet.size()):
                print i
                print label_alphabet.get_instance(i)
            print 'Epoch %d test classify_report' % epoch
            print test_classify_report
            print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%% f1: %.4f' % (
                test_err / test_inst, test_corr, test_total,
                test_corr * 100 / test_total,
                test_f1), test_err_sentence * 1.0 / test_sentences, test_err_sentence, test_sentences

            if update_loss:
                best_loss_test_err = test_err
                best_loss_test_corr = test_corr
            if update_acc:
                best_acc_test_err = test_err
                best_acc_test_corr = test_corr

        # Early stopping: break after `patience` consecutive epochs in
        # which dev performance failed to improve.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        # (adadelta manages its own effective step size, so skip it there).
        if update_algo != 'adadelta':
            lr = learning_rate / (1.0 + epoch * decay_rate)
            updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum)
            train_fn = theano.function([input_var, target_var, mask_var, char_input_var],
                                       [loss_train, corr_train, num_tokens],
                                       updates=updates)

    # print best performance on test data.
    # NOTE(review): if the `continue` above fired on every epoch (test set
    # never evaluated), test_inst/test_total are unbound here and these
    # prints raise NameError — verify this path.
    logger.info("final best loss test performance (at epoch %d)" % best_epoch_loss)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_loss_test_err / test_inst, best_loss_test_corr, test_total, best_loss_test_corr * 100 / test_total)
    logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc)
    print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
        best_acc_test_err / test_inst, best_acc_test_corr, test_total, best_acc_test_corr * 100 / test_total)
Example #14
0
def main():
    """Evaluate a pre-trained BiLSTM-CNN-CRF sequence-labeling model.

    Despite the training-oriented argument list, this entry point loads a
    saved model (``--mymodel``, a .npz archive of Lasagne parameter arrays),
    rebuilds the network graph, runs prediction over the ``--realtest`` data,
    prints per-sentence predictions via ``printout``, and finally prints
    token-level accuracy (correct, total, ratio).

    NOTE(review): this is Python 2 code (print statements, xrange, filter
    returning lists); it will not run under Python 3 as written.
    """
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune the word embeddings')
    parser.add_argument('--embedding',
                        choices=['word2vec', 'glove', 'senna', 'random'],
                        help='Embedding for words',
                        required=True)
    parser.add_argument('--embedding_dict',
                        default=None,
                        help='path for embedding dict')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters',
                        type=int,
                        default=20,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma',
                        type=float,
                        default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--peepholes',
                        action='store_true',
                        help='Peepholes for LSTM')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help='Embedding for oov word',
                        required=True)
    parser.add_argument('--update',
                        choices=['sgd', 'momentum', 'nesterov', 'adadelta'],
                        help='update algorithm',
                        default='sgd')
    parser.add_argument('--regular',
                        choices=['none', 'l2'],
                        help='regularization for training',
                        required=True)
    parser.add_argument('--dropout',
                        action='store_true',
                        help='Apply dropout layers')
    parser.add_argument('--patience',
                        type=int,
                        default=5,
                        help='Patience for early stopping')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument(
        '--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument(
        '--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument(
        '--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"
    parser.add_argument('--realtest')  # held-out data actually evaluated below
    parser.add_argument('--mymodel')  # path to saved .npz parameter archive

    args = parser.parse_args()

    def construct_input_layer():
        """Build the word-level input side of the network.

        When fine-tuning, the input is a matrix of word indices fed through
        a trainable EmbeddingLayer; otherwise the input is already a tensor
        of pre-computed embedding vectors. Reads input_var, max_length,
        alphabet_size, embedd_dim and embedd_table from the enclosing scope.
        """
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var,
                                                    name='input')
            layer_embedding = lasagne.layers.EmbeddingLayer(
                layer_input,
                input_size=alphabet_size,
                output_size=embedd_dim,
                W=embedd_table,
                name='embedding')
            return layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length,
                                                           embedd_dim),
                                                    input_var=input_var,
                                                    name='input')
            return layer_input

    def construct_char_input_layer():
        """Build the character-level input side of the network.

        Reshapes (batch, sent_len, char_len) character indices to 2-D,
        embeds them, then dimshuffles to (batch*sent_len, char_embedd_dim,
        char_len) so the character CNN can convolve over the char axis.
        """
        layer_char_input = lasagne.layers.InputLayer(shape=(None,
                                                            max_sent_length,
                                                            max_char_length),
                                                     input_var=char_input_var,
                                                     name='char-input')
        # (-1, [2]) collapses batch and sentence dims, keeping the char dim.
        layer_char_input = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')
        layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                          pattern=(0, 2, 1))
        return layer_char_input

    logger = utils.get_logger("BiLSTM-CNN-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    real_test_path = args.realtest
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    num_filters = args.num_filters
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout
    mymodel = args.mymodel
    print "Model is", mymodel, test_path, real_test_path

    # First load: note test_path is deliberately passed for BOTH the test and
    # real-test slots, so alphabets/embedding tables here match the standard
    # splits. Only the non-underscore names from this call are used below.
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, _X_real_test, _Y_real_test, _mask_real_test, \
    embedd_table, label_alphabet, word_alphabet, \
    C_train, C_dev, C_test, _C_real_test, char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                                                              test_path, test_path, oov=oov,
                                                                                              fine_tune=fine_tune,
                                                                                              embedding=embedding,
                                                                                              embedding_path=embedding_path,
                                                                                                           use_character=True)

    # Second load: same corpora but with the actual real_test_path; only the
    # *_real_test arrays (X/Y/mask/C) from this call are consumed below.
    _X_train, _Y_train, _mask_train, _X_dev, _Y_dev, _mask_dev, _X_test, _Y_test, _mask_test, X_real_test, Y_real_test, mask_real_test, \
    _embedd_table, _label_alphabet, _word_alphabet, \
    _C_train, _C_dev, _C_test, C_real_test, _char_embedd_table = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                                                            test_path, real_test_path, oov=oov,fine_tune=fine_tune,
                                                            embedding=embedding,
                                                            embedding_path=embedding_path,use_character=True)

    #print _C_train.shape,_C_dev.shape,_C_test.shape,C_real_test.shape; #sys.exit(1)
    # NOTE(review): the first assignment is immediately overwritten; it looks
    # like a leftover toggle between padded-length constants. MY_MAX_LENGTH
    # is the sentence padding width used to chunk flattened output below.
    my_size = data_processor.MAX_LENGTH_TRAIN
    my_size = data_processor.MY_MAX_LENGTH
    #my_size = data_processor.MAX_LENGTH_DEV
    print "\tMYSIZE", my_size, C_real_test.shape, C_test.shape, C_train.shape
    # -1: alphabet index 0 is the reserved default/<unknown> slot, not a label.
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    if fine_tune:
        # word-index inputs; embedding lookup happens inside the network
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        # pre-embedded inputs: (batch, length, embedd_dim)
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    # word and char tensors must agree on sentence count and padded length
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # construct input and mask layers
    layer_incoming1 = construct_char_input_layer()
    layer_incoming2 = construct_input_layer()

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var,
                                           name='mask')

    # construct bi-rnn-cnn
    num_units = args.num_units

    bi_lstm_cnn_crf = build_BiLSTM_CNN_CRF(layer_incoming1,
                                           layer_incoming2,
                                           num_units,
                                           num_labels,
                                           mask=layer_mask,
                                           grad_clipping=grad_clipping,
                                           peepholes=peepholes,
                                           num_filters=num_filters,
                                           dropout=dropout)
    #    bi_lstm_cnn_crf = None

    logger.info("Network structure: hidden=%d, filter=%d" %
                (num_units, num_filters))

    # compute loss
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    # get outpout of bi-lstm-cnn-crf shape [batch, length, num_labels, num_labels]
    energies_train = lasagne.layers.get_output(bi_lstm_cnn_crf)
    energies_eval = lasagne.layers.get_output(bi_lstm_cnn_crf,
                                              deterministic=True)

    loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    loss_eval = crf_loss(energies_eval, target_var, mask_var).mean()
    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            bi_lstm_cnn_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # token-level correctness, masked to ignore padding positions
    _, corr_train = crf_accuracy(energies_train, target_var)
    corr_train = (corr_train * mask_var).sum(dtype=theano.config.floatX)
    prediction_eval, corr_eval = crf_accuracy(energies_eval, target_var)
    corr_eval = (corr_eval * mask_var).sum(dtype=theano.config.floatX)

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    # adadelta manages its own effective rate, so the base lr is fixed at 1.0
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = lasagne.layers.get_all_params(bi_lstm_cnn_crf, trainable=True)
    updates = utils.create_updates(loss_train,
                                   params,
                                   update_algo,
                                   learning_rate,
                                   momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_train, corr_train, num_tokens],
        updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_eval, corr_eval, num_tokens, prediction_eval])
    # prediction-only variant: no gold targets required
    my_prediction_eval = my_crf_accuracy(energies_eval)
    my_eval_fn = theano.function([input_var, mask_var, char_input_var],
                                 [my_prediction_eval])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s)..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    # NOTE(review): Python 2 integer division; num_batches is unused below.
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_loss = 1e+12
    best_acc = 0.0
    best_epoch_loss = 0
    best_epoch_acc = 0
    best_loss_test_err = 0.
    best_loss_test_corr = 0.
    best_acc_test_err = 0.
    best_acc_test_corr = 0.
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    print "#LOADING MODEL"
    #np.savez("model.npz",*lasagne.layers.get_all_param_values(bi_lstm_cnn_crf))
    # just load the data, see here:
    # https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py
    #try: mymodel = sys.argv[1]
    #except IndexError:
    #    mymodel = "models.npz"
    # restore the saved parameters (arrays stored as arr_0, arr_1, ...)
    with np.load(mymodel) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(bi_lstm_cnn_crf, param_values)
    correct = 0
    total = 0
    print dir(bi_lstm_cnn_crf)
    #print bi_lstm_cnn_crf.predict([1,2,3,4])
    #sys.exit(1)
    print X_real_test.shape, Y_real_test.shape, C_real_test.shape, mask_real_test.shape
    # that's a stupid hack
    #C_real_test = C_real_test[:len(X_real_test)]

    #print X_real_test[0:1]
    #print my_eval_fn(X_real_test[0:1],mask_real_test[0:1],C_real_test[0:1])
    #sys.exit(1)
    for batch in utils.iterate_minibatches(X_real_test,
                                           Y_real_test,
                                           masks=mask_real_test,
                                           char_inputs=C_real_test,
                                           batch_size=batch_size):
        inputs, targets, masks, char_inputs = batch
        #print inputs,targets,masks,char_inputs; sys.exit(1)
        err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                              char_inputs)
        #print predictions # SE
        # flatten the whole batch into token lists; sentences are recovered
        # below by slicing in my_size-sized chunks
        input_clear = [
            word_alphabet.get_instance(y) for x in inputs for y in list(x)
        ]  # predictions,dir(label_alphabet),label_alphabet.get_instance(4) # SE
        # +1: label ids are offset by one relative to alphabet indices
        # (index 0 is the reserved default slot) — presumably; verify against
        # the alphabet construction in data_processor
        target_clear = [
            label_alphabet.get_instance(y + 1) for x in targets
            for y in list(x)
        ]
        target_clear_pred = [
            label_alphabet.get_instance(y + 1) for x in predictions
            for y in list(x)
        ]  # SE
        #print my_size
        # comment this out
        #my_size = 557
        #print input_clear; sys.exit(1)
        for ii in range(batch_size):
            # Z = padded token window for sentence ii of this batch
            Z = input_clear[ii * my_size:(ii + 1) * my_size]
            if len(Z) == 0: continue
            # true sentence length = position of first padding token (None)
            try:
                size = Z.index(None)
            except ValueError:
                size = my_size
            #print size
            itruth = input_clear[ii * my_size:(ii + 1) * my_size][:size]
            # drop "EMPTY" padding labels from gold and predicted sequences
            otruth = filter(
                lambda z: z != "EMPTY",
                target_clear[ii * my_size:(ii + 1) * my_size][:size])
            opred = filter(
                lambda z: z != "EMPTY",
                target_clear_pred[ii * my_size:(ii + 1) * my_size][:size])
            total += len(opred)
            correct += len(
                filter(lambda x: x == True,
                       [otruth[jj] == opred[jj] for jj in xrange(len(opred))]))
            if otruth == opred:
                #test_err_sentence += 1
                #print "CORRECT",itruth,otruth,opred
                #print "#CORRECT %%%"
                printout(itruth, opred, otruth)
                print
            else:
                #print "#WRONG %%%" #itruth,otruth,opred
                printout(itruth, opred, otruth)
                print

    # token-level accuracy over the real test set
    print correct, total, correct * 1.0 / total