def train_and_test(args, print_config):

    assert args.conv_layer_n == len(args.filter_widths) == len(
        args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(
            args.ks)

    # embed_dm must be divisible by 2^(sum of fold_flags),
    # since each folded layer halves the embedding dimension
    assert args.embed_dm % (2**sum(args.fold_flags)) == 0
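    # e.g. with embed_dm = 48 and fold_flags = [1, 1] (illustrative values):
    # 48 % (2 ** 2) == 0, so the check passes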

    ###################
    # get the data    #
    ###################
    datasets = load_data(args.corpus_path)

    train_set_x, train_set_y = datasets[0]
    dev_set_x, dev_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    word2index = datasets[3]
    index2word = datasets[4]
    pretrained_embeddings = datasets[5]

    n_train_batches = train_set_x.get_value(
        borrow=True).shape[0] / args.batch_size
    n_dev_batches = dev_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size
    n_test_batches = test_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size

    train_sent_len = train_set_x.get_value(borrow=True).shape[1]
    possible_labels = set(train_set_y.get_value().tolist())

    if args.use_pretrained_embedding:
        args.embed_dm = pretrained_embeddings.get_value().shape[1]

    ###################################
    # Symbolic variable definition    #
    ###################################
    x = T.imatrix('x')  # the word indices matrix
    y = T.ivector('y')  # the sentiment labels

    batch_index = T.iscalar('batch_index')

    rng = np.random.RandomState(1234)

    ###############################
    # Construction of the network #
    ###############################
    # Layer 1, the embedding layer
    layer1 = WordEmbeddingLayer(
        rng,
        input=x,
        vocab_size=len(word2index),
        embed_dm=args.embed_dm,
        embeddings=(pretrained_embeddings
                    if args.use_pretrained_embedding else None))

    dropout_layers = [layer1]
    layers = [layer1]

    for i in range(args.conv_layer_n):
        fold_flag = args.fold_flags[i]

        # the dropout layer feeding this conv layer;
        # use the i-th dropout rate so it matches the (1 - dropout_rates[i])
        # weight scaling applied to the prediction path below
        dpl = DropoutLayer(input=dropout_layers[-1].output,
                           rng=rng,
                           dropout_rate=args.dropout_rates[i])
        next_layer_dropout_input = dpl.output
        next_layer_input = layers[-1].output

        # for the conv layer
        filter_shape = (args.nkerns[i], (1 if i == 0 else args.nkerns[i - 1]),
                        1, args.filter_widths[i])

        k = args.ks[i]

        print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % (
            args.conv_activation_unit, i + 2, filter_shape, k,
            args.dropout_rates[i], args.norm_w, fold_flag)

        # we build two layers feeding two separate paths:
        # one for training (with dropout applied),
        # the other for prediction (the weight-averaged model)

        dropout_conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_dropout_input,
            filter_shape=filter_shape,
            k=k,
            norm_w=args.norm_w,
            fold=fold_flag,
            activation=args.conv_activation_unit)

        # for prediction
        # sharing weight with dropout layer
        conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_input,
            filter_shape=filter_shape,
            k=k,
            activation=args.conv_activation_unit,
            fold=fold_flag,
            W=dropout_conv_layer.W *
            (1 - args.dropout_rates[i]),  # model averaging
            b=dropout_conv_layer.b)
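        # At prediction time dropout is not applied, so the shared weights are
        # scaled by (1 - dropout_rate) to approximate averaging over the many
        # "thinned" networks sampled during training.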

        dropout_layers.append(dropout_conv_layer)
        layers.append(conv_layer)

    # finally, the output layer,
    # built both with and without dropout
    if sum(args.fold_flags) > 0:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(
            args.fold_flags))
    else:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm
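    # e.g. with nkerns[-1] = 14, ks[-1] = 4, embed_dm = 48 and two folded layers
    # (illustrative values): n_in = 14 * 4 * 48 / (2 ** 2) = 672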

    print "For output layer, n_in = %d, dropout_rate = %f" % (
        n_in, args.dropout_rates[-1])

    dropout_output_layer = LogisticRegression(
        rng,
        input=dropout_layers[-1].output.flatten(2),
        n_in=n_in,  # already divided by 2^(number of folded layers) above
        n_out=len(possible_labels)  # e.g. five sentiment levels
    )

    output_layer = LogisticRegression(
        rng,
        input=layers[-1].output.flatten(2),
        n_in=n_in,
        n_out=len(possible_labels),
        W=dropout_output_layer.W *
        (1 - args.dropout_rates[-1]),  # sharing the parameters, don't forget
        b=dropout_output_layer.b)

    dropout_layers.append(dropout_output_layer)
    layers.append(output_layer)

    ###############################
    # Error and cost              #
    ###############################
    # note: the cost comes from the dropout model, the error from the averaged model
    dropout_cost = dropout_output_layer.nnl(y)
    errors = output_layer.errors(y)

    def prepare_L2_sqr(param_layers, L2_regs):
        assert len(L2_regs) == len(param_layers)
        return T.sum([
            L2_reg / 2 *
            ((layer.W if hasattr(layer, "W") else layer.embeddings)**2).sum()
            for L2_reg, layer in zip(L2_regs, param_layers)
        ])

    L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs)
    L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:])

    if args.use_L2_reg:
        cost = dropout_cost + L2_sqr
        cost_no_ebd = dropout_cost + L2_sqr_no_ebd
    else:
        cost = dropout_cost
        cost_no_ebd = dropout_cost

    ###############################
    # Parameters to be used       #
    ###############################
    print "Delay embedding learning by %d epochs" % (
        args.embedding_learning_delay_epochs)

    print "param_layers: %r" % dropout_layers
    param_layers = dropout_layers

    ##############################
    # Parameter Update           #
    ##############################
    print "Using AdaDelta with rho = %f and epsilon = %f" % (args.rho,
                                                             args.epsilon)

    params = [param for layer in param_layers for param in layer.params]
    param_shapes = [
        param for layer in param_layers for param in layer.param_shapes
    ]

    param_grads = [T.grad(cost, param) for param in params]

    # AdaDelta parameter update
    # E[g^2]
    # initialized to zero
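    # The recurrences implemented below are:
    #   E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
    #   dx_t      = -sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2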
    egs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Eg:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    # E[\delta x^2], initialized to zero
    exs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Ex:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    new_egs = [
        args.rho * eg + (1 - args.rho) * g**2
        for eg, g in zip(egs, param_grads)
    ]

    delta_x = [
        -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g
        for new_eg, ex, g in zip(new_egs, exs, param_grads)
    ]

    new_exs = [
        args.rho * ex + (1 - args.rho) * (dx**2)
        for ex, dx in zip(exs, delta_x)
    ]

    egs_updates = zip(egs, new_egs)
    exs_updates = zip(exs, new_exs)
    param_updates = [(p, p + dx)
                     for dx, g, p in zip(delta_x, param_grads, params)]

    updates = egs_updates + exs_updates + param_updates

    # updates WITHOUT embedding
    # exclude the embedding parameter
    egs_updates_no_ebd = zip(egs[1:], new_egs[1:])
    exs_updates_no_ebd = zip(exs[1:], new_exs[1:])
    param_updates_no_ebd = [
        (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:]
    ]
    updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd
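    # note: the [1:] slicing assumes the embedding matrix is the single
    # parameter contributed by param_layers[0] (the WordEmbeddingLayer),
    # so dropping the first entry excludes exactly the embedding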

    def make_train_func(cost, updates):
        return theano.function(
            inputs=[batch_index],
            outputs=[cost],
            updates=updates,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })

    train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb)
    train_model = make_train_func(cost, updates)

    def make_error_func(x_val, y_val):
        return theano.function(
            inputs=[],
            outputs=errors,
            givens={
                x: x_val,
                y: y_val
            },
        )

    dev_error = make_error_func(dev_set_x, dev_set_y)

    test_error = make_error_func(test_set_x, test_set_y)

    #############################
    # Debugging purpose code    #
    #############################
    # PARAMETER TUNING NOTE:
    # some diagnostics for demonstrating the gradient vanishing problem

    train_data_at_index = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
    }

    train_data_at_index_with_y = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
        y:
        train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size]
    }

    if print_config["nnl"]:
        get_nnl = theano.function(
            inputs=[batch_index],
            outputs=dropout_cost,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })

    if print_config["L2_sqr"]:
        get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr)

        get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd)

    if print_config["grad_abs_mean"]:
        print_grads = theano.function(
            inputs=[],
            outputs=[
                theano.printing.Print(param.name)(T.mean(T.abs_(param_grad)))
                for param, param_grad in zip(params, param_grads)
            ],
            givens={
                x: train_set_x,
                y: train_set_y
            })

    activations = [l.output for l in dropout_layers[1:-1]]
    weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]]

    if print_config["activation_hist"]:
        # turn into 1D array
        get_activations = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_hist"]:
        # turn into 1D array
        get_weight_grads = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in weight_grads],
            givens=train_data_at_index_with_y)

    if print_config["activation_tracking"]:
        # get the mean and variance of activations for each conv layer

        get_activation_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(val) for val in activations],
            givens=train_data_at_index)

        get_activation_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(val) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_tracking"]:
        # get the mean and variance of activations for each conv layer
        get_weight_grad_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

        get_weight_grad_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

    # the training loop
    patience = args.patience  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    validation_frequency = min(n_train_batches, patience / 2)
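    # validate on the dev set after this many minibatches;
    # with the min() this happens at least once per epoch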

    best_validation_loss = np.inf
    best_iter = 0

    start_time = time.clock()
    done_looping = False
    epoch = 0

    nnls = []
    L2_sqrs = []

    activation_means = [[] for i in range(args.conv_layer_n)]
    activation_stds = [[] for i in range(args.conv_layer_n)]
    weight_grad_means = [[] for i in range(args.conv_layer_n)]
    weight_grad_stds = [[] for i in range(args.conv_layer_n)]
    activation_hist_data = [[] for i in range(args.conv_layer_n)]
    weight_grad_hist_data = [[] for i in range(args.conv_layer_n)]

    train_errors = []
    dev_errors = []
    try:
        print "validation_frequency = %d" % validation_frequency
        while (epoch < args.n_epochs):
            epoch += 1
            print "At epoch {0}".format(epoch)

            if epoch == (args.embedding_learning_delay_epochs + 1):
                print "########################"
                print "Start training embedding"
                print "########################"

            # shuffle the training data
            train_set_x_data = train_set_x.get_value(borrow=True)
            train_set_y_data = train_set_y.get_value(borrow=True)

            permutation = np.random.permutation(
                train_set_x.get_value(borrow=True).shape[0])

            train_set_x.set_value(train_set_x_data[permutation])
            train_set_y.set_value(train_set_y_data[permutation])
            for minibatch_index in range(n_train_batches):
                if epoch >= (args.embedding_learning_delay_epochs + 1):
                    train_cost = train_model(minibatch_index)
                else:
                    train_cost = train_model_no_ebd(minibatch_index)

                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:

                    # train_error_val = np.mean([train_error(i)
                    #                            for i in range(n_train_batches)])
                    dev_error_val = dev_error()

                    # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %(
                    #     epoch,
                    #     minibatch_index,
                    #     train_error_val * 100,
                    #     dev_error_val * 100
                    # )

                    print "At epoch %d and minibatch %d. \nDev error %.2f%%\n" % (
                        epoch, minibatch_index, dev_error_val * 100)

                    # train_errors.append(train_error_val)
                    dev_errors.append(dev_error_val)

                    if dev_error_val < best_validation_loss:
                        best_iter = iter
                        #improve patience if loss improvement is good enough
                        if dev_error_val < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = dev_error_val

                        test_error_val = test_error()

                        print(('     epoch %i, minibatch %i/%i, test error of'
                               ' best dev error %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_error_val * 100.))

                        print "Dumping model to %s" % (args.model_path)
                        dump_params(params, args.model_path)

                if (minibatch_index +
                        1) % 50 == 0 or minibatch_index == n_train_batches - 1:
                    print "%d / %d minibatches completed" % (
                        minibatch_index + 1, n_train_batches)
                    if print_config["nnl"]:
                        print "`nnl` for the past 50 minibatches is %f" % (
                            np.mean(np.array(nnls)))
                        nnls = []
                    if print_config["L2_sqr"]:
                        print "`L2_sqr` for the past 50 minibatches is %f" % (
                            np.mean(np.array(L2_sqrs)))
                        L2_sqrs = []

                ##################
                # Plotting stuff #
                ##################
                if print_config["nnl"]:
                    nnl = get_nnl(minibatch_index)
                    # print "nll for batch %d: %f" %(minibatch_index, nnl)
                    nnls.append(nnl)

                if print_config["L2_sqr"]:
                    if epoch >= (args.embedding_learning_delay_epochs + 1):
                        L2_sqrs.append(get_L2_sqr())
                    else:
                        L2_sqrs.append(get_L2_sqr_no_ebd())

                if print_config["activation_tracking"]:
                    layer_means = get_activation_mean(minibatch_index)
                    layer_stds = get_activation_std(minibatch_index)
                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            activation_means, activation_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["weight_grad_tracking"]:
                    layer_means = get_weight_grad_mean(minibatch_index)
                    layer_stds = get_weight_grad_std(minibatch_index)

                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            weight_grad_means, weight_grad_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["activation_hist"]:
                    for layer_hist, layer_data in zip(
                            activation_hist_data,
                            get_activations(minibatch_index)):
                        layer_hist += layer_data.tolist()

                if print_config["weight_grad_hist"]:
                    for layer_hist, layer_data in zip(
                            weight_grad_hist_data,
                            get_weight_grads(minibatch_index)):
                        layer_hist += layer_data.tolist()

    except:
        import traceback
        traceback.print_exc(file=sys.stdout)
    finally:
        from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt)

        if print_config["activation_tracking"]:
            plot_track(activation_means, activation_stds,
                       "activation_tracking")

        if print_config["weight_grad_tracking"]:
            plot_track(weight_grad_means, weight_grad_stds,
                       "weight_grad_tracking")

        if print_config["activation_hist"]:
            plot_hist(activation_hist_data, "activation_hist")

        if print_config["weight_grad_hist"]:
            plot_hist(weight_grad_hist_data, "weight_grad_hist")

        if print_config["error_vs_epoch"]:
            train_errors = [0] * len(dev_errors)
            ax = plot_error_vs_epoch(
                train_errors,
                dev_errors,
                title=('Best dev score: %f %% '
                       ' at iter %i with test error %f %%') %
                (best_validation_loss * 100., best_iter + 1,
                 test_error_val * 100.))
        if not args.task_signature:
            plt.show()
        else:
            plt.savefig("plots/" + args.task_signature + ".png")

    end_time = time.clock()

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_error_val * 100.))

    # save the result
    with open(args.output, "a") as f:
        f.write("%s\t%f\t%f\n" %
                (args.task_signature, best_validation_loss, test_error_val))

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
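
The AdaDelta updates in train_and_test above are easier to follow outside of Theano's
symbolic graph. Below is a minimal NumPy sketch of the same recurrences; rho, epsilon
and the toy objective are illustrative choices, not values taken from args.

import numpy as np

def adadelta_step(param, grad, eg, ex, rho=0.95, epsilon=1e-6):
    # E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    eg = rho * eg + (1 - rho) * grad ** 2
    # dx_t = -sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    dx = -(np.sqrt(ex + epsilon) / np.sqrt(eg + epsilon)) * grad
    # E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
    ex = rho * ex + (1 - rho) * dx ** 2
    return param + dx, eg, ex

# usage: minimize f(w) = 0.5 * ||w||^2, whose gradient is simply w
w = np.array([1.0, -2.0])
eg, ex = np.zeros_like(w), np.zeros_like(w)
for _ in range(200):
    w, eg, ex = adadelta_step(w, w, eg, ex)
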
Example #3
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
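
A quick sanity check of the shape arithmetic in the comments of evaluate_lenet5
(28 -> 24 -> 12 for layer0, then 12 -> 8 -> 4 for layer1). The helper below is a
sketch introduced here for illustration; it assumes 'valid' convolutions and
non-overlapping 2x2 max-pooling, as the comments do.

def conv_pool_out_size(in_size, filter_size, pool_size):
    # 'valid' convolution shrinks the map, pooling then downsamples it
    conv_out = in_size - filter_size + 1
    return conv_out // pool_size

size = 28
for filter_size in (5, 5):
    size = conv_pool_out_size(size, filter_size, 2)
    print size  # prints 12, then 4
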
Example #5
    def __init__(self, x, y, batch_size, videos, kernels, pools, n_input, n_output, hidden_input, params=None):
        learning_rate = 0.1
        rng = numpy.random.RandomState(1234)

        print '... building the model'
        sys.stdout.flush()

        if not params:
            # Construct the first convolutional pooling layer
            layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0], videos[0], pools[0],
                               batch_size, 'L0', rng)

            layer1 = ConvLayer(layer0.output, n_input[1], n_output[1], kernels[1], videos[1], pools[1],
                               batch_size, 'L1', rng)

            layer2_input = layer1.output.flatten(2)

            # construct a fully-connected sigmoidal layer
            layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                                 n_out=batch_size, activation=T.tanh)

            # classify the values of the fully-connected sigmoidal layer
            layer3 = LogisticRegression(input=layer2.output, n_in=batch_size, n_out=2)
        else:

            layer0 = ConvLayer(x, n_input[0], n_output[0], kernels[0], videos[0], pools[0],
                               batch_size, 'L0', rng, True, params[6], params[7])

            layer1 = ConvLayer(layer0.output, n_input[1], n_output[1], kernels[1], videos[1], pools[1],
                               batch_size, 'L1', rng, True, params[4], params[5])

            layer2_input = layer1.output.flatten(2)

            # construct a fully-connected sigmoidal layer
            layer2 = HiddenLayer(rng, input=layer2_input, n_in=hidden_input,
                                 n_out=batch_size, activation=T.tanh, W=params[2], b=params[3])

            # classify the values of the fully-connected sigmoidal layer
            layer3 = LogisticRegression(input=layer2.output, n_in=batch_size, n_out=2, W=params[0], b=params[1])

        # the cost we minimize during training is the NLL of the model
        cost = layer3.negative_log_likelihood(y)

        # create a list of all model parameters to be fit by gradient descent
        self.params = layer3.params + layer2.params + layer1.params + layer0.params

        # create a list of gradients for all model parameters
        grads = T.grad(cost, self.params)

        # train_model is a function that updates the model parameters by
        # SGD. Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i],grads[i]) pairs.
        updates = []
        for param_i, grad_i in zip(self.params, grads):
            updates.append((param_i, param_i - learning_rate * grad_i))

        self.train_model = theano.function([x, y], cost, updates=updates)
        self.validate_model = theano.function(inputs=[x, y], outputs=layer3.errors(y))
        self.predict = theano.function(inputs=[x], outputs=layer3.y_pred)

        print '... building done'
        sys.stdout.flush()
Example #6
np_l = LogisticRegression(W, b)

#########################
# THEANO PART
#########################

x_symbol = theano.tensor.dmatrix('x')
y_symbol = theano.tensor.ivector('y')

th_l = TheanoLogisticRegression(rng=np.random.RandomState(1234),
                                input=x_symbol,
                                n_in=10,
                                n_out=5,
                                W=theano.shared(value=W, name="W"),
                                b=theano.shared(value=b, name="b"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=th_l.nnl(y_symbol))

actual = np_l.nnl(x, y)
expected = f1(x, y)

assert_matrix_eq(actual, expected, "nnl")

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=th_l.errors(y_symbol))

actual = np_l.errors(x, y)
expected = f2(x, y)

assert_matrix_eq(actual, expected, "errors")
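
Example #6 checks a NumPy logistic regression against the Theano one. Assuming nnl
is the mean negative log-likelihood and errors is the mean zero-one loss (the usual
meaning of those names, not something the listing itself confirms), the NumPy side
reduces to roughly the following sketch:

import numpy as np

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)  # subtract the row max for stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def nnl(W, b, x, y):
    # mean negative log-likelihood of the true labels
    p_y_given_x = softmax(np.dot(x, W) + b)
    return -np.mean(np.log(p_y_given_x[np.arange(len(y)), y]))

def errors(W, b, x, y):
    # mean zero-one loss
    y_pred = np.argmax(np.dot(x, W) + b, axis=1)
    return np.mean(y_pred != y)
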
Example #7
n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX)

layer3 = LogisticRegression(rng=rng,
                            input=layer2.output.flatten(2),
                            n_in=n_in,
                            n_out=n_out,
                            W=theano.shared(value=W_logreg, name="W_logreg"),
                            b=theano.shared(value=b_logreg, name="b_logreg"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol))

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=layer3.errors(y_symbol))

f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)

f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)

f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
# NUMPY PART            #
#########################


class Params(object):
    pass