Example no. 1
def evaluate_mnist_1(learning_rate=0.1,
                     n_epochs=100,
                     nkerns=[4, 6],
                     batch_size=2):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(3)
    xs = []
    ys = []
    # f = open('temp_value', 'r+')
    # f = open('out_10', 'r+')
    f = open('out_10_10', 'r+')

    while (1):
        line = f.readline()
        line2 = f.readline()
        if not line:
            break
        line = line.replace("\n", "")

        values = [float(i) for i in line.split()]
        value = float(line2)

        xs.append(values)
        ys.append(value)

    print(len(xs))
    print(len(xs[0]))
    print(len(ys))
    # print(ys)
    # print(xs)
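    # NOTE: `shared_dataset` is assumed to be the usual helper from the Theano
    # deep-learning tutorial; a minimal sketch of what it is expected to do:
    #
    #   def shared_dataset(data_xy, borrow=True):
    #       data_x, data_y = data_xy
    #       shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
    #                                borrow=borrow)
    #       shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
    #                                borrow=borrow)
    #       # the labels are used as int indices, hence the cast
    #       return shared_x, T.cast(shared_y, 'int32')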

    test_set_x, test_set_y = shared_dataset([xs, ys])
    valid_set_x, valid_set_y = shared_dataset([xs, ys])
    train_set_x, train_set_y = shared_dataset([xs, ys])

    # train_set_x, train_set_y = datasets[0]
    # valid_set_x, valid_set_y = datasets[1]
    # test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = len(ys)
    # batch_size=1
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
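    # with batch_size reset to len(ys) above, each of these counts is 1:
    # the whole (identical) train/valid/test set forms a single batch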
    # n_train_batches = 1
    # n_valid_batches = 1
    # n_test_batches = 1

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4)
    layer2_input = layer1.output.flatten(2)
    # myprint=theano.function([x],x)
    # myprint([layer2_input])

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=20,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    prob = layer3.prob_y_given_x(y)

    f1 = open('weights', 'w+')
    print "layer 0 weights"
    for w in layer0.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    # print layer0.W.get_value()
    # print layer0.b.get_value()
    print "layer 1 weights"
    # print layer1.W.get_value()
    # print layer1.b.get_value()
    for w in layer1.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    print "layer 2 weights"
    # print layer2.W.get_value()
    w = layer2.W.get_value()
    # for d in w:
    #     print d
    for i in range(len(w[0])):
        for j in range(len(w)):
            f1.write(str(w[j][i]) + '\n')
    # print layer2.b.get_value()

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    prob_model = theano.function(
        [index],
        prob,
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    conv_model0 = theano.function(
        [index],
        layer0.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model0_conv = theano.function(
        [index],
        layer0.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    conv_model1 = theano.function(
        [index],
        layer1.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model1_conv = theano.function(
        [index],
        layer1.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model2 = theano.function(
        [index],
        layer2.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # params = layer0.params + layer1.params + layer2.params + layer3.params

    # x_printed = theano.printing.Print('this is a very important value')(x)
    # f_with_print = theano.function([x], x_printed)
    # f_with_print(layer3.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    val_grads = T.grad(cost, layer3.p_y_given_x)
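    # val_grads is the gradient of the cost w.r.t. the softmax outputs
    # p_y_given_x; it is only evaluated and printed below, never used in the
    # parameter updates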
    # print "AAAA"
    # theano.printing.debugprint(temp_grads)
    # print "AAAA"

    grad_model = theano.function(
        [index],
        grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    val_grad_model = theano.function(
        [index],
        val_grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []

    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    bestConvW = layer0.W.get_value()

    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index
            val_grads_ij = val_grad_model(minibatch_index)
            grads_ij = grad_model(minibatch_index)
            conv0_ij = conv_model0(minibatch_index)
            conv1_ij = conv_model1(minibatch_index)
            conv2_ij = conv_model2(minibatch_index)
            conv0_conv_ij = conv_model0_conv(minibatch_index)
            conv1_conv_ij = conv_model1_conv(minibatch_index)

            print 'training @ iter = ', iter
            print "last layer var grads"
            print val_grads_ij[0]

            # print "Layer 0 convolution"
            # for c in conv0_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            # print "Layer 1 convolution"
            # for c in conv1_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            probs = prob_model(minibatch_index)
            print "Probs"
            print probs
            # print "layer 0 grads"
            # print grads_ij[6]
            # print grads_ij[7]
            # print "layer 1 grads"
            # print grads_ij[4]
            # print grads_ij[5]
            # print "layer 2 grads"
            # print grads_ij[2]
            # print grads_ij[3]
            print "log reg layer grads"
            print grads_ij[0]
            print grads_ij[1]
            print "Layer 0 output"
            # for c in conv0_ij:
            #     for d in c:
            #         print d
            # print conv0_ij[0][0]
            print "Layer 1 output"
            # print conv1_ij[0][0]
            # for c in conv1_ij:
            #     for d in c:
            #         print d
            print "Layer 2 output"
            # for c in conv2_ij:
            #     print c
            cost_ij = train_model(minibatch_index)

            # for c in conv0_conv_ij[1]:
            #     print c
            #     print ""

            print "learning_rate"
            print learning_rate
            print "layer 0 weights"
            # print layer0.W.get_value()
            # print layer0.b.get_value()
            print "layer 1 weights"
            # print layer1.W.get_value()
            # print layer1.b.get_value()
            print "layer 2 weights"
            w = layer2.W.get_value()
            # print w[0]
            # print w[1]

            # for c in layer2.W.get_value():
            #     print c
            # print layer2.b.get_value()
            print "log reg layer weights"
            print layer3.W.get_value()
            print layer3.b.get_value()
            print "COST"
            print cost_ij

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestConvW = layer0.W.get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % (epoch, minibatch_index + 1,
                                           n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example no. 2
    def __init__(self, D, M, Q, Domain_number, Hiddenlayerdim1,
                 Hiddenlayerdim2):

        self.Xlabel = T.matrix('Xlabel')

        self.X = T.matrix('X')
        N = self.X.shape[0]

        self.Weight = T.matrix('Weight')

        ker = kernel(Q)
        #mmd=MMD(M,Domain_number)
        mu_value = np.random.randn(M, D) * 1e-2
        Sigma_b_value = np.zeros((M, M))  # + np.log(0.01)

        Z_value = np.random.randn(M, Q)

        ls_value = np.zeros(Domain_number) + np.log(0.1)

        self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
        self.Sigma_b = theano.shared(value=Sigma_b_value,
                                     name='Sigma_b',
                                     borrow=True)
        self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
        self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

        self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

        self.hiddenLayer_x = HiddenLayer(rng=rng,
                                         input=self.X,
                                         n_in=D,
                                         n_out=Hiddenlayerdim1,
                                         activation=T.nnet.relu,
                                         number='_x')
        #self.hiddenLayer_hidden = HiddenLayer(rng=rng,input=self.hiddenLayer_x.output,n_in=Hiddenlayerdim1,n_out=Hiddenlayerdim2,activation=T.nnet.relu,number='_h')
        self.hiddenLayer_m = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_x.output,
                                         n_in=Hiddenlayerdim1,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_m')
        self.hiddenLayer_S = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_x.output,
                                         n_in=Hiddenlayerdim1,
                                         n_out=Q,
                                         activation=T.nnet.relu,
                                         number='_S')

        self.loc_params = []
        self.loc_params.extend(self.hiddenLayer_x.params)
        #self.loc_params.extend(self.hiddenLayer_hidden.params)
        self.loc_params.extend(self.hiddenLayer_m.params)
        self.loc_params.extend(self.hiddenLayer_S.params)

        self.local_params = {}
        for i in self.loc_params:
            self.local_params[str(i)] = i

        self.params.extend(ker.params)
        #self.params.extend(mmd.params)

        self.hyp_params = {}
        for i in [self.mu, self.Sigma_b, self.ls]:
            self.hyp_params[str(i)] = i

        self.Z_params = {}
        for i in [self.Z]:
            self.Z_params[str(i)] = i

        self.global_params = {}
        for i in self.params:
            self.global_params[str(i)] = i

        self.params.extend(self.hiddenLayer_x.params)
        #self.params.extend(self.hiddenLayer_hidden.params)
        self.params.extend(self.hiddenLayer_m.params)
        self.params.extend(self.hiddenLayer_S.params)

        self.wrt = {}
        for i in self.params:
            self.wrt[str(i)] = i

        m = self.hiddenLayer_m.output
        S_0 = self.hiddenLayer_S.output
        S_1 = T.exp(S_0)
        S = T.sqrt(S_1)
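        # hiddenLayer_S outputs a log-variance: exp() gives the variance and
        # sqrt() the standard deviation used in the reparameterisation below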

        from theano.tensor.shared_randomstreams import RandomStreams
        srng = RandomStreams(seed=234)
        eps_NQ = srng.normal((N, Q))
        eps_M = srng.normal((M, D))  # the mean and the variance need different random draws, so they are defined separately
        eps_ND = srng.normal((N, D))

        beta = T.exp(self.ls)
        # u is not diagonal, so a triangular matrix has to be built, e.g. via a Cholesky-style factorisation

        Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) +
                       T.diag(T.exp(T.diag(self.Sigma_b))))
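        # keep the strictly lower triangle of Sigma_b and exponentiate its
        # diagonal, yielding a lower-triangular factor with a positive diagonal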

        # rescale by the kernel signal amplitude (ker.sf2 ** 0.5)
        mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

        Xtilda = m + S * eps_NQ
        self.U = mu_scaled + Sigma_scaled.dot(eps_M)

        Kmm = ker.RBF(self.Z)
        #Kmm=mmd.MMD_kenel_Xonly(mmd.Zlabel_T,Kmm,self.Weight)
        KmmInv = sT.matrix_inverse(Kmm)

        Kmn = ker.RBF(self.Z, Xtilda)
        #Kmn=mmd.MMD_kenel_ZX(self.Xlabel,Kmn,self.Weight)

        Knn = ker.RBF(Xtilda)
        #Knn=mmd.MMD_kenel_Xonly(self.Xlabel,Knn,self.Weight)

        Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
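        # Ktilda = Knn - Kmn^T Kmm^{-1} Kmn, the usual sparse-GP (Nystrom)
        # corrected covariance of the latent function values at Xtilda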

        #F = T.dot(Kmn.T,T.dot(KmmInv,self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5,eps_ND)

        Kinterval = T.dot(KmmInv, Kmn)
        A = Kinterval.T
        Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
        mean_tilda = T.dot(A, mu_scaled)
        #mean_U=F
        #mean_U=T.dot(Kinterval.T,self.U)
        mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)
        betaI = T.diag(T.dot(self.Xlabel, beta))
        Covariance = betaI

        self.LL = self.log_mvn(
            self.X, mean_U, Covariance) / N  # - 0.5*T.sum(T.dot(betaI,Ktilda))
        self.KL_X = -self.KLD_X(m, S)
        self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
Example no. 3
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset=DataSet,
                    nkerns=[cls1, cls2],
                    batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print type(train_set_x)

    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])

    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    #n_test_batches /= batch_size
    n_test_batches = (n_test_batches /
                      batch_size) + (n_test_batches % batch_size > 0)
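    # ceiling division: any remainder examples form one final, smaller test batch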

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    Alr = T.scalar('Alr')
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (27, 10)  # spatial size of each input channel (27 x 10, not MNIST)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the first 540 columns of each input row (2 channels of 27x10
    # rasterized patches) to a 4D tensor, compatible with our LeNetConvPoolLayer
    xinp = x[:, :540]
    layer0_input = xinp.reshape((batch_size, 2, 27, 10))

    # Construct the first convolutional pooling layer:
    # filtering with an (fsx, fsy) kernel reduces each 27x10 map to
    # (27-fsx+1, 10-fsy+1); max-pooling by (p1, p1) reduces this further,
    # so the 4D output tensor has shape (batch_size, nkerns[0], cl2x, cl2y),
    # with cl2x and cl2y computed just below
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 2, 27, 10),
                                filter_shape=(nkerns[0], 2, fsx, fsy),
                                poolsize=(p1, p1))
    cl2x = (27 - fsx + 1) / p1
    cl2y = (10 - fsy + 1) / p1
    # Construct the second convolutional pooling layer:
    # filtering reduces each cl2x x cl2y map to (cl2x-fsx+1, cl2y-fsy+1);
    # max-pooling by (p2, p2) reduces this further, so the 4D output tensor
    # has shape (batch_size, nkerns[1], (cl2x-fsx+1)/p2, (cl2y-fsy+1)/p2)

    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], cl2x,
                                             cl2y),
                                filter_shape=(nkerns[1], nkerns[0], fsx, fsy),
                                poolsize=(p2, p2))
    hl1 = ((cl2x - fsx + 1) / p2) * ((cl2y - fsy + 1) / p2)
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * hl1)
    layer2_input = layer1.output.flatten(2)
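    # append the remaining (non-image) feature columns x[:, 540:] to the
    # flattened convolutional features (hence the "+ 12" in n_in below)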
    layer2_inputT = T.concatenate([layer2_input, x[:, 540:]], axis=1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_inputT,
                         n_in=(nkerns[1] * hl1 * 1) + 12,
                         n_out=nhu1,
                         activation=T.tanh)

    layer22 = HiddenLayer(rng,
                          input=layer2.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer22.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        #updates.append((param_i, param_i - learning_rate * grad_i))
        updates.append((param_i, param_i - Alr * grad_i))

    train_model = theano.function(
        [index, Alr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    #best_params = None
    best_params = []
    best_validation_loss = numpy.inf
    prev_validation_loss = 200

    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    Alrc = 0.1
    AlrE = 0.00001
    epochC = 0
    epoch = 0
    done_looping = False
    for param in params:
        best_params.append(param.get_value())
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index, Alrc)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                lossratio = (this_validation_loss -
                             prev_validation_loss) / (prev_validation_loss + 1)
                print(lossratio)
                print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100., Alrc))

                # if we got the best validation score until now
                #if this_validation_loss < best_validation_loss:
                if lossratio <= 0.0:
                    for i in range(len(params)):
                        best_params[i] = params[i].get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    prev_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm =  test_model(0)

                    yP = numpy.asarray([])
                    test_losses = [
                        test_model(i)[0] for i in xrange(n_test_batches)
                    ]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP, test_model(i)[1]))
                    print yP.shape
                    test_score = numpy.mean(test_losses)

                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
                    y = test_set_y.owner.inputs[0].get_value()
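                    # NOTE: this rebinds the Python name `y`; the compiled
                    # Theano functions are unaffected.  Below, I{a}{b} holds
                    # the indices of test samples with true class a-1 and
                    # predicted class b-1, i.e. per-class confusion counts.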

                    I1 = numpy.nonzero(y == 0.0)
                    I2 = numpy.nonzero(y == 1.0)
                    I3 = numpy.nonzero(y == 2.0)
                    I11 = numpy.nonzero(yP[I1[0]] == 0)
                    I12 = numpy.nonzero(yP[I1[0]] == 1)
                    I13 = numpy.nonzero(yP[I1[0]] == 2)
                    I21 = numpy.nonzero(yP[I2[0]] == 0)
                    I22 = numpy.nonzero(yP[I2[0]] == 1)
                    I23 = numpy.nonzero(yP[I2[0]] == 2)
                    I31 = numpy.nonzero(yP[I3[0]] == 0)
                    I32 = numpy.nonzero(yP[I3[0]] == 1)
                    I33 = numpy.nonzero(yP[I3[0]] == 2)

                    acc1 = float(float(I11[0].size) / float(I1[0].size))
                    acc2 = float(float(I22[0].size) / float(I2[0].size))
                    if n_out == 3:
                        acc3 = float(float(I33[0].size) / float(I3[0].size))
                    else:
                        acc3 = 0
                    print((
                        '     epoch %i, minibatch %i/%i, test error of '
                        'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, I11 = %i, I12 = %i, I13 = %i, I21 = %i, I22 = %i, I23 = %i, I31 = %i, I32 = %i, I33 = %i %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100., acc1 * 100., acc2 * 100.,
                         acc3 * 100, I11[0].size, I12[0].size, I13[0].size,
                         I21[0].size, I22[0].size, I23[0].size, I31[0].size,
                         I32[0].size, I33[0].size))

                    #print(('     epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))
                else:
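                    # no improvement: stop once the learning rate has decayed
                    # below AlrE; otherwise, after more than 40 epochs since
                    # the last halving, halve it and restore the best
                    # parameters found so far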
                    if Alrc <= AlrE:
                        done_looping = True
                        break
                    elif epochC > 40:
                        Alrc = Alrc / 2
                        for param, best_param in zip(params, best_params):
                            param.set_value(best_param)
                        epochC = 0
            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example no. 4
File: cnn.py Project: MartinHua/cnn
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]

    n_train_batches //= batch_size
    n_valid_batches //= batch_size

    index = T.lscalar()

    x = T.matrix('x')
    y = T.ivector('y')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng=rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng=rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    layer2 = HiddenLayer(rng=rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=300,
                         activation=T.tanh)

    layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10)

    cost = layer3.negative_log_likelihood(y)

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    params = layer3.params + layer2.params + layer1.params + layer0.params

    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995

    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:

                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    fo = open('best_cnn_model.pkl', 'wb')
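                    # the pickle stores the Theano shared variables themselves
                    # (parameter values included), so reloading requires Theano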
                    pickle.dump([[layer0.W, layer0.b], [layer1.W, layer1.b],
                                 [layer2.W, layer2.b], [layer3.W, layer3.b]],
                                fo)
                    fo.close()
            if patience <= iter:
                done_looping = True
                break

    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, ' %
          (best_validation_loss * 100., best_iter + 1))
Example no. 5
def sgd_optimization_mnist(learning_rate=2e-2, loss_weight=1.8e+8,
                           curriculum_rate=0.1, n_curriculum_epochs=300,
                           epoch_iters=20, converge=1e-4, minibatch_size=50,
                           batch_size=4, k=4, func='concavefeature',
                           func_parameter=0.5, deep=True):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_curriculum_epochs: int
    :param n_curriculum_epochs: maximal number of curriculum epochs to run
                                the optimizer

    """
    print('loading data...')
    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    labels_, cluster_centers_, center_nn = datasets[3]
    num_cluster = cluster_centers_.shape[0]
    isize = int(numpy.sqrt(train_set_x.get_value(borrow=True).shape[1]))


    # compute number of minibatches for training, validation and testing
    n_train = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = n_train // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('building the model...')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    cindex = T.lvector()  # index to a [mini]batch


    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    if deep is False:

        # construct the logistic regression class
        # Each MNIST image has size 28*28
        classifier = LogisticRegression(input=x, n_in=isize**2, n_out=10)

        # the cost we minimize during training is the negative log likelihood of
        # the model in symbolic format
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # compute the gradient of cost with respect to theta = (W,b)
        g_W = T.grad(cost=cost, wrt=classifier.W)
        g_b = T.grad(cost=cost, wrt=classifier.b)

        # start-snippet-3
        # specify how to update the parameters of the model as a list of
        # (variable, update expression) pairs.
        updates = [(classifier.W, classifier.W - learning_rate * g_W),
                   (classifier.b, classifier.b - learning_rate * g_b)]
    else:

        nfea = 500
        nkerns=[20, 50]
        n_channels = 1
        rng = numpy.random.RandomState(23455)

        layer0_input = x.reshape((-1, 1, isize, isize))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(None, 1, isize, isize),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2)
        )

        isize1 = int((isize - 5 + 1)/2)

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(None, nkerns[0], isize1, isize1),
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            poolsize=(2, 2)
        )

        # the HiddenLayer being fully-connected, it operates on 2D matrices of
        # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
        # This will generate a matrix of shape (batch_size, nkerns[1] * isize2 * isize2),
        # with isize2 computed just below.
        layer2_input = layer1.output.flatten(2)

        isize2 = int((isize1 - 5 + 1)/2)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * isize2 * isize2,
            n_out=nfea,
            activation=T.tanh
        )

        # classify the values of the fully-connected sigmoidal layer
        classifier = LogisticRegression(input=layer2.output, n_in=nfea, n_out=10)

        # the cost we minimize during training is the NLL of the model
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # create a list of all model parameters to be fit by gradient descent
        params = classifier.params + layer2.params + layer1.params + layer0.params

        # create a list of gradients for all model parameters
        grads = T.grad(cost, params)

        # train_model is a function that updates the model parameters by
        # SGD Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i], grads[i]) pairs.
        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[cindex],
        outputs=classifier.errors(y),
        updates=updates,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )

    loss_model = theano.function(
        inputs=[cindex],
        outputs=cost_vec,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )
    error_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('training the model...')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    #validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    #initialize
    minGain, sinGain, optSubmodular = initSubmodularFunc(cluster_centers_, k)
    real_iter = 0
    validation_frequency = 100
    old_epoch_all_loss = float('inf')
    loss_weight0 = loss_weight
    passed_index = numpy.array([])
    passed_index_epoch = numpy.array([]) 
    passes = 0
    output_seq = ()
    for curriculum_epoch in range(n_curriculum_epochs):

        print('Epoch', curriculum_epoch)
        old_all_loss = 0
        for iters in range(epoch_iters):

            if len(passed_index) <= n_train*0.45:
                # compute loss
                loss_vec = loss_model(center_nn) * loss_weight / len(center_nn)
                all_loss = sum(loss_vec)
                #loss_vec_center = numpy.asarray([sum(loss_vec[labels_ == i]) for i in range(num_cluster)])
                loss_vec_center = loss_vec
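                # numpy.partition(a, -k)[-k:] yields the k largest entries
                # (in arbitrary order); their sum is the top-k loss term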
                topkLoss = sum(numpy.partition(loss_vec_center, -k)[-k:])
                optObj = optSubmodular + topkLoss
                print(optSubmodular, topkLoss)

                # update A (topkIndex)
                left_index = pruneGroundSet(minGain, sinGain, loss_vec_center, k)
                topkIndex = modularLowerBound(cluster_centers_[left_index,:], k, func, func_parameter, loss_vec_center[left_index], optObj)
                topkIndex = left_index[topkIndex]

                # update classifier (train_model)           
                train_index = numpy.array([])
                for i in range(len(topkIndex)):
                    train_index = numpy.append(train_index, numpy.where(labels_ == topkIndex[i])[0])
                train_index = numpy.random.permutation(train_index.astype(int))
                print('number of training samples =', len(train_index))
                passes += len(train_index)
                passed_index = numpy.unique(numpy.append(passed_index, train_index))
                passed_index_epoch = numpy.unique(numpy.append(passed_index_epoch, train_index))

            else:

                train_index = numpy.random.permutation(numpy.setxor1d(numpy.arange(n_train), passed_index_epoch).astype(int))
                #train_index = numpy.random.permutation(numpy.arange(n_train).astype(int))
                passes += len(train_index)
                passed_index_epoch = numpy.array([])
                #passed_index = numpy.arange(n_train)
                
            # training by mini-batch sgd
            start_index = 0
            train_loss = numpy.array([])
            while start_index < len(train_index):
                end_index = min([start_index + minibatch_size, len(train_index)])
                batch_index = train_index[start_index : end_index]
                start_index = end_index
                train_loss = numpy.append(train_loss, train_model(batch_index))
            this_train_loss = numpy.mean(train_loss)

            # stop the current epoch if converge
            diff_loss = old_all_loss - all_loss
            if  diff_loss >= 0 and diff_loss <= all_loss * converge:
                break
            # show validation and test error peoriodically
            else:
                old_all_loss = all_loss
                if (iters + real_iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = [validate_model(i)
                                         for i in range(n_valid_batches)]
                    this_validation_loss = numpy.mean(validation_losses)
                    test_losses = [test_model(i)
                                    for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    train_score = [error_model(i) 
                                    for i in range(n_train_batches)]
                    this_train_score = numpy.mean(train_score)

                    print(
                        'minibatch %i, %i trainings, %i passes, trainErr %f %%, validErr %f %%, testErr %f %%' %
                        (
                            iters + real_iter + 1,
                            len(passed_index),
                            passes,
                            this_train_score * 100.,
                            this_validation_loss * 100.,
                            test_score * 100.
                        )
                    )

                    output_seq = output_seq + (numpy.array([len(passed_index),passes,this_train_score * 100.,this_validation_loss * 100.,test_score * 100.]),)

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, (iters + real_iter + 1) * patience_increase)

                        best_validation_loss = this_validation_loss

                        # save the best model
                        with open('best_model.pkl', 'wb') as f:
                            pickle.dump(classifier, f)

        #print('Up to now %i training samples are used'%(len(passed_index)))
        # record total number of iterations
        real_iter += iters
        # adjust learning rate
        if all_loss > 1.001 * old_epoch_all_loss:
            print('no improvement: reduce learning rate!')
            learning_rate *= 0.96
        old_epoch_all_loss = all_loss
        # increase curriculum rate
        loss_weight *= curriculum_rate + 1

        if patience <= iters + real_iter + 1:
            break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    #print('The code run for %d epochs, with %f epochs/sec' % (
        #epoch, 1. * epoch / (end_time - start_time)))
    #print(('The code for file ' +
           #os.path.split(__file__)[1] +
           #' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)
    output_seq = numpy.vstack(output_seq)
    return output_seq
Example no. 6
def test_dA_joint(learning_rate=0.01,
                  training_epochs=15000,
                  dataset='mnist.pkl.gz',
                  batch_size=5,
                  output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNosing
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the picked dataset

    """
    ##datasets = load_data(dataset)
    #from SdA_mapping import load_data_half
    #datasets = load_data_half(dataset)
    print 'loading data'
    datasets, x_mean, y_mean, x_std, y_std = load_vc()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print 'loaded data'

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x1 = T.matrix('x1')  # the data is presented as rasterized images
    x2 = T.matrix('x2')  # the data is presented as rasterized images
    cor_reg = T.scalar('cor_reg')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    #da = dA_joint(
    #numpy_rng=rng,
    #theano_rng=theano_rng,
    #input1=x1,
    #input2=x2,

    #n_visible1=28 * 28/2,
    #n_visible2=28 * 28/2,

    #n_hidden=500
    #)
    print 'initialize functions'

    da = dA_joint(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input1=x1,
        input2=x2,
        cor_reg=cor_reg,

        #n_visible1=28 * 28/2,
        #n_visible2=28 * 28/2,
        n_visible1=24,
        n_visible2=24,
        n_hidden=50)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)
    cor_reg_val = numpy.float32(5.0)
    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x1: train_set_x[index * batch_size:(index + 1) * batch_size],
            x2: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    fprop_x1 = theano.function([],
                               outputs=da.output1,
                               givens={x1: test_set_x},
                               name='fprop_x1')
    fprop_x2 = theano.function([],
                               outputs=da.output2,
                               givens={x2: test_set_y},
                               name='fprop_x2')
    fprop_x1t = theano.function([],
                                outputs=da.output1,
                                givens={x1: train_set_x},
                                name='fprop_x1')
    fprop_x2t = theano.function([],
                                outputs=da.output2,
                                givens={x2: train_set_y},
                                name='fprop_x2')
    rec_x1 = theano.function([],
                             outputs=da.rec1,
                             givens={x1: test_set_x},
                             name='rec_x1')
    rec_x2 = theano.function([],
                             outputs=da.rec2,
                             givens={x2: test_set_y},
                             name='rec_x2')
    fprop_x1_to_x2 = theano.function([],
                                     outputs=da.reg,
                                     givens={x1: test_set_x},
                                     name='fprop_x12x2')
    updates_reg = [(da.cor_reg,
                    da.cor_reg + theano.shared(numpy.float32(0.1)))]
    update_reg = theano.function([], updates=updates_reg)
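    # each call to update_reg() increases the (shared) regularisation weight
    # da.cor_reg by 0.1; it is invoked once per training epoch below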
    print 'initialize functions ended'

    start_time = time.clock()

    ############
    # TRAINING #
    ############
    print 'training started'
    X1 = test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    X2 = test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    from dcca_numpy import cor_cost
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        #cor_reg_val += 1
        #da.cor_reg = theano.shared(cor_reg_val)
        update_reg()

        X1H = rec_x1()
        X2H = rec_x2()
        X1H *= x_std
        X1H += x_mean
        X2H *= y_std
        X2H += y_mean
        H1 = fprop_x1()
        H2 = fprop_x2()
        print 'Training epoch'
        print 'Reconstruction ', numpy.mean(numpy.mean((X1H-X1)**2,1)),\
              numpy.mean(numpy.mean((X2H-X2)**2,1))

        if epoch % 5 == 2:  # pretrain middle layer
            print '... pre-training MIDDLE layer'
            H1t = fprop_x1t()
            H2t = fprop_x2t()
            h1 = T.matrix('x')  # first-view hidden codes (regressor inputs)
            h2 = T.matrix('y')  # second-view hidden codes (regression targets)
            from mlp import HiddenLayer
            numpy_rng = numpy.random.RandomState(89677)
            log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh)
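            # log_reg is a single 50->50 tanh layer used as a regressor: it is
            # trained below (with an MSE cost) to map the first-view codes H1t
            # onto the second-view codes H2t.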

            if 1:  # for middle layer
                learning_rate = 0.1

                #H1=theano.shared(H1)
                #H2=theano.shared(H2)
                # compute the gradients with respect to the model parameters
                logreg_cost = log_reg.mse(h2)

                gparams = T.grad(logreg_cost, log_reg.params)

                # compute list of fine-tuning updates
                updates = [(param, param - gparam * learning_rate)
                           for param, gparam in zip(log_reg.params, gparams)]

                train_fn_middle = theano.function(inputs=[],
                                                  outputs=logreg_cost,
                                                  updates=updates,
                                                  givens={
                                                      h1: theano.shared(H1t),
                                                      h2: theano.shared(H2t)
                                                  },
                                                  name='train_middle')
            # use a separate counter so the outer training-epoch variable is not clobbered
            inner_epoch = 0
            while inner_epoch < 100:
                print inner_epoch, train_fn_middle()
                inner_epoch += 1

            ##X2H=fprop_x1_to_x2()
            X2H = numpy.tanh(H1.dot(log_reg.W.eval()) + log_reg.b.eval())
            X2H = numpy.tanh(X2H.dot(da.W2_prime.eval()) + da.b2_prime.eval())

            X2H *= y_std
            X2H += y_mean
            print 'Regression ', numpy.mean(numpy.mean((X2H - X2)**2, 1))

        print 'Correlation ', cor_cost(H1, H2)
    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((training_time) / 60.))
    image = Image.fromarray(
        tile_raster_images(X=da.W1.get_value(borrow=True).T,
                           img_shape=(28, 14),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    from matplotlib import pyplot as pp
    pp.plot(H1[:10, :2], 'b')
    pp.plot(H2[:10, :2], 'r')
    pp.show()

    print 'Correlation ', cor_cost(H1, H2)
Esempio n. 7
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=247,
                 hidden_layers_sizes=[200],
                 n_outs=100,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(123))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        #self.x = T.vector('x')

        # end-snippet-1

        # The SdA is an MLP whose intermediate-layer weights are shared with
        # a set of denoising autoencoders.
        # We first construct the SdA as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct a denoising
        # autoencoder that shares weights with that layer.
        # During pretraining we train these autoencoders (which changes the
        # weights of the MLP as well); during fine-tuning we finish training
        # the SdA by doing stochastic gradient descent on the MLP.

        # start-snippet-2
        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        # end-snippet-2
        '''
Esempio n. 8
0
 def __init__(self, config, verbose=True):
     '''
     @config: GRCNNConfiger. Configuration object used to set the architecture of the GrCNNEncoder.
     '''
     self.encoder = GrCNNEncoder(config, verbose)
     # Link two parts
     self.input = self.encoder.input
     # Activation function
     self.act = Activation(config.activation)
     # Extract the hierarchical representation, the pyramids, from the encoder
     # Combine the original time series and the compressed time series
     self.pyramids = self.encoder.pyramids
     self.pyramids = T.concatenate([
         self.encoder.hidden0.dimshuffle('x', 0, 1), self.encoder.pyramids
     ])
     self.nsteps = self.pyramids.shape[0]
     # Use another scan function to compress each hierarchical representation
     # into the vector representation
     self.hierarchies, _ = theano.scan(
         fn=self._step_compress,
         sequences=[T.arange(self.nsteps, 0, -1), self.pyramids])
     # Global classifier, MLP, mixture of experts
     self.hidden_layer = HiddenLayer(self.hierarchies,
                                     (config.num_hidden, config.num_mlp),
                                     act=Activation(config.hiddenact))
     # Adding dropout support
     self.hidden = self.hidden_layer.output
     srng = T.shared_randomstreams.RandomStreams(config.random_seed)
     mask = srng.binomial(n=1, p=1 - config.dropout, size=self.hidden.shape)
     self.hidden *= T.cast(mask, floatX)
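     # The binomial mask zeroes each hidden unit with probability
     # config.dropout during training; casting to floatX keeps the product in
     # the same dtype as the rest of the graph.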
     # Connect the hidden layer after dropout to a logistic output layer
     self.output_layer = LogisticLayer(self.hidden, config.num_mlp)
     self.experts = self.output_layer.output
     # Global weighting mechanism, voting weights
     self.weight_layer = theano.shared(
         name='Weighting vector',
         value=np.random.rand(config.num_hidden).astype(floatX))
     self.weights = T.nnet.softmax(
         T.dot(self.hierarchies, self.weight_layer))
     # Compute the total number of parameters in the model
     self.num_params = self.encoder.num_params + self.hidden_layer.num_params + \
                       self.output_layer.num_params + config.num_hidden
     # Final decision, bagging
     self.score = T.sum(T.flatten(self.experts) * T.flatten(self.weights))
     # Prediction for classification
     self.pred = self.score >= 0.5
     # Stack all the parameters
     self.params = []
     self.params += self.encoder.params
     self.params += self.hidden_layer.params
     self.params += self.output_layer.params
     self.params += [self.weight_layer]
     # Build objective function for binary classification problem
     self.truth = T.iscalar(name='label')
     self.cost = -self.truth * T.log((self.score+np.finfo(float).eps) / (1+2*np.finfo(float).eps)) - \
                 (1-self.truth) * T.log((1.0-self.score+np.finfo(float).eps) / (1+2*np.finfo(float).eps))
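     # Binary cross-entropy with epsilon smoothing: adding eps and rescaling by
     # (1 + 2*eps) keeps the argument of each log strictly inside (0, 1), so the
     # cost stays finite even when score hits exactly 0 or 1.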
     ## Weight Decay
     if config.weight_decay:
         self.regularizer = self.encoder.L2_loss() + self.hidden_layer.L2_loss() + \
                            self.output_layer.L2_loss() + T.sum(self.weight_layer ** 2)
         self.regularizer *= config.weight_decay_parameter
         self.cost += self.regularizer
     # Construct gradient vectors
     self.gradparams = T.grad(self.cost, self.params)
     # Construct gradient for the input matrix, fine-tuning
     self.input_grads = T.grad(self.cost, self.input)
     # Build and compile theano functions
     self.predict = theano.function(inputs=[self.input], outputs=self.pred)
     self.bagging = theano.function(inputs=[self.input], outputs=self.score)
     self.compute_gradient_and_cost = theano.function(
         inputs=[self.input, self.truth],
         outputs=self.gradparams + [self.cost, self.pred])
     self.compute_input_gradient = theano.function(
         inputs=[self.input, self.truth], outputs=self.input_grads)
     # Theano functions for debugging purposes
     self.show_weights = theano.function(inputs=[self.input],
                                         outputs=self.weights)
     self.show_scores = theano.function(inputs=[self.input],
                                        outputs=self.experts)
     self.show_hierarchy = theano.function(inputs=[self.input],
                                           outputs=self.hierarchies)
     self.show_prob = theano.function(inputs=[self.input],
                                      outputs=self.score)
     self.show_cost = theano.function(inputs=[self.input, self.truth],
                                      outputs=self.cost)
     if verbose:
         logger.debug('GrCNNBagger built finished...')
         logger.debug(
             'Hierarchical structure of GrCNN for classification...')
         logger.debug('Total number of parameters in the model: %d' %
                      self.num_params)
Esempio n. 9
0
    def __init__(self,
                 numpy_rng=None,
                 useRelu=None,
                 W_distribution=None,
                 LayerNodes=None,
                 dropout=None):

        self.n_layers = (len(LayerNodes) - 2)
        self.dA_layers = []
        self.dropout_layers = []
        self.layers = []
        self.x = T.matrix('x')
        self.y = T.ivector('y')
        next_layer_input = self.x
        next_dropout_layer_input = _dropout_from_layer(numpy_rng,
                                                       self.x,
                                                       p=dropout[0])

        weight_matrix_sizes = zip(LayerNodes, LayerNodes[1:])
        layer_counter = 0

        for n_in, n_out in weight_matrix_sizes[:-1]:
            if useRelu == True:
                activation = relu
                activation1 = relu
                if layer_counter == 0:
                    activation2 = T.nnet.sigmoid
                else:
                    activation2 = T.nnet.softplus
            else:
                activation = T.nnet.sigmoid
                activation1 = T.nnet.sigmoid
                activation2 = T.nnet.sigmoid

            W_bound = 4. * numpy.sqrt(6. / (n_in + n_out))
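            # Glorot-style initialization bound: 4*sqrt(6/(n_in+n_out)), the
            # scaling commonly recommended for sigmoid units.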

            next_dropout_layer = DropoutHiddenLayer(
                numpy_rng=numpy_rng,
                input=next_dropout_layer_input,
                activation=activation,
                n_in=n_in,
                n_out=n_out,
                W_distribution=W_distribution,
                W_bound=W_bound,
                dropout_rate=dropout[layer_counter + 1])
            self.dropout_layers.append(next_dropout_layer)
            next_dropout_layer_input = next_dropout_layer.output

            next_layer = HiddenLayer(numpy_rng=numpy_rng,
                                     input=next_layer_input,
                                     activation=activation,
                                     n_in=n_in,
                                     n_out=n_out,
                                     W=next_dropout_layer.W *
                                     (1 - dropout[layer_counter]),
                                     b=next_dropout_layer.b)
            self.layers.append(next_layer)
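            # The non-dropout layer shares W with its dropout twin, scaled by
            # the keep probability (1 - dropout) so its expected activations
            # match those of the dropout-trained network at test time.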

            dA_layer = dA(numpy_rng=numpy_rng,
                          input=next_layer_input,
                          useRelu=useRelu,
                          activation1=activation1,
                          activation2=activation2,
                          n_visible=n_in,
                          n_hidden=n_out,
                          W=next_dropout_layer.W,
                          b=next_dropout_layer.b)
            self.dA_layers.append(dA_layer)
            next_layer_input = next_layer.output

            if layer_counter == 0:
                self.L1 = abs(next_dropout_layer.W).sum()
                self.L2 = (next_dropout_layer.W**2).sum()
            else:
                self.L1 = self.L1 + abs(next_dropout_layer.W).sum()
                self.L2 = self.L2 + (next_dropout_layer.W**2).sum()
            layer_counter += 1

        n_in, n_out = weight_matrix_sizes[-1]
        dropout_output_layer = LogisticRegression(
            input=next_dropout_layer_input, n_in=n_in, n_out=n_out)

        self.dropout_layers.append(dropout_output_layer)
        self.L1 = self.L1 + abs(dropout_output_layer.W).sum()
        self.L2 = self.L2 + (dropout_output_layer.W**2).sum()

        self.dropout_negative_log_likelihood = self.dropout_layers[
            -1].negative_log_likelihood(self.y)
        output_layer = LogisticRegression(input=next_layer_input,
                                          n_in=n_in,
                                          n_out=n_out,
                                          W=dropout_output_layer.W *
                                          (1 - dropout[-1]),
                                          b=dropout_output_layer.b)

        self.layers.append(output_layer)
        self.error = self.layers[-1].error(self.y)
        self.sensitivity = self.layers[-1].sensitivity(self.y)
        self.specificity = self.layers[-1].specificity(self.y)
        self.class1_pred = self.layers[-1].class1_pred(self.y)
        self.params = [
            param for layer in self.dropout_layers for param in layer.params
        ]
Esempio n. 10
0
 def __init__(self, config=None, verbose=True):
     # Construct two GrCNNEncoders for matching two sentences
     self.encoderL = GrCNNEncoder(config, verbose)
     self.encoderR = GrCNNEncoder(config, verbose)
     # Link two parts
     self.params = []
     self.params += self.encoderL.params
     self.params += self.encoderR.params
     self.inputL = self.encoderL.input
     self.inputR = self.encoderR.input
     # Get output of two GrCNNEncoders
     self.hiddenL = self.encoderL.output
     self.hiddenR = self.encoderR.output
     # Activation function
     self.act = Activation(config.activation)
     # MLP Component
     self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1)
     self.hidden_layer = HiddenLayer(
         self.hidden, (2 * config.num_hidden, config.num_mlp),
         act=Activation(config.hiddenact))
     self.compressed_hidden = self.hidden_layer.output
     # Accumulate parameters
     self.params += self.hidden_layer.params
     # Dropout parameter
     srng = T.shared_randomstreams.RandomStreams(config.random_seed)
     mask = srng.binomial(n=1,
                          p=1 - config.dropout,
                          size=self.compressed_hidden.shape)
     self.compressed_hidden *= T.cast(mask, floatX)
     # Use concatenated vector as input to the logistic regression classifier
     self.logistic_layer = LogisticLayer(self.compressed_hidden,
                                         config.num_mlp)
     self.output = self.logistic_layer.output
     self.pred = self.logistic_layer.pred
     # Accumulate parameters
     self.params += self.logistic_layer.params
     # Compute the total number of parameters in this model
     self.num_params_encoder = config.num_input * config.num_hidden + \
                               config.num_hidden * config.num_hidden * 2 + \
                               config.num_hidden + \
                               config.num_hidden * 3 * 2 + \
                               3
     self.num_params_encoder *= 2
     self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \
                                  config.num_mlp + \
                                  config.num_mlp + 1
     self.num_params = self.num_params_encoder + self.num_params_classifier
     # Build target function
     self.truth = T.ivector(name='label')
     self.learn_rate = T.scalar(name='learning rate')
     self.cost = self.logistic_layer.NLL_loss(self.truth)
     # Build computational graph and compute the gradient of the target function
     # with respect to model parameters
     self.gradparams = T.grad(self.cost, self.params)
     # Updates formula for stochastic descent algorithm
     self.updates = []
     for param, gradparam in zip(self.params, self.gradparams):
         self.updates.append((param, param - self.learn_rate * gradparam))
     # Compile theano function
     self.objective = theano.function(
         inputs=[self.inputL, self.inputR, self.truth], outputs=self.cost)
     self.predict = theano.function(inputs=[self.inputL, self.inputR],
                                    outputs=self.pred)
     # Compute the gradient of the objective function with respect to the model parameters
     self.compute_cost_and_gradient = theano.function(
         inputs=[self.inputL, self.inputR, self.truth],
         outputs=self.gradparams + [self.cost, self.pred])
     # Output function for debugging purpose
     self.show_hidden = theano.function(
         inputs=[self.inputL, self.inputR, self.truth], outputs=self.hidden)
     self.show_compressed_hidden = theano.function(
         inputs=[self.inputL, self.inputR, self.truth],
         outputs=self.compressed_hidden)
     self.show_output = theano.function(
         inputs=[self.inputL, self.inputR, self.truth], outputs=self.output)
     if verbose:
         logger.debug(
             'Architecture of GrCNNMatcher built finished, summarized below: '
         )
         logger.debug('Input dimension: %d' % config.num_input)
         logger.debug('Hidden dimension inside GrCNNMatcher pyramid: %d' %
                      config.num_hidden)
         logger.debug('Hidden dimension of MLP: %d' % config.num_mlp)
         logger.debug('Number of parameters in encoder part: %d' %
                      self.num_params_encoder)
         logger.debug('Number of parameters in classifier part: %d' %
                      self.num_params_classifier)
         logger.debug('Number of total parameters in this model: %d' %
                      self.num_params)
Esempio n. 11
0
    def __init__(self, config=None, verbose=True):
        # Construct two GrCNNEncoders for matching two sentences
        self.encoderL = ExtGrCNNEncoder(config, verbose)
        self.encoderR = ExtGrCNNEncoder(config, verbose)
        # Link the parameters of two parts
        self.params = []
        self.params += self.encoderL.params
        self.params += self.encoderR.params
        # Build three kinds of inputs:
        # 1, inputL, inputR. This pair is used for computing the score after training
        # 2, inputPL, inputPR. This part is used for training positive pairs
        # 3, inputNL, inputNR. This part is used for training negative pairs
        self.inputL = self.encoderL.input
        self.inputR = self.encoderR.input
        # Positive
        self.inputPL = T.matrix(name='inputPL', dtype=floatX)
        self.inputPR = T.matrix(name='inputPR', dtype=floatX)
        # Negative
        self.inputNL = T.matrix(name='inputNL', dtype=floatX)
        self.inputNR = T.matrix(name='inputNR', dtype=floatX)
        # Linking input-output mapping
        self.hiddenL = self.encoderL.output
        self.hiddenR = self.encoderR.output
        # Positive
        self.hiddenPL = self.encoderL.encode(self.inputPL)
        self.hiddenPR = self.encoderR.encode(self.inputPR)
        # Negative
        self.hiddenNL = self.encoderL.encode(self.inputNL)
        self.hiddenNR = self.encoderR.encode(self.inputNR)
        # Activation function
        self.act = Activation(config.activation)
        # MLP Component
        self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1)
        self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=1)
        self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=1)
        # Build hidden layer
        self.hidden_layer = HiddenLayer(
            self.hidden, (2 * config.num_hidden, config.num_mlp),
            act=Activation(config.hiddenact))
        self.compressed_hidden = self.hidden_layer.output
        self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP)
        self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN)
        # Accumulate parameters
        self.params += self.hidden_layer.params
        # Dropout parameter
        srng = T.shared_randomstreams.RandomStreams(config.random_seed)
        mask = srng.binomial(n=1,
                             p=1 - config.dropout,
                             size=self.compressed_hidden.shape)
        maskP = srng.binomial(n=1,
                              p=1 - config.dropout,
                              size=self.compressed_hiddenP.shape)
        maskN = srng.binomial(n=1,
                              p=1 - config.dropout,
                              size=self.compressed_hiddenN.shape)
        self.compressed_hidden *= T.cast(mask, floatX)
        self.compressed_hiddenP *= T.cast(maskP, floatX)
        self.compressed_hiddenN *= T.cast(maskN, floatX)
        # Score layers
        self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp)
        self.output = self.score_layer.output
        self.scoreP = self.score_layer.encode(self.compressed_hiddenP)
        self.scoreN = self.score_layer.encode(self.compressed_hiddenN)
        # Accumulate parameters
        self.params += self.score_layer.params
        # Build cost function
        self.cost = T.mean(
            T.maximum(T.zeros_like(self.scoreP),
                      1.0 - self.scoreP + self.scoreN))
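        # Margin ranking (hinge) loss: each positive pair is pushed to score at
        # least 1.0 higher than the corresponding negative pair; pairs already
        # separated by the margin contribute zero cost.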
        # Construct the gradient of the cost function with respect to the model parameters
        self.gradparams = T.grad(self.cost, self.params)
        # Compute the total number of parameters in the model
        self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params
        self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \
                                     config.num_mlp + \
                                     config.num_mlp + 1
        self.num_params = self.num_params_encoder + self.num_params_classifier
        # Build class methods
        self.score = theano.function(inputs=[self.inputL, self.inputR],
                                     outputs=self.output)
        self.compute_cost_and_gradient = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN])
        self.show_scores = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.scoreP, self.scoreN])
        self.show_hiddens = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.hiddenP, self.hiddenN])
        self.show_inputs = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR])

        if verbose:
            logger.debug(
                'Architecture of ExtGrCNNMatchScorer built finished, summarized below: '
            )
            logger.debug('Input dimension: %d' % config.num_input)
            logger.debug(
                'Hidden dimension inside GrCNNMatchScorer pyramid: %d' %
                config.num_hidden)
            logger.debug('Hidden dimension MLP: %d' % config.num_mlp)
            logger.debug('Number of Gating functions: %d' % config.num_gates)
            logger.debug('There are 2 ExtGrCNNEncoders used in model.')
            logger.debug('Total number of parameters used in the model: %d' %
                         self.num_params)
Esempio n. 12
0
 def __init__(self, config=None, verbose=True):
     self.encoder = GrCNNEncoder(config, verbose)
     # Link two parts
     self.params = self.encoder.params
     self.input = self.encoder.input
     self.hidden = self.encoder.output
     # Activation function
     self.act = Activation(config.activation)
     # MLP Component
     self.hidden_layer = HiddenLayer(self.hidden,
                                     (config.num_hidden, config.num_mlp),
                                     act=Activation(config.hiddenact))
     self.compressed_hidden = self.hidden_layer.output
     # Dropout regularization
     srng = T.shared_randomstreams.RandomStreams(config.random_seed)
     mask = srng.binomial(n=1,
                          p=1 - config.dropout,
                          size=self.compressed_hidden.shape)
     self.compressed_hidden *= T.cast(mask, floatX)
     # Accumulate model parameters
     self.params += self.hidden_layer.params
     # Softmax Component
     self.softmax_layer = SoftmaxLayer(self.compressed_hidden,
                                       (config.num_mlp, config.num_class))
     self.raw_output = self.softmax_layer.output
     self.pred = self.softmax_layer.pred
     self.params += self.softmax_layer.params
     # Compute the total number of parameters in this model
     self.num_params_encoder = config.num_input * config.num_hidden + \
                               config.num_hidden * config.num_hidden * 2 + \
                               config.num_hidden + \
                               config.num_hidden * 3 * 2 + \
                               3
     self.num_params_classifier = config.num_hidden * config.num_mlp + \
                                  config.num_mlp + \
                                  config.num_mlp * config.num_class + \
                                  config.num_class
     self.num_params = self.num_params_encoder + self.num_params_classifier
     # Build target function
     self.truth = T.ivector(name='label')
     self.learn_rate = T.scalar(name='learning rate')
     self.cost = self.softmax_layer.NLL_loss(self.truth)
     # Build computational graph and compute the gradient of the target
     # function with respect to model parameters
     self.gradparams = T.grad(self.cost, self.params)
     # Updates formula for stochastic gradient descent algorithm
     self.updates = []
     for param, gradparam in zip(self.params, self.gradparams):
         self.updates.append((param, param - self.learn_rate * gradparam))
     # Compile theano function
     self.objective = theano.function(inputs=[self.input, self.truth],
                                      outputs=self.cost)
     self.predict = theano.function(inputs=[self.input], outputs=self.pred)
     # Compute the gradient of the objective function with respect to the model parameters
     self.compute_cost_and_gradient = theano.function(
         inputs=[self.input, self.truth],
         outputs=self.gradparams + [self.cost])
     # Output function for debugging purpose
     self.show_hidden = theano.function(inputs=[self.input, self.truth],
                                        outputs=self.hidden)
     self.show_compressed_hidden = theano.function(
         inputs=[self.input, self.truth], outputs=self.compressed_hidden)
     self.show_output = theano.function(inputs=[self.input, self.truth],
                                        outputs=self.raw_output)
     if verbose:
         logger.debug(
             'Architecture of GrCNN built finished, summarized as below: ')
         logger.debug('Input dimension: %d' % config.num_input)
         logger.debug('Hidden dimension inside GrCNNEncoder pyramid: %d' %
                      config.num_hidden)
         logger.debug('Hidden dimension of MLP: %d' % config.num_mlp)
         logger.debug('Number of target classes: %d' % config.num_class)
         logger.debug('Number of parameters in encoder part: %d' %
                      self.num_params_encoder)
         logger.debug('Number of parameters in classifier part: %d' %
                      self.num_params_classifier)
         logger.debug('Number of total parameters in this model: %d' %
                      self.num_params)
Esempio n. 13
0
def test_CNN(learning_rate=0.01,
             L1_reg=0.00,
             L2_reg=0.0001,
             n_epochs=1000,
             batch_size=20,
             n_hidden=500,
             dataset='txtData7.pkl'):
    dataset = load_data(dataset)
    ''' train_set_x.get_value(); tt.shape ---(50000, 784)'''
    train_set_x, train_set_y = dataset[0]
    valid_set_x, valid_set_y = dataset[1]
    test_set_x, test_set_y = dataset[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print('training set has %i batches' % n_train_batches)
    print('validate set has %i batches' % n_valid_batches)
    print('testing set has %i batches' % n_test_batches)

    #symbolic variables
    x = T.matrix()
    y = T.ivector()  #lvector: [long int] labels; ivector:[int] labels
    minibatch_index = T.lscalar()

    print 'build the model...'
    rng = numpy.random.RandomState(23455)

    # transform x from (batch_size, 40*36) to (batch_size, n_features, 40, 36)
    # I_shape = (40, 36), F_shape = (5, 5)
    # First convolution/pooling layer: 20 convolution kernels, so each sample
    # image produces 20 feature maps.
    N_filters_0 = 20
    D_features_0 = 1
    # The input must be a 4D tensor, so a reshape is needed; each minibatch here
    # holds batch_size samples of size 40x36.
    layer0_input = x.reshape((batch_size, D_features_0, 40, 36))
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                filter_shape=(N_filters_0, D_features_0, 5, 5),
                                image_shape=(batch_size, 1, 40, 36))
    # layer0.output: (batch_size, N_filters_0, (40-5+1)/2, (36-5+1)/2) -> (batch_size, 20, 18, 16)
    # Convolution gives 36x32 feature maps, and max pooling then halves them to
    # 18x16, so the output is batch_size samples, each with 20 feature maps of
    # size 18x16. Each convolution multiplies a window by the kernel weights and
    # sums the result into one pixel of a feature map; max pooling is used here
    # to reduce the amount of data passed to the next layer.
    N_filters_1 = 50
    D_features_1 = N_filters_0
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                filter_shape=(N_filters_1, D_features_1, 5, 5),
                                image_shape=(batch_size, N_filters_0, 18, 16))
    # layer1.output: (batch_size, 50, 7, 6)
    # The second layer outputs, for each sample, 50 feature maps of size 7x6;
    # its convolution and pooling work the same way as in layer0.
    # The feature maps are then flattened into one vector per sample, giving a
    # (batch_size, 50*7*6) matrix in which each row represents one sample.

    # (batch_size, 50, 7, 6) -> (batch_size, 50*7*6)
    layer2_input = layer1.output.flatten(2)
    # The fully connected hidden layer maps the 50*7*6 = 2100 flattened
    # features down to 500 units.
    layer2 = HiddenLayer(rng,
                         layer2_input,
                         n_in=50 * 7 * 6,
                         n_out=500,
                         activation=T.tanh)
    # Logistic regression layer: a softmax over the class scores produces the output.
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3)
    # regularization terms

    ##########################
    cost = (layer3.negative_log_likelihood(y) + L1_reg *
            (layer2.L1_1 + layer3.L1_2) + L2_reg * (layer2.L2_1 + layer3.L2_2))

    test_model = theano.function(
        inputs=[minibatch_index],
        outputs=layer3.errors(y),
        givens={
            x:
            test_set_x[minibatch_index * batch_size:(minibatch_index + 1) *
                       batch_size],
            y:
            test_set_y[minibatch_index * batch_size:(minibatch_index + 1) *
                       batch_size]
        })

    valid_model = theano.function(
        inputs=[minibatch_index],
        outputs=layer3.errors(y),
        givens={
            x:
            valid_set_x[minibatch_index * batch_size:(minibatch_index + 1) *
                        batch_size],
            y:
            valid_set_y[minibatch_index * batch_size:(minibatch_index + 1) *
                        batch_size]
        })

    params = layer3.params + layer2.params + layer1.params + layer0.params
    gparams = T.grad(cost, params)

    updates = []
    for par, gpar in zip(params, gparams):
        updates.append((par, par - learning_rate * gpar))

    train_model = theano.function(
        inputs=[minibatch_index],
        outputs=[cost],
        updates=updates,
        givens={
            x:
            train_set_x[minibatch_index * batch_size:(minibatch_index + 1) *
                        batch_size],
            y:
            train_set_y[minibatch_index * batch_size:(minibatch_index + 1) *
                        batch_size]
        })

    #---------------------Train-----------------------#
    print 'training...'

    print('training set has %i batches' % n_train_batches)
    print('validate set has %i batches' % n_valid_batches)
    print('testing set has %i batches' % n_test_batches)

    epoch = 0
    patience = 10000
    patience_increase = 2
    validation_frequency = min(n_train_batches, patience / 2)
    improvement_threshold = 0.995
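    # Early stopping: train for at least `patience` minibatch updates; each time
    # the validation error drops below improvement_threshold times the current
    # best, patience is extended to iter * patience_increase.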

    best_parameters = None
    min_validation_error = numpy.inf
    done_looping = False

    start_time = time.clock()
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in xrange(n_train_batches):

            #cur_batch_train_error,cur_params = train_model(minibatch_index)
            cur_batch_train_error = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                #validation_error = numpy.mean([valid_model(idx) for idx in xrange(n_valid_batches)])
                validation_losses = [
                    valid_model(i) for i in xrange(n_valid_batches)
                ]
                validation_error = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_error * 100.))

                if validation_error < min_validation_error:
                    if validation_error < min_validation_error * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    min_validation_error = validation_error

                    #best_parameters = cur_params
                    best_iter = iter

                    save_params(layer0.params, layer1.params, layer2.params,
                                layer3.params)

                    test_error = numpy.mean(
                        [test_model(idx) for idx in xrange(n_test_batches)])
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_error * 100.))

            if iter >= patience:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (100 - min_validation_error * 100., best_iter + 1,
           100 - test_error * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Esempio n. 14
0
File: DBN.py Progetto: wenmm/DECRES
    def __init__(self,
                 rng,
                 n_in=784,
                 n_hidden=[500, 500],
                 n_out=10,
                 lambda_reg=0.001,
                 alpha_reg=0.001):
        """This class is made to support a variable number of layers.
    
        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                   weights
    
        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_in: int
        :param n_in: dimension of the input to the DBN
    
        :type n_hidden: list of ints
        :param n_hidden: intermediate layers size, must contain
                               at least one value

        :type n_out: int
        :param n_out: dimension of the output of the network
       
        :type lambda_reg: float
        :param lambda_reg: parameter to control the overall strength of the regularization.
         The regularization term is lambda_reg*( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1 ).
         Thus, the larger lambda_reg is, the sparser (and smaller) the weights are.

        :type alpha_reg: float
        :param alpha_reg: parameter from the interval [0,1] balancing the l_1 (sparsity)
         and squared l_2 (smoothness) terms in the regularizer above.
         Thus, the smaller alpha_reg is, the smoother the weights are.
        """

        self.hidden_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(n_hidden)

        assert self.n_layers > 0

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data, each row is a sample
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_in
            else:
                input_size = n_hidden[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the DBN if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.hidden_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=n_hidden[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.hidden_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid layers are parameters of the DBN;
            # the visible biases of the RBMs are parameters of those
            # RBMs, but not of the DBN
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=rng,
                            theano_rng=None,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=n_hidden[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if self.n_layers > 0:
            self.logRegressionLayer = LogisticRegression(
                input=self.hidden_layers[-1].output,
                n_in=n_hidden[-1],
                n_out=n_out)
        else:
            self.logRegressionLayer = LogisticRegression(input=self.x,
                                                         n_in=input_size,
                                                         n_out=n_out)

        self.params.extend(self.logRegressionLayer.params)

        # regularization
        L1s = []
        L2_sqrs = []
        for i in range(self.n_layers):
            L1s.append(abs(self.hidden_layers[i].W).sum())
            L2_sqrs.append((self.hidden_layers[i].W**2).sum())
        L1s.append(abs(self.logRegressionLayer.W).sum())
        L2_sqrs.append((self.logRegressionLayer.W**2).sum())
        self.L1 = T.sum(L1s)
        self.L2_sqr = T.sum(L2_sqrs)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost = self.negative_log_likelihood + \
            lambda_reg * ((1.0 - alpha_reg) * 0.5 * self.L2_sqr + alpha_reg * self.L1)
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred
        self.y_pred_prob = self.logRegressionLayer.y_pred_prob
def evaluate_convnet(learning_rate=0.02,
                     n_epochs=2000,
                     dataset='single_sphere',
                     nkerns=[32, 64, 64, 128],
                     batch_size=128,
                     filter_shapes=[[5, 5], [5, 5], [3, 3], [3, 3]],
                     momentum=0.9,
                     half_time=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: name of the dataset used for training/testing

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets, max_row, max_col = load_char_data(
    )  #load_latline_dataset() # << TODO implement

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.tensor4(
        'x')  # the data is presented as spiking of sensors at lateral line
    y = T.ivector(
        'y')  # The output is the distance (in x- and y-directions) of sphere

    idxs = T.matrix('idxs')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of sensor detections to a 4D tensor
    layer0_input = x  # x.reshape((batch_size, depth_dim, conv_dims[0], conv_dims[1]))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = conv_layer(rng,
                        input=layer0_input,
                        image_shape=(batch_size, 3, max_row, max_col),
                        filter_shape=(nkerns[0], 3, filter_shapes[0][0],
                                      filter_shapes[0][1]),
                        pooling=False,
                        activation=T.nnet.relu)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)

    layer1 = conv_layer(rng,
                        input=layer0.output,
                        image_shape=(batch_size, nkerns[0], max_row, max_col),
                        filter_shape=(nkerns[1], nkerns[0],
                                      filter_shapes[1][0],
                                      filter_shapes[1][1]),
                        pooling=True,
                        poolsize=(2, 2),
                        activation=T.nnet.relu,
                        keepDims=True)

    layer1b = conv_layer(rng,
                         input=layer1.output,
                         image_shape=(batch_size, nkerns[1], max_row, max_col),
                         filter_shape=(nkerns[2], nkerns[1],
                                       filter_shapes[2][0],
                                       filter_shapes[2][1]),
                         pooling=False,
                         activation=T.nnet.relu,
                         keepDims=True)

    layer1c = conv_layer(rng,
                         input=layer1b.output,
                         image_shape=(batch_size, nkerns[2], max_row, max_col),
                         filter_shape=(nkerns[3], nkerns[2],
                                       filter_shapes[3][0],
                                       filter_shapes[3][1]),
                         pooling=False,
                         activation=T.nnet.relu,
                         keepDims=True)

    spp_layer = SPP(layer1c.output, idxs)
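    # SPP is assumed (from its name and usage) to be a spatial-pyramid-pooling
    # layer: it pools layer1c's variable-sized feature maps into a fixed-length
    # vector using the idxs matrix, so the fully connected layer below can use
    # a fixed n_in of spp_layer.M * nkerns[-1].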

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = spp_layer.output

    # construct a fully-connected ReLU layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=spp_layer.M * nkerns[-1],
                         n_out=500,
                         activation=T.nnet.relu)

    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=39)

    # linear regression by using a fully connected layer
    '''layer3 = HiddenLayer(
        rng,
        input=layer2.output,
        n_in=conv_dims[1] * 2,
        n_out=2,
        activation=None
    )
    '''

    # classify the values of the fully-connected sigmoidal layer
    #layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    demo_model = theano.function(
        [index], [layer3.y_pred, y],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params + layer1b.params + layer1c.params

    # create a list of gradients for all model parameters
    #grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    #updates = [
    #    (param_i, param_i - learning_rate * grad_i)
    #    for param_i, grad_i in zip(params, grads)
    #]

    l_r = T.scalar('l_r', dtype=theano.config.floatX)

    updates = gradient_updates_momentum(cost, params, l_r, momentum)
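    # gradient_updates_momentum is a project helper not shown in this snippet.
    # A classical-momentum rule it is assumed to implement looks roughly like:
    #
    #   updates = []
    #   for param in params:
    #       velocity = theano.shared(param.get_value() * 0.)
    #       updates.append((velocity, momentum * velocity - l_r * T.grad(cost, param)))
    #       updates.append((param, param + velocity))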

    train_model = theano.function(
        [index, l_r],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
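        # Step-decay schedule: the learning rate is halved every half_time
        # (default 500) epochs.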
        if epoch % half_time == 0:
            learning_rate /= 2

        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, learning_rate)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation MSE of %f %% obtained at iteration %i, '
          'with test MSE %f ' %
          (best_validation_loss, best_iter + 1, test_score))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)

    demo_outputs = [demo_model(i) for i in range(n_test_batches)]

    sensor_range = [-1.5, 1.5]
    y_range = [0, 1]
    plt.ion()

    plotting = False

    MED = 0
    for i in range(n_test_batches):
        predicted, target = demo_outputs[i]
        for j in range(predicted.shape[0]):
            x_hat, y_hat = predicted[j]
            x, y = target[j]

            MED += numpy.sqrt((x - x_hat)**2 + (y - y_hat)**2)

            if plotting:
                plt.clf()
                plt.plot([x_hat], [y_hat], 'ro')
                plt.plot([x], [y], 'g+')
                plt.grid()
                plt.axis(
                    [sensor_range[0], sensor_range[1], y_range[0], y_range[1]])
                plt.pause(0.05)
    MED /= 2000

    print('MED = %f\n' % MED)
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # the "image" seen by the convolution: embedding dim x max sentence length
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings of each sentence and reshape them into 4D
    # tensors of shape (batch_size, 1, emb_size, maxSentLength), compatible with
    # the convolution layers below.
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
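    # Siamese convolution: the same filters (conv_W, conv_b) are applied to both
    # the left and the right sentence.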
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
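    # Max-over-time pooling of the narrow-convolution feature maps, taken from the
    # first non-padded column onward, giving one nkerns[0]-dim vector per sentence.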
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    # Bag-of-words sentence vectors: sum the word embeddings from the first
    # non-padded position onward and L2-normalize.
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
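    # Feature vector for the first hidden layer: unigram-level similarities
    # (Euclidean-based and cosine), the max-pooled convolution features of both
    # sentences with their similarities and absolute differences, the
    # attention-pooled (layer1) features and similarities, and the flattened
    # attention matrix.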
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    # Size of HL_layer_1_input: two unigram similarities, two max-pool similarities,
    # three nkerns[0]-dim max-pool blocks, two layer1 similarities, three
    # nkerns[0]-dim layer1 vectors, plus the flattened (presumably 10x10) attention matrix.
    HL_layer_1_input_size = 1+1 + 1+1+3*nkerns[0] + 1+1+3*nkerns[0] + 10*10
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
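    # AdaGrad with gradient clipping: gradients are clipped to [-0.5, 0.5], their
    # squares are accumulated per parameter, and each update is scaled by
    # 1/sqrt(accumulated + 1e-10), giving smaller effective learning rates to
    # frequently-updated parameters.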
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter counts how many minibatches have been processed so far (across epochs)
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                #now, see the results of svm
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example n. 17
0
def classify_lenet5(
        learning_rate=0.005,
        n_epochs=8000,
        image_path='D:/dev/datasets/isbi/train-input/train-input_0000.tif',
        paramfile='lenet0_membrane_epoch_25100.pkl.gz',
        nkerns=[20, 50],
        batch_size=1):

    rng = numpy.random.RandomState(23455)

    # allocate symbolic variables for the data
    index_x = T.lscalar()  # index to a [mini]batch
    index_y = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the TanhLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), i.e. (1, 800) with these defaults
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
Example n. 18
0
    def __init__(self, rng, batch_size=200, nkerns=[35, 70, 35], gamma=1e-6):

        x = T.matrix()
        y = T.matrix()
        learning_rate = T.scalar()

        print '... building the model'

        self.batch_size = batch_size
        self.lowestError = 1.

        # Reshape matrix of rasterized images of shape (batch_size, 48 * 48)
        # to a 4D tensor, compatible with our LeNetConvPoolLayer
        # (48, 48) is the size of the input images here.
        layer0_input = x.reshape((batch_size, 1, 48, 48))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (48-9+1, 48-9+1) = (40, 40)
        # maxpooling reduces this further to (40/2, 40/2) = (20, 20)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 20, 20)
        layer0 = LeNetConvPoolLayer(rng,
                                    input=layer0_input,
                                    image_shape=(batch_size, 1, 48, 48),
                                    filter_shape=(nkerns[0], 1, 9, 9),
                                    poolsize=(2, 2))

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (20-5+1, 20-5+1) = (16, 16)
        # maxpooling reduces this further to (16/2, 16/2) = (8, 8)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 8, 8)
        layer1 = LeNetConvPoolLayer(rng,
                                    input=layer0.output,
                                    image_shape=(batch_size, nkerns[0], 20,
                                                 20),
                                    filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                    poolsize=(2, 2))

        # Construct the third convolutional pooling layer
        # filtering reduces the image size to (8-5+1, 8-5+1) = (4, 4)
        # maxpooling reduces this further to (4/2, 4/2) = (2, 2)
        # 4D output tensor is thus of shape (batch_size, nkerns[2], 2, 2)
        layer2 = LeNetConvPoolLayer(rng,
                                    input=layer1.output,
                                    image_shape=(batch_size, nkerns[1], 8, 8),
                                    filter_shape=(nkerns[2], nkerns[1], 5, 5),
                                    poolsize=(2, 2))

        #layer2_param, layer2_out = layer2.params, layer2.output

        #img_dim = ( img_dim - dim_filter[2] + 1 )/2

        layer3_input = layer2.output.flatten(2)
        layer3 = HiddenLayer(rng,
                             input=layer3_input,
                             n_in=nkerns[2] * 2 * 2,
                             n_out=100,
                             activation=T.tanh)
        '''
        layer4 = LogisticRegression(
            input=layer3.output,
            n_in = num_hidden[0],
            n_out = num_hidden[1]
        )

        params = layer0.params + layer1.params + layer2.params + layer3.params + layer4.params         

        # using L2 regularization
        #L2_reg = sum([T.sum(i**2) for i in params if 'W' in i.name])
        
        cost = layer4.negative_log_likelihood(np.argmax(y))
        #cost += gamma * L2_reg
        '''
        layer4_input = layer3.output.flatten(2)
        layer4 = HiddenLayer(rng,
                             input=layer4_input,
                             n_in=100,
                             n_out=10,
                             activation=T.nnet.softmax)
        params = layer0.params + layer1.params + layer2.params + layer3.params + layer4.params

        # using L2 regularization
        L2_reg = sum([T.sum(i**2) for i in params if i.name == 'W'])

        # Cost: mean squared error between targets and the softmax outputs,
        # averaged over the batch, plus L2 weight decay.
        cost = T.sum((y - layer4.output)**2) / y.shape[0]
        cost += gamma * L2_reg
        grads = T.grad(cost, params)

        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        self.train_model = theano.function([x, y, learning_rate],
                                           cost,
                                           updates=updates)

        self.eval_net = theano.function([x], layer4.output)

        self.params = params
Example n. 19
0
    def __init__(self, N_tot, D_in, D_out, M, Domain_number, Ydim,
                 Hiddenlayerdim1, Hiddenlayerdim2, num_MC):
        ########################################
        # set type
        self.Xlabel = T.matrix('Xlabel')
        self.X = T.matrix('X')
        self.Y = T.matrix('Y')
        self.Weight = T.matrix('Weight')

        Ydim = self.Y.shape[1]
        N = self.X.shape[0]
        self.Ntot = N_tot
        #############################################
        #Set up the back-constrained (BC) X; this will also be turned into a layer later. We generate num_MC samples.

        self.hiddenLayer_x = HiddenLayer(rng=rng,
                                         input=self.X,
                                         n_in=D_in,
                                         n_out=Hiddenlayerdim1,
                                         activation=T.nnet.relu,
                                         number='_x')
        self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                              input=self.hiddenLayer_x.output,
                                              n_in=Hiddenlayerdim1,
                                              n_out=Hiddenlayerdim2,
                                              activation=T.nnet.relu,
                                              number='_h')
        self.hiddenLayer_m = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=D_out,
                                         activation=T.nnet.relu,
                                         number='_m')
        self.hiddenLayer_S = HiddenLayer(rng=rng,
                                         input=self.hiddenLayer_hidden.output,
                                         n_in=Hiddenlayerdim2,
                                         n_out=D_out,
                                         activation=T.nnet.relu,
                                         number='_S')

        self.loc_params = []
        self.loc_params.extend(self.hiddenLayer_x.params)
        self.loc_params.extend(self.hiddenLayer_hidden.params)
        self.loc_params.extend(self.hiddenLayer_m.params)
        self.loc_params.extend(self.hiddenLayer_S.params)

        self.local_params = {}
        for i in self.loc_params:
            self.local_params[str(i)] = i

        #when we use the back constrained model....
        srng = RandomStreams(seed=234)
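        # Reparameterization trick: draw eps ~ N(0, 1) and form latent samples as
        # mean + std * eps, where hiddenLayer_m predicts the mean and
        # hiddenLayer_S the log-variance (so std = exp(S) ** 0.5).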
        sample_latent_epsilon = srng.normal((num_MC, N, D_out))
        latent_samples = sample_latent_epsilon * (
            T.exp(self.hiddenLayer_S.output)**
            0.5)[None, :, :] + self.hiddenLayer_m.output[None, :, :]

        #In the ordinary supervised case, the input would simply be copied num_MC times (see the commented line below).
        #self.Data_input=T.tile(self.X,(num_MC,1,1))
        self.Data_input = latent_samples
        ##########################################
        ####Inference on the X side
        #self.Gaussian_layer_X=KernelLayer(self.Data_input, D_in=D_out, D_out=D_in,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_X')

        self.Gaussian_layer_X = KernelLayer(self.Data_input,
                                            D_in=D_out,
                                            D_out=D_in,
                                            num_MC=num_MC,
                                            inducing_number=M,
                                            Domain_number=Domain_number,
                                            Domain_consideration=True,
                                            number='_X')

        self.params = self.Gaussian_layer_X.params
        self.Z_params_list = self.Gaussian_layer_X.Z_params_list
        self.global_param_list = self.Gaussian_layer_X.global_params_list
        self.hyp_list = self.Gaussian_layer_X.hyp_params_list

        self.hidden_layer = self.Gaussian_layer_X.output

        ##############################################################################################
        ###Computation on the Y side
        #self.Gaussian_layer_Y=KernelLayer(self.hidden_layer,D_in=D_out,D_out=Ydim,num_MC=num_MC,inducing_number=M,Domain_number=None,Domain_consideration=False,number='_Y')

        #self.params.extend(self.Gaussian_layer_Y.params)
        #self.Z_params_list.extend(self.Gaussian_layer_Y.Z_params_list)
        #self.global_param_list.extend(self.Gaussian_layer_Y.global_params_list)
        #self.hyp_list.extend(self.Gaussian_layer_Y.hyp_params_list)

        ###########################################
        ###Objective function

        #self.LL = self.Gaussian_layer_X.liklihood_nodomain(self.X)*N_tot/(N)
        self.LL = self.Gaussian_layer_X.likelihood_domain(
            self.X, self.Xlabel) * N_tot / (N)
        self.KL_U = self.Gaussian_layer_X.KL_U
        #self.KL_UY=self.Gaussian_layer_Y.KL_U
        #y=self.Gaussian_layer_Y.softmax_class()
        #self.LLY = -T.mean(T.nnet.categorical_crossentropy(y, self.Y))*N
        #self.LLY=T.sum(T.log(T.maximum(T.sum(self.Y * y, 1), 1e-16)))
        #self.error = self.Gaussian_layer_Y.error_classification(self.Y)

        self.KL_latent_dim = self.KLD_X(
            self.hiddenLayer_m.output, T.exp(
                self.hiddenLayer_S.output)) * N_tot / (N)

        #pred = T.mean(self.Gaussian_layer_X.output,0)
        #self.error = (T.mean((self.Y - pred)**2,0))**0.5

        ###########################################
        #domain checker: MMD and class classification
        #self.MMD=self.Gaussian_layer_Y.MMD_class_penalty(self.Y,self.Xlabel)

        ##########################################
        #Store the parameters
        self.hyp_params = {}
        for i in self.hyp_list:
            self.hyp_params[str(i)] = i

        self.Z_params = {}
        for i in self.Z_params_list:
            self.Z_params[str(i)] = i

        self.global_params = {}
        for i in self.global_param_list:
            self.global_params[str(i)] = i

        self.params.extend(self.loc_params)

        self.wrt = {}
        for i in self.params:
            self.wrt[str(i)] = i
Example n. 20
0
    def __init__(self,
                 rng,
                 batch_size=100,
                 input_size=None,
                 nkerns=[4, 4, 4],
                 receptive_fields=((2, 8), (2, 8), (2, 8)),
                 poolsizes=((1, 8), (1, 8), (1, 4)),
                 full_hidden=16,
                 n_out=10):
        """
        
        """
        self.x = T.matrix(name='x', dtype=theano.config.floatX
                          )  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        self.batch_size = theano.shared(
            value=batch_size, name='batch_size')  #T.lscalar('batch_size')

        self.layers = []
        self.params = []
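        # Stack the conv-pool layers: each layer's image_shape is derived from the
        # previous layer's feature-map size after convolution and pooling.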
        for i in range(len(nkerns)):
            receptive_field = receptive_fields[i]
            if i == 0:
                featmap_size_after_downsample = input_size
                layeri_input = self.x.reshape(
                    (batch_size, 1, featmap_size_after_downsample[0],
                     featmap_size_after_downsample[1]))
                image_shape = (batch_size, 1, featmap_size_after_downsample[0],
                               featmap_size_after_downsample[1])
                filter_shape = (nkerns[i], 1, receptive_field[0],
                                receptive_field[1])
            else:
                layeri_input = self.layers[i - 1].output
                image_shape = (batch_size, nkerns[i - 1],
                               featmap_size_after_downsample[0],
                               featmap_size_after_downsample[1])
                filter_shape = (nkerns[i], nkerns[i - 1], receptive_field[0],
                                receptive_field[1])

            layeri = LeNetConvPoolLayer(rng=rng,
                                        input=layeri_input,
                                        image_shape=image_shape,
                                        filter_shape=filter_shape,
                                        poolsize=poolsizes[i])
            featmap_size_after_conv = get_featmap_size_after_conv(
                featmap_size_after_downsample, receptive_fields[i])
            featmap_size_after_downsample = get_featmap_size_after_downsample(
                featmap_size_after_conv, poolsizes[i])
            self.layers.append(layeri)
            self.params.extend(layeri.params)

        # fully connected layer
        print 'going to fully connected layer'
        layer_full_input = self.layers[-1].output.flatten(2)

        # construct a fully-connected sigmoidal layer
        layer_full = HiddenLayer(rng=rng,
                                 input=layer_full_input,
                                 n_in=nkerns[-1] *
                                 featmap_size_after_downsample[0] *
                                 featmap_size_after_downsample[1],
                                 n_out=full_hidden,
                                 activation=T.tanh)
        self.layers.append(layer_full)
        self.params.extend(layer_full.params)

        # classify the values of the fully-connected sigmoidal layer
        print 'going to output layer'
        self.logRegressionLayer = LogisticRegression(
            input=self.layers[-1].output, n_in=full_hidden, n_out=n_out)
        self.params.extend(self.logRegressionLayer.params)

        # the cost we minimize during training is the NLL of the model
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost = self.logRegressionLayer.negative_log_likelihood(self.y)
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred
Example n. 21
0
def evaluate_lenet5(learning_rate=0.10,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[16, 16, 16, 12, 12, 12],
                    batch_size=500):

    rng = numpy.random.RandomState(32324)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    index = T.lscalar()  # index for each mini batch
    train_epoch = T.lscalar('train_epoch')

    x = T.matrix('x')
    y = T.ivector('y')

    # ------------------------------- Building Model ----------------------------------
    print "...Building the model"

    layer_0_input = x.reshape((batch_size, 1, 28, 28))

    # output image size = (28-5+1)/1 = 24
    layer_0 = LeNetConvPoolLayer(rng,
                                 input=layer_0_input,
                                 image_shape=(batch_size, 1, 28, 28),
                                 filter_shape=(nkerns[0], 1, 5, 5),
                                 poolsize=(1, 1))

    #output image size = (24-3+1) = 22
    layer_1 = LeNetConvPoolLayer(rng,
                                 input=layer_0.output,
                                 image_shape=(batch_size, nkerns[0], 24, 24),
                                 filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                 poolsize=(1, 1))

    #output image size = (22-3+1)/2 = 10
    layer_2 = LeNetConvPoolLayer(rng,
                                 input=layer_1.output,
                                 image_shape=(batch_size, nkerns[1], 22, 22),
                                 filter_shape=(nkerns[2], nkerns[1], 3, 3),
                                 poolsize=(2, 2))

    #output image size = (10-3+1)/2 = 4
    layer_3 = LeNetConvPoolLayer(rng,
                                 input=layer_2.output,
                                 image_shape=(batch_size, nkerns[2], 10, 10),
                                 filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                 poolsize=(2, 2))

    #output image size = (4-3+2+1) = 4
    layer_4 = LeNetConvPoolLayer(rng,
                                 input=layer_3.output,
                                 image_shape=(batch_size, nkerns[3], 4, 4),
                                 filter_shape=(nkerns[4], nkerns[3], 3, 3),
                                 poolsize=(1, 1),
                                 border_mode=1)

    #output image size = (4-3+2+1)/2 = 2 (with padding border_mode=1)
    layer_5 = LeNetConvPoolLayer(rng,
                                 input=layer_4.output,
                                 image_shape=(batch_size, nkerns[4], 4, 4),
                                 filter_shape=(nkerns[5], nkerns[4], 3, 3),
                                 poolsize=(2, 2),
                                 border_mode=1)

    # make the input to hidden layer 2 dimensional
    layer_6_input = layer_5.output.flatten(2)

    layer_6 = HiddenLayer(rng,
                          input=layer_6_input,
                          n_in=nkerns[5] * 2 * 2,
                          n_out=200,
                          activation=T.tanh)

    layer_7 = LogReg(input=layer_6.output, n_in=200, n_out=10)

    teacher_p_y_given_x = theano.shared(numpy.asarray(
        pickle.load(open('prob_best_model.pkl', 'rb')),
        dtype=theano.config.floatX),
                                        borrow=True)
    p_y_given_x = T.matrix('p_y_given_x')
    e = theano.shared(value=0, name='e', borrow=True)

    # Total cost: hard-label negative log-likelihood plus a soft-target term that
    # matches the student's p_y_given_x to the teacher's distribution; the 2.0 / e
    # factor (e is bound to the current epoch) anneals the soft-target weight.
    cost = layer_7.neg_log_likelihood(
        y) + 2.0 / (e) * T.mean(-T.log(layer_7.p_y_given_x) * p_y_given_x -
                                layer_7.p_y_given_x * T.log(p_y_given_x))

    tg = theano.shared(numpy.asarray(pickle.load(
        open('modified_guided_data.pkl', 'rb')),
                                     dtype=theano.config.floatX),
                       borrow=True)
    guiding_weights = T.tensor4('guiding_weights')
    #guide_cost = T.mean(-T.log(layer_3.output)*guiding_weights - layer_3.output*T.log(guiding_weights))
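    # Guided cost: mean squared error between the layer_3 feature maps and the
    # pre-computed guiding targets loaded from 'modified_guided_data.pkl'; it is
    # used to additionally train the lower layers during the early epochs.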
    guide_cost = T.mean((layer_3.output - guiding_weights)**2)
    test_model = theano.function(
        [index],
        layer_7.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer_7.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # list of parameters

    params = layer_7.params + layer_6.params + layer_5.params + layer_4.params + layer_3.params + layer_2.params + layer_1.params + layer_0.params
    params_gl = layer_3.params + layer_2.params + layer_1.params + layer_0.params
    # import pdb
    # pdb.set_trace()
    grads_gl = T.grad(guide_cost, params_gl)
    updates_gl = [(param_i, param_i - learning_rate * grad_i)
                  for param_i, grad_i in zip(params_gl, grads_gl)]

    grads = T.grad(cost, params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index, train_epoch],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            p_y_given_x: teacher_p_y_given_x[index],
            e: train_epoch
        })
    train_till_guided_layer = theano.function(
        [index],
        guide_cost,
        updates=updates_gl,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            guiding_weights: tg[index]
        },
        on_unused_input='ignore')

    # -----------------------------------------Starting Training ------------------------------

    print('..... Training ')

    # for early stopping
    patience = 10000
    patience_increase = 2

    improvement_threshold = 0.95

    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf  # initialising loss to be infinite
    best_itr = 0
    test_score = 0

    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            if epoch < n_epochs / 5:
                cost_ij_guided = train_till_guided_layer(minibatch_index)
            cost_ij = train_model(minibatch_index, epoch)

            if (iter + 1) % validation_frequency == 0:
                # compute loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                # import pdb
                # pdb.set_trace()

                with open('Student_6_terminal_out_2', 'a+') as f_:
                    f_.write(
                        'epoch %i, minibatch %i/%i, validation error %f %% \n'
                        % (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                # check with best validation score till now
                if this_validation_loss < best_validation_loss:

                    # improve
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_itr = iter

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    with open('Student_6_terminal_out_2', 'a+') as f_:
                        f_.write(
                            'epoch %i, minibatch %i/%i, testing error %f %%\n'
                            % (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.))
                    with open('best_model_7layer_2.pkl', 'wb') as f:
                        pickle.dump(params, f)
                    with open('Results_student_6_2.txt', 'wb') as f1:
                        f1.write(str(test_score * 100) + '\n')
            #if patience <= iter:
            #	done_looping = True
            #	break

    end_time = timeit.default_timer()
    with open('Student_6_terminal_out_2', 'a+') as f_:
        f_.write('Optimization complete\n')
        f_.write(
            'Best validation score of %f %% obtained at iteration %i with test performance %f %% \n'
            % (best_validation_loss * 100., best_itr, test_score * 100))
        f_.write('The code ran for %.2fm\n' % ((end_time - start_time) / 60.))
Example n. 22
0
#    image_shape=(batch_size, nkerns[0], 9, 9),
#    filter_shape=(nkerns[1], nkerns[0], 4, 4),
#    poolsize=(2, 2)
# )

# the HiddenLayer being fully-connected, it operates on 2D matrices of
# shape (batch_size, num_pixels) (i.e matrix of rasterized images).
# This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
# or (500, 50 * 4 * 4) = (500, 800) with the default values.
layer2_input = layer0.output.flatten(2)

# construct a fully-connected sigmoidal layer
layer2 = HiddenLayer(
    rng,
    input=layer2_input,
    n_in=nkerns[0] * 2 * 2,
    n_out=50,
    activation=T.tanh
)

# classify the values of the fully-connected sigmoidal layer
layer3 = LogisticRegression(input=layer2.output, n_in=50, n_out=10)

# the cost we minimize during training is the NLL of the model
cost = layer3.negative_log_likelihood(y)

# create a function to compute the mistakes that are made by the model
test_model = theano.function(
    [index],
    layer3.errors(y),
    givens={
Example n. 23
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
Example n. 24
0
# filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
# maxpooling reduces this further to (8/2,8/2) = (4,4)
# 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
        image_shape=(batch_size, nkerns[0], layer1_input_img_size[0], layer1_input_img_size[1]),
        filter_shape=(nkerns[1], nkerns[0], filter1_shape[0], filter1_shape[1]), poolsize=(2, 2) \
        )

# the TanhLayer being fully-connected, it operates on 2D matrices of
# shape (batch_size,num_pixels) (i.e matrix of rasterized images).
# This will generate a matrix of shape (20,32*4*4) = (20,512)
layer2_input = layer1.output.flatten(2)

# construct a fully-connected sigmoidal layer
layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * layer2_input_img_size[0] * layer2_input_img_size[1],
                     n_out=layer2_out, activation=T.tanh \
                     )

# classify the values of the fully-connected sigmoidal layer
layer3 = LogisticRegression(input=layer2.output, n_in=layer2_out, n_out=N_OUT\
                            )


def load_trained_model():
    global train_model_route
    global layer0_input
    global layer0
    global layer1
    global layer2_input
    global layer2
    global layer3
Example n. 25
0
    def __init__(self,
                 PV,
                 numpy_rng,
                 kind=2,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 h_activation=[],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """
        self.PV = theano.shared(value=PV, borrow=True)
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels
        self.z1 = T.matrix('z1')
        self.z2 = T.matrix('z2')
        # end-snippet-1

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoders
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well)
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP

        # start-snippet-2
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            activation = None
            if h_activation[i] == 1:
                activation = T.nnet.sigmoid
            if h_activation[i] == 2:
                activation = T.tanh
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=activation)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        # end-snippet-2
        # We now need to add a output layer on top of the MLP
        self.OutLayer = HiddenLayer(rng=numpy_rng,
                                    input=self.sigmoid_layers[-1].output,
                                    n_in=hidden_layers_sizes[-1],
                                    n_out=n_outs,
                                    activation=T.nnet.sigmoid,
                                    kind=kind)
        self.params.extend(self.OutLayer.params)

        # construct a function that implements one step of finetuning

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.OutLayer.sq_loss(self.z1, self.z2)

        # symbolic variables for the number of errors made on the minibatch
        # given by self.x and self.y, and for the network's output on that
        # minibatch
        self.errors = self.OutLayer.errors(self.y)
        self.p_y_given_x = self.OutLayer.output
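A minimal sketch (not part of the original example) of how a finetuning step for this SdA could be assembled with plain SGD, assuming Theano and only the attributes built above (self.finetune_cost, self.params); the helper name build_finetune_updates is hypothetical.

import theano.tensor as T

def build_finetune_updates(finetune_cost, params, learning_rate=0.1):
    # one SGD step: move each parameter against its gradient
    grads = T.grad(finetune_cost, params)
    return [(param, param - learning_rate * grad)
            for param, grad in zip(params, grads)]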
Esempio n. 26
0
def load_trained_model():
    global train_model_route
    global layer0_input
    global layer0
    global layer1
    global layer2_input
    global layer2
    global layer3

    global layer0_input_img_size  # ishape
    global filter0_shape
    global layer1_input_img_size
    global filter1_shape
    global layer2_input_img_size

    print "loading trained model"
    trained_model_pkl = open(train_model_route, 'rb')  # binary mode for cPickle
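    # NOTE (assumption): the pickle file appears to hold two serialized
    # objects; the first load below is not used further, and the second is
    # unpacked into the per-layer [W, b] states.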
    trained_model_state_list = cPickle.load(trained_model_pkl)
    trained_model_state_array = numpy.load(trained_model_pkl)
    layer0_state, layer1_state, layer2_state, layer3_state = trained_model_state_array

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... loading the model'

    # Reshape the matrix of rasterized images of shape (batch_size, img_h * img_w)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape(
        (batch_size, 1, layer0_input_img_size[0], layer0_input_img_size[1]))

    # Construct the first convolutional pooling layer:
    # filtering shrinks each image side by (filter0_shape - 1),
    # and (2, 2) maxpooling then halves each spatial dimension,
    # giving a 4D output tensor of shape (batch_size, nkerns[0], h1, w1)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input, \
            image_shape=(batch_size, 1, layer0_input_img_size[0], layer0_input_img_size[1]), \
            filter_shape=(nkerns[0], 1, filter0_shape[0], filter0_shape[1]), poolsize=(2, 2), \
                W=layer0_state[0], b=layer0_state[1] \
            )

    # Construct the second convolutional pooling layer:
    # filtering again shrinks each side by (filter1_shape - 1),
    # and (2, 2) maxpooling halves each spatial dimension,
    # giving a 4D output tensor of shape (batch_size, nkerns[1], h2, w2)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], layer1_input_img_size[0], layer1_input_img_size[1]),
            filter_shape=(nkerns[1], nkerns[0], filter1_shape[0], filter1_shape[1]), poolsize=(2, 2), \
            W=layer1_state[0], b=layer1_state[1] \
            )

    # the fully-connected tanh layer operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # Flattening produces a matrix of shape
    # (batch_size, nkerns[1] * layer2_input_img_size[0] * layer2_input_img_size[1])
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with tanh activation
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * layer2_input_img_size[0] * layer2_input_img_size[1],
                         n_out=layer2_out, activation=T.tanh, \
                         W=layer2_state[0], b=layer2_state[1] \
                         )

    # classify the values of the fully-connected tanh layer
    layer3 = LogisticRegression(input=layer2.output, n_in=layer2_out, n_out=N_OUT, \
                                    W=layer3_state[0], b=layer3_state[1] \
                                )
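A hypothetical counterpart (an assumption, not in the source) of the save routine that could have produced the per-layer states read above: each layer contributes a [W, b] pair of parameter values and the list is pickled; the helper name save_trained_model and the exact file layout are assumptions.

import cPickle  # Python 2, as in the surrounding examples

def save_trained_model(path, layers):
    # layers: iterable of layer objects exposing Theano shared variables W and b
    state = [[layer.W.get_value(borrow=True), layer.b.get_value(borrow=True)]
             for layer in layers]
    with open(path, 'wb') as f:
        cPickle.dump(state, f, protocol=cPickle.HIGHEST_PROTOCOL)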
Esempio n. 27
0
    def __init__(self, numpy_rng=None, theano_rng=None, n_ins=784,
                 gauss=True,
                 hidden_layers_sizes=[400], n_outs=40,
                 W_list=None, b_list=None):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type gauss: bool
        :param gauss: True if the first layer is a Gaussian RBM; otherwise
                      the first layer is a Bernoulli RBM

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type W_list: list of numpy.ndarray
        :param W_list: the list of weight matrices for each layer of the MLP; if
                       None each matrix is randomly initialized

        :type b_list: list of numpy.ndarray
        :param b_list: the list of bias vectors for each layer of the MLP; if
                       None each vector is randomly initialized
        """

        self.n_ins = n_ins
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.stacked_layers_sizes = hidden_layers_sizes + [n_outs]
        self.n_layers = len(self.stacked_layers_sizes)

        assert self.n_layers > 0

        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(123)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data

        # the data is presented as rasterized images
        self.x = tensor.matrix('x')

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well).

        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = self.stacked_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            n_in = input_size
            n_out = self.stacked_layers_sizes[i]

            print('Adding a layer with %i input and %i outputs' %
                  (n_in, n_out))

            if W_list is None:
                W = numpy.asarray(numpy_rng.uniform(
                                low=-4.*numpy.sqrt(6. / (n_in + n_out)),
                                high=4.*numpy.sqrt(6. / (n_in + n_out)),
                                size=(n_in, n_out)
                             ),dtype=theano.config.floatX)
            else:
                W = W_list[i]

            if b_list is None:
                b = numpy.zeros((n_out,), dtype=theano.config.floatX)
            else:
                b = b_list[i]

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=n_in,
                                        n_out=n_out,
                                        W=theano.shared(W,name='W',borrow=True),
                                        b=theano.shared(b,name='b',borrow=True),
                                        activation=tensor.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            if i == 0 and gauss:
                rbm_layer = GRBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=self.stacked_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=self.stacked_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)

            self.rbm_layers.append(rbm_layer)
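A small standalone sketch (not part of the source) of the weight initialization used in the loop above: weights are drawn uniformly from +/- 4 * sqrt(6 / (n_in + n_out)), the range commonly used for sigmoid hidden layers; the helper name init_sigmoid_W is hypothetical.

import numpy

def init_sigmoid_W(rng, n_in, n_out, dtype='float32'):
    # symmetric uniform range scaled for sigmoid activations
    bound = 4.0 * numpy.sqrt(6.0 / (n_in + n_out))
    return numpy.asarray(rng.uniform(low=-bound, high=bound, size=(n_in, n_out)),
                         dtype=dtype)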
Esempio n. 28
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
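            # (as in the SdA/DBN examples above, only the sigmoid layers'
            # parameters are declared parameters of this DBN; the RBM visible
            # biases below are not added to self.params)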

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for the second phase of training, defined as the
        # negative log likelihood of the logistic output layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
def evaluate_lenet5(learning_rate=0.095,
                    n_epochs=2000,
                    nkerns=[20, 50],
                    batch_size=110):
    """ Demonstrates lenet on dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 50 * 50)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (50, 50) is the size of the input images.
    layer0_input = x.reshape((batch_size, 1, 50, 50))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (50-5+1, 50-5+1) = (46, 46)
    # maxpooling reduces this further to (46/2, 46/2) = (23, 23)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 23, 23)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 50, 50),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (23-11+1, 23-11+1) = (13, 13)
    # maxpooling reduces this further to (13/2, 13/2) = (6, 6)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 6, 6)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 23, 23),
                                filter_shape=(nkerns[1], nkerns[0], 11, 11),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 6 * 6),
    # or (110, 50 * 6 * 6) = (110, 1800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with tanh activation
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 6 * 6,
                         n_out=110,
                         activation=T.tanh)

    # classify the values of the fully-connected tanh layer
    layer3 = LogisticRegression(input=layer2.output, n_in=110, n_out=8)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 1393  # look at this many iterations regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print(
        'Best validation error of %f %% obtained at iteration %i, '
        'with test error %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
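A compact restatement (illustrative only; the helper name update_patience is not from the source) of the early-stopping rule used in the training loop above: patience is extended whenever the validation loss improves by more than improvement_threshold, and training stops once the iteration count reaches patience.

def update_patience(patience, iteration, this_loss, best_loss,
                    patience_increase=2, improvement_threshold=0.995):
    # returns (new_patience, new_best_loss, done_looping)
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    best_loss = min(best_loss, this_loss)
    return patience, best_loss, patience <= iteration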
Esempio n. 30
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_onlyMT_BBN_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model reproduces the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_only_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the embedding matrix with small Gaussian noise
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array "rand_values" in a Theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  # this input format can be fed into a CNN, GRU, or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  # (type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
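    # note: conv_b_context and conv_b_context2 are not included in ACNN_para,
    # so those two context-convolution biases are never updated by the
    # gradient steps defined below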
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features at UNK/padding positions
    sent_embeddings = conv_model.maxpool_vec  # (batch_size, hidden_size): each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out to zero out features at UNK/padding positions
    sent_embeddings2 = conv_model2.maxpool_vec  # (batch_size, hidden_size): each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  # U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
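    # feature-size bookkeeping for the concatenation above:
    #   5 * hidden_size[0] -> sent_embeddings, sent_embeddings2, gru_sent_embeddings,
    #                         sent_att_embeddings, sent_att_embeddings2
    #   emb_size           -> bow_emb
    #   4 * type_size      -> dot_dnn_dataless_1, dot_dnn_dataless_2,
    #                         cosine_score_matrix, top_k_score_matrix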
    # classification layer: maps the concatenated feature vector to a score for each of the 12 SF types
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
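    # advanced indexing: for every example in the batch and each of its 4
    # "other" fields, pick the softmax probability assigned to the gold label,
    # giving a flat vector of length batch_size * 4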
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  # (batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many iterations regardless (effectively never stop early)
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
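    # the final entry (test_size - batch_size) makes the last batch overlap the
    # previous one so every test example is covered; n_test_remain is used later
    # to keep only the non-overlapping tail of that last batch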

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed so far,
            # across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            # after every 20 batches, test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))