Example #1

def main3():
    rng = numpy.random.RandomState(23455)
 
    docList = ttList.TypedListType(
        ttList.TypedListType(TensorType(theano.config.floatX, (False, False)))
    )("docList")
    docLabel = T.ivector('docLabel') 
    layer0 = DocEmbeddingNN(docList, rng, 4)
    
    layer1 = HiddenLayer(
        rng,
        input=layer0.output,
        n_in=layer0.outputDimension,
        n_out=10,
        activation=T.tanh
    )
    
    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=10)
    cost = layer2.negative_log_likelihood(docLabel)
    params = layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)
    
    f = theano.function([docList], layer2.y_pred)
    
    
    a = [
            [
              [[2, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
              [[1, 2, 4, 4], [1, 2, 3, 4]]
             ],
          [
              [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
              [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
             ]
        ]
    print f(a)
    print "All finished!"
Example #2

def sgd_predict(dataset=DataHome, batch_size=28):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """

    logistic_regression_model_pkl = open(train_model_route, "rb")
    logistic_regression_model_state = cPickle.load(logistic_regression_model_pkl)
    W, b = logistic_regression_model_state

    datasets = load_data.load_data(dataset)

    test_set_x, test_set_y = datasets[2]

    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    # print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10, W=W, b=b)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch

    test_results = theano.function(
        inputs=[index], outputs=classifier.y_pred, givens={x: test_set_x[index * batch_size : (index + 1) * batch_size]}
    )

    test_res = [test_results(i) for i in xrange(n_test_batches)]
    print test_res
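
For readers unfamiliar with the `givens` mechanism used above, here is a minimal self-contained sketch of the same minibatch-slicing pattern; the toy shared dataset, the `probe` output and all sizes are made-up stand-ins, not part of the original example.

import numpy
import theano
import theano.tensor as T

# toy "test set": 100 random 4-dimensional examples stored in a shared variable
toy_x = theano.shared(numpy.random.rand(100, 4).astype(theano.config.floatX))

index = T.lscalar()      # minibatch index
x = T.matrix('x')        # symbolic input, as in the example above
probe = x.sum(axis=1)    # stand-in for classifier.y_pred

batch = 10
f = theano.function(
    inputs=[index],
    outputs=probe,
    givens={x: toy_x[index * batch:(index + 1) * batch]})

print [f(i).shape for i in xrange(100 // batch)]   # ten minibatches of ten rows each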
Example #3

class MLP(object):

	def __init__(self, rng, input, n_in, n_hidden, n_out):

		# hidden layer, defined in HiddenLayer.py
		self.hiddenLayer = HiddenLayer(rng = rng, input = input,
		n_in = n_in, n_out = n_hidden, activation = T.tanh)
	
		# output layer, logistic regression
		self.logRegressionLayer = LogisticRegression(input = self.hiddenLayer.output, 
		n_in = n_hidden, n_out = n_out)

		# Regularization of params
		# option 1: L1 regularization of params
		self.L1 = abs(self.hiddenLayer.W).sum() \
			+ abs(self.logRegressionLayer.W).sum()

		self.L2_sqr = (self.hiddenLayer.W **2).sum() \
				+ (self.logRegressionLayer.W **2).sum()

		# Define the log likelihood, errors based on component models

		self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood 

		self.errors = self.logRegressionLayer.errors

		self.params = self.hiddenLayer.params + \
				self.logRegressionLayer.params
		
	def __getstate__(self):
		""" Return the hidden layer and logistic regression layer that make up the MLP. """
		return (self.hiddenLayer,self.logRegressionLayer)
	
	def __setstate__(self, state):
		""" Re-establish the hidden layer and logistic regression layer objects. """
		(hiddenLayer,logRegressionLayer) = state
		self.hiddenLayer = hiddenLayer
		self.logRegressionLayer = logRegressionLayer
	
	def reconstruct_state(self,input, activation):
		""" Re-establish the inputs for each layer of the MLP. """
		self.hiddenLayer.reconstruct_state(input,activation)
		self.logRegressionLayer.reconstruct_state(self.hiddenLayer.output)
		self.L1 = abs(self.hiddenLayer.W).sum() \
				        + abs(self.logRegressionLayer.W).sum()
		
		self.L2_sqr = (self.hiddenLayer.W **2).sum() \
	                        + (self.logRegressionLayer.W **2).sum()

		# Define the log likelihood, errors based on component models

		self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood 

		self.errors = self.logRegressionLayer.errors

		self.params = self.hiddenLayer.params + \
	                        self.logRegressionLayer.params		
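
The L1 and L2_sqr terms defined above are meant to be folded into the training cost; a minimal usage sketch, assuming the MLP class above plus illustrative sizes and regularization coefficients:

x = T.matrix('x')
y = T.ivector('y')
rng = numpy.random.RandomState(1234)

mlp = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)

# negative log likelihood plus the two regularization terms built in __init__
L1_reg, L2_reg = 0.00, 0.0001
cost = mlp.negative_log_likelihood(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr

grads = T.grad(cost, mlp.params)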
Example #4

    def build_network(self,nkerns=[20, 50], batch_size=10):

        rng = numpy.random.RandomState(23455)
        layer0_input = self.x.reshape((batch_size, 1, 128, 128))

        # Construct the first convolutional pooling layer:
        # filtering reduces the 128x128 image to (128-5+1, 128-5+1) = (124, 124)
        # maxpooling reduces this further to (124/2, 124/2) = (62, 62)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 62, 62)
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(batch_size, 1, 128, 128),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2)
        )

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (62-5+1, 62-5+1) = (58, 58)
        # maxpooling reduces this further to (58/2, 58/2) = (29, 29)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 29, 29)
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(batch_size, nkerns[0], 62, 62),
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            poolsize=(2, 2)
        )

        # the HiddenLayer is fully connected, so it operates on 2D matrices of
        # shape (batch_size, num_pixels), i.e. a matrix of rasterized images.
        # This will generate a matrix of shape (batch_size, nkerns[1] * 29 * 29),
        # or (10, 50 * 29 * 29) = (10, 42050) with the default values.
        layer2_input = layer1.output.flatten(2)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * 29 * 29,
            n_out=500,
            activation=T.tanh
        )

        # classify the values of the fully-connected sigmoidal layer
        layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=23)
        self.cost = layer3.negative_log_likelihood(self.y)
        self.out_layer=layer3
        '''batches = numpy.zeros((10, 128*128),dtype='float32')
        label=numpy.zeros((10))
        print layer2.output.shape.eval({self.x:batches})'''

        self.feature_layer=layer2
        self.params_except_3=layer2.params + layer1.params + layer0.params
        self.params = layer3.params + layer2.params + layer1.params + layer0.params
        # create a list of gradients for all model parameters
        self.grads = T.grad(self.cost, self.params)
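
The feature-map sizes hard-coded above (62x62 after layer0, 29x29 after layer1) follow from a 'valid' convolution plus non-overlapping max pooling; a small helper (not part of the original code) makes the bookkeeping explicit:

def conv_pool_output_size(in_size, filter_size, pool_size):
    # side length after a 'valid' convolution followed by max pooling
    return (in_size - filter_size + 1) // pool_size

print conv_pool_output_size(128, 5, 2)   # 62 -> image_shape of layer1
print conv_pool_output_size(62, 5, 2)    # 29 -> n_in = nkerns[1] * 29 * 29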
Example #5

def classify_lenet5(learning_rate=0.005, n_epochs=8000,
                    image_path='D:/dev/datasets/isbi/train-input/train-input_0000.tif',
                    paramfile='lenet0_membrane_epoch_25100.pkl.gz',
                    nkerns=[20, 50], batch_size=1):

    rng = numpy.random.RandomState(23455)

    # allocate symbolic variables for the data
    index_x = T.lscalar()  # index to a [mini]batch
    index_y = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, 28, 28),
            filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], 12, 12),
            filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))

    # the HiddenLayer (tanh) is fully connected, so it operates on 2D matrices
    # of shape (batch_size, num_pixels), i.e. a matrix of rasterized images.
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4) = (1, 800)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
Example #6

def build_model_example(batch_size, learning_rate, rng, x, y):
    nkerns = [64]
    print '... building the model'
    # Reshape the rasterized input into a 5D tensor of shape
    # (batch_size, 7, 4, 7, 7), compatible with our LeNetConvPool3dLayer,
    # and apply 20% dropout to the input
    layer0_input = _dropout_from_layer(rng, x.reshape((batch_size, 7, 4, 7, 7)), p=0.2)
    # Construct the first 3D convolutional pooling layer:
    # 5x5x5 filtering over the 7x7x7 volume (4 input channels) reduces each
    # feature map to (7-5+1, 7-5+1, 7-5+1) = (3, 3, 3), so the flattened
    # output has nkerns[0] * 3**3 features per example
    layer0 = LeNetConvPool3dLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 7, 4, 7, 7),
                                filter_shape=(nkerns[0], 5, 4, 5, 5), )
    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    # layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
    #         image_shape=(batch_size, nkerns[0], 12, 12),
    #         filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))
    # the HiddenLayer is fully connected, so it operates on 2D matrices of
    # shape (batch_size, num_pixels), i.e. a matrix of rasterized images.
    # This will generate a matrix of shape (batch_size, nkerns[0] * 3**3)
    layer1_input = layer0.output.flatten(2)
    # layer1 = HiddenLayer(rng, input=x, n_in=4*7*7*7,
    layer1 = HiddenLayer(rng, input=layer1_input, n_in=nkerns[-1] * 3 ** 3,
                         n_out=1000,
                         activation=relu, )
    # p=0.5)
    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer1.output, n_in=1000,
                         n_out=1000,
                         activation=relu)
    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=1000, n_out=2)
    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # params = layer3.params + layer0.params + layer1.params
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
    return cost, layer3, updates
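
The cost and updates returned by build_model_example are meant to be compiled into a training function; a minimal sketch of that wiring, in which the shared variables train_x/train_y and the chosen sizes are assumptions for illustration only:

batch_size, learning_rate = 50, 0.01
x = T.matrix('x')
y = T.ivector('y')
rng = numpy.random.RandomState(1234)
cost, layer3, updates = build_model_example(batch_size, learning_rate, rng, x, y)

index = T.lscalar('index')
train_fn = theano.function(
    inputs=[index],
    outputs=cost,
    updates=updates,
    givens={
        x: train_x[index * batch_size:(index + 1) * batch_size],
        y: train_y[index * batch_size:(index + 1) * batch_size]})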
Example #7

def compileFun(model_name, dataset_name, pooling_mode):
	print "model_name: ", model_name
	print "dataset_name: ", dataset_name
	print "pooling_mode: ", pooling_mode
	print "Started!"
	rng = numpy.random.RandomState(23455)
	sentenceWordCount = T.ivector("sentenceWordCount")
	corpus = T.matrix("corpus")
# 	docLabel = T.ivector('docLabel') 
	
	# for list-type data
	layer0 = DocEmbeddingNNOneDoc(corpus, sentenceWordCount, rng, wordEmbeddingDim=200, \
													 sentenceLayerNodesNum=100, \
													 sentenceLayerNodesSize=[5, 200], \
													 docLayerNodesNum=100, \
													 docLayerNodesSize=[3, 100],
													 pooling_mode=pooling_mode)

	layer1_output_num = 100
	layer1 = HiddenLayer(
		rng,
		input=layer0.output,
		n_in=layer0.outputDimension,
		n_out=layer1_output_num,
		activation=T.tanh
	)
	
	layer2 = LogisticRegression(input=layer1.output, n_in=100, n_out=2)

	cost = layer2.negative_log_likelihood(1 - layer2.y_pred)
		
	# calculate sentence sentence_score
	sentence_grads = T.grad(cost, layer0.sentenceResults)
	sentence_score = T.diag(T.dot(sentence_grads, T.transpose(layer0.sentenceResults)))
	
	# construct the parameter array.
	params = layer2.params + layer1.params + layer0.params
	
	# Load the parameters last time, optionally.
	model_path = "data/" + dataset_name + "/" + model_name + "/" + pooling_mode + ".model"
	loadParamsVal(model_path, params)
	print "Compiling computing graph."
	output_model = theano.function(
 		[corpus, sentenceWordCount],
 		[layer2.y_pred, sentence_score]
 	)
	
	print "Compiled."
	return output_model
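
A note on the sentence_score computed above: T.diag(T.dot(A, B.T)) only keeps the row-wise inner products, so an equivalent formulation that avoids building the full matrix product would be:

# sentence_score[i] = <sentence_grads[i], layer0.sentenceResults[i]>
sentence_score = T.sum(sentence_grads * layer0.sentenceResults, axis=1)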
Example #8

    def depickle(self,n_in = 48* 48,hidden_layers_sizes=[1000,1000,1000]):
        numpy_rng=numpy.random.RandomState(123)
        self.sigmoid_layers = []
        self.rbm_layers = []
        last_later = self.x
        cur_index= 0
        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))
        for param in self.params:
            print param.get_value().shape
        for index in range(self.n_layers):
            if index == 0:
                input_size= n_in
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[index- 1]
                layer_input =self.sigmoid_layers[-1].output

            simple_sigmoid = HiddenLayer(numpy_rng, input=layer_input, n_in=input_size,
                                         n_out=hidden_layers_sizes[index],
                                         W=self.params[cur_index], b=self.params[cur_index + 1],
                                         activation=T.nnet.sigmoid)
            print self.params[cur_index].get_value().shape
            print self.params[cur_index+1].get_value().shape
            cur_index+= 2
            self.sigmoid_layers.append(simple_sigmoid)

            rbm_layer = RBM(numpy_rng=numpy_rng,input=layer_input,
                    W=simple_sigmoid.W,
                    n_visible=input_size,
                    n_hidden=hidden_layers_sizes[index],
                    hbias=simple_sigmoid.b)
            self.rbm_layers.append(rbm_layer)
            
        self.logLayer = LogisticRegression(input = self.sigmoid_layers[-1].output,n_in=hidden_layers_sizes[-1], n_out=7)
        self.logLayer.W = self.params[cur_index]
        self.logLayer.b = self.params[cur_index+1]

        self.y_pred = self.logLayer.p_y_given_x
Example #9

	def __init__(self, rng, input, n_in, n_hidden, n_out):

		# hidden layer, defined in HiddenLayer.py
		self.hiddenLayer = HiddenLayer(rng = rng, input = input,
		n_in = n_in, n_out = n_hidden, activation = T.tanh)
	
		# output layer, logistic regression
		self.logRegressionLayer = LogisticRegression(input = self.hiddenLayer.output, 
		n_in = n_hidden, n_out = n_out)

		# Regularization of params
		# option 1: L1 regularization of params
		self.L1 = abs(self.hiddenLayer.W).sum() \
			+ abs(self.logRegressionLayer.W).sum()

		self.L2_sqr = (self.hiddenLayer.W **2).sum() \
				+ (self.logRegressionLayer.W **2).sum()

		# Define the log likelihood, errors based on component models

		self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood 

		self.errors = self.logRegressionLayer.errors

		self.params = self.hiddenLayer.params + \
				self.logRegressionLayer.params
Example #10

    def __init__(self, rng, input, n_in, n_hidden, n_out,discriminant_threshold):
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out,
            discriminant_threshold = discriminant_threshold
        )

        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        self.errors = self.logRegressionLayer.errors
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        self.layers = [self.hiddenLayer, self.logRegressionLayer] 
        self.ypred = self.logRegressionLayer.y_pred
        self.py_given_x = self.logRegressionLayer.p_y_given_x
Example #11

    def __init__(self,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 numpy_rng=None,
                 theano_rng=None):
        
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.matrix('x')  
        self.y = T.ivector('y')  

        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            self.sigmoid_layers.append(sigmoid_layer)

            self.params.extend(sigmoid_layer.params)

            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        self.errors = self.logLayer.errors(self.y)
Example #12

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ numpy_rng: numpy.random.RandomState: numpy random number generator used to draw initial weights
            theano_rng: theano.tensor.shared_randomstreams.RandomStreams: Theano random generator; if None, one is generated from a seed drawn from `numpy_rng`
            n_ins: int: dimension of the input to the SDAE
            hidden_layers_sizes: list of ints: intermediate layer sizes, must contain at least one value
            n_outs: int: dimension of the output of the network
            corruption_levels: list of float: amount of corruption to use for each layer
        """
        self.sigmoid_layers = []
        self.DAE_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
       
        # Symbolic variables for data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

        # The SDAE is an MLP in which the weights of the intermediate layers are
        # shared with a set of denoising autoencoders. We first construct the
        # SDAE as a deep multilayer perceptron, and when constructing each
        # sigmoidal layer we also construct a denoising autoencoder that shares
        # weights with that layer.
        #     - During pretraining we train these autoencoders (which also
        #       changes the weights of the MLP)
        #     - During finetuning we finish training the SDAE by doing
        #       stochastic gradient descent on the MLP

        # Build the MLP and the DAEs in parallel, sharing weights, layer by layer
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output_data

            # hidden layer (logistic)
            sigmoid_layer = HiddenLayer(rng=numpy_rng, 
                                        input_data=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)    # add the layer to our list of layers
            self.params.extend(sigmoid_layer.params)
            # We are going to only declare that the parameters of the sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the DAE are parameters of those DAE, but not the SDAE

            # DAE layer with shared weights with MLP layer
            DAE_layer = DAE(numpy_rng=numpy_rng, theano_rng=theano_rng,
                          input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W, bhid=sigmoid_layer.b)  # shared weights and biases
            self.DAE_layers.append(DAE_layer)

        # Finally, Add a logistic layer at the end of the MLP
        self.logisticLayer = LogisticRegression(input_data=self.sigmoid_layers[-1].output_data, n_in=hidden_layers_sizes[-1], n_out=n_outs)
        self.params.extend(self.logisticLayer.params)

        self.finetune_cost = self.logisticLayer.negative_log_likelihood(self.y)
        self.errors = self.logisticLayer.errors(self.y)
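
As a sketch of the finetuning step described in the comments above, built only from the attributes this constructor exposes; the enclosing class name (here StackedDAE) and the learning rate are assumptions:

numpy_rng = numpy.random.RandomState(89677)
sdae = StackedDAE(numpy_rng=numpy_rng, n_ins=784,
                  hidden_layers_sizes=[500, 500], n_outs=10)

# finetuning: plain SGD on the supervised cost over all shared parameters
gparams = T.grad(sdae.finetune_cost, sdae.params)
updates = [(p, p - 0.1 * g) for p, g in zip(sdae.params, gparams)]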
Example #13

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.matrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[i - 1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)

            self.dA_layers.append(dA_layer)


        self.logLayer = LogisticRegression(
                        input=self.sigmoid_layers[-1].output,
                        n_in=hidden_layers_sizes[-1],
                        n_out=n_outs)

        self.params.extend(self.logLayer.params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        self.errors = self.logLayer.errors(self.y)
Example #14

  def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
      hidden_layers_sizes=[500, 500], n_outs=10, corruption_levels=[0.1, 0.1]):
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
      theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    #symbolic variables for data
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in xrange(self.n_layers):
      if i == 0:
        input_size = n_ins
        layer_input = self.x
      else:
        input_size = hidden_layers_sizes[i-1]
        layer_input = self.sigmoid_layers[-1].output

      sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size,
          n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid)
      
      #add to our list of layers
      self.sigmoid_layers.append(sigmoid_layer)

      #declare that the parameters of the sigmoid layers are parameters of the StackedDAA
      #the visible biases in the dA are parameters of those dA, but not of the SdA
      self.params.extend(sigmoid_layer.params)

      #construct denoising autoencoder that shared weights with this layer
      dA_layer = dA(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, 
          n_visible=input_size,
          n_hidden=hidden_layers_sizes[i],
          W=sigmoid_layer.W,
          bhid=sigmoid_layer.b)
      self.dA_layers.append(dA_layer)

    #add logistic layer on top of MLP
    self.logLayer = LogisticRegression(input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1], n_out=n_outs)
    self.params.extend(self.logLayer.params)

    #function that implements fine tuning

    #compute cost for second phase of training, defined as negative log likelihood
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    
    #compute gradients w.r.t. model parameters
    #symbolic variable for no. of errors made on minibatch given by self.x and
    #self.y
    self.errors = self.logLayer.errors(self.y)
Example #15

    def __init__(self, n_ins=1024, hidden_layers_sizes=[500, 500], n_outs=10):

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        # NOTE: the original snippet used numpy_rng below without defining it;
        # create one here (the seed is an arbitrary choice)
        numpy_rng = numpy.random.RandomState(1234)

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer
            if i == 0:
                input_size = n_ins
                layer_input = self.x
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.sigmoid_layers[-1].output                

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
Example #16

 def change_lastlayer(self,n_ins, n_outs):
     self.logLayer_b = LogisticRegression(
         input=self.sigmoid_layers[-1].output,
         n_in=n_ins, n_out=n_outs
     )
     self.params_b.pop()
     self.params_b.pop()
     
     self.params_b.extend(self.logLayer_b.params)
     self.finetune_cost_b = self.logLayer_b.negative_log_likelihood(self.y)
     self.errors_b = self.logLayer_b.errors(self.y)
Example #17

 def __init__(self,rng, batch_size=100, input_size=None,
              nkerns=[4,4,4], receptive_fields=((2,8),(2,8),(2,8)), poolsizes=((1,8),(1,8),(1,4)),full_hidden=16, n_out=10):
     """
     
     """
     self.x = T.matrix(name='x',dtype=theano.config.floatX)   # the data is presented as rasterized images
     self.y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels
     self.batch_size = theano.shared(value=batch_size,name='batch_size')#T.lscalar('batch_size')
     
     self.layers=[]
     self.params=[]
     for i in range(len(nkerns)):
         receptive_field=receptive_fields[i]            
         if i==0:
             featmap_size_after_downsample=input_size
             layeri_input = self.x.reshape((batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1]))
             image_shape=(batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1])
             filter_shape=(nkerns[i], 1, receptive_field[0], receptive_field[1])
         else:
             layeri_input=self.layers[i-1].output
             image_shape=(batch_size, nkerns[i-1], featmap_size_after_downsample[0], featmap_size_after_downsample[1])
             filter_shape=(nkerns[i], nkerns[i-1], receptive_field[0], receptive_field[1])
             
         
         layeri = LeNetConvPoolLayer(rng=rng, input=layeri_input,
                                     image_shape=image_shape,
                                     filter_shape=filter_shape, poolsize=poolsizes[i])
         featmap_size_after_conv=get_featmap_size_after_conv(featmap_size_after_downsample,receptive_fields[i])
         featmap_size_after_downsample=get_featmap_size_after_downsample(featmap_size_after_conv,poolsizes[i])
         self.layers.append(layeri)
         self.params.extend(layeri.params)
     
     # fully connected layer
     print 'going to fully connected layer'
     layer_full_input = self.layers[-1].output.flatten(2)
     
     # construct a fully-connected sigmoidal layer
     layer_full = HiddenLayer(rng=rng, input=layer_full_input, 
                              n_in=nkerns[-1] * featmap_size_after_downsample[0] * featmap_size_after_downsample[1],
                              n_out=full_hidden, activation=T.tanh)
     self.layers.append(layer_full)
     self.params.extend(layer_full.params)
                      
     # classify the values of the fully-connected sigmoidal layer
     print 'going to output layer'
     self.logRegressionLayer = LogisticRegression(input=self.layers[-1].output, n_in=full_hidden, n_out=n_out)
     self.params.extend(self.logRegressionLayer.params)
     
     # the cost we minimize during training is the NLL of the model
     self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(self.y)
     self.cost = self.logRegressionLayer.negative_log_likelihood(self.y)
     self.errors = self.logRegressionLayer.errors(self.y)
     self.y_pred = self.logRegressionLayer.y_pred
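
The two shape helpers called in the constructor above are not shown in this snippet; plausible definitions, assuming the same 'valid' convolution and non-overlapping pooling arithmetic used elsewhere in these examples (these are assumptions, not the original implementations):

def get_featmap_size_after_conv(input_size, filter_size):
    # a 'valid' convolution shrinks each spatial dimension by filter_size - 1
    return [input_size[0] - filter_size[0] + 1,
            input_size[1] - filter_size[1] + 1]

def get_featmap_size_after_downsample(featmap_size, poolsize):
    # non-overlapping pooling divides each spatial dimension by the pool size
    return [featmap_size[0] // poolsize[0],
            featmap_size[1] // poolsize[1]]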
Example #18

def load_trained_model():
    global if_load_trained_model
    global train_model_route 
    global classifier
    global validate_model
    global validate_results

    if_load_trained_model = 1
    print "loading trained model for the first time"
    trained_model_pkl = open(train_model_route, 'rb')
    trained_model_state_list = cPickle.load(trained_model_pkl)
    trained_model_state_array = numpy.load(trained_model_pkl)
    classifier_state = trained_model_state_array[0]

    classifier = LogisticRegression(input=x, n_in=in_shape, n_out=layer0_output_shape
                                    , W=classifier_state[0], b=classifier_state[1])

    # definition for theano.function
    validate_model = theano.function(inputs=[x, y],
            outputs=classifier.errors(y))
    validate_results = theano.function(inputs=[x],
            outputs=classifier.y_pred)
Example #19

    def reconstruct_loglayer(self, n_outs = 10):
        """ Reconstruct a logistic layer on top of a previously trained SdA """
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.dA_layers[-1].output,
                         n_in=self.dA_layers[-1].n_hidden, n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)        
Example #20

    def __init__(self, rng, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will
        # translate into a TanhLayer connected to the LogisticRegression
        # layer; this can be replaced by a SigmoidalLayer, or a layer
        # implementing any other nonlinearity
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            n_in=n_in, n_out=n_hidden,
            activation=numpy.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            n_in=n_hidden,
            n_out=n_out)

        ## the parameters of the model are the parameters of the two layers it is
        ## made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
Example #21

def Buildnet(params, nkerns=[20, 50], batch_size=500):

    rng = numpy.random.RandomState(23455)

    datasets = load_data(0)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer is fully connected, so it operates on 2D matrices of
    # shape (batch_size, num_pixels), i.e. a matrix of rasterized images.
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4) = (500, 800)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    f = theano.function(
        inputs=[index],
        outputs=[layer2.output, layer3.y_pred, y],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #    numepoch = len(params)
    layer3.W.set_value(params[-1][0])
    layer3.b.set_value(params[-1][1])
    layer2.W.set_value(params[-1][2])
    layer2.b.set_value(params[-1][3])
    layer1.W.set_value(params[-1][4])
    layer1.b.set_value(params[-1][5])
    layer0.W.set_value(params[-1][6])
    layer0.b.set_value(params[-1][7])

    outputvectors = numpy.zeros((10000, 500))
    labels = numpy.zeros((10000, 1))
    reallabels = numpy.zeros((10000, 1))

    for minibatch_index in xrange(n_test_batches):

        vector, label, reallabel = f(minibatch_index)

        outputvectors[minibatch_index * batch_size:(minibatch_index + 1) *
                      batch_size] = vector
        labels[minibatch_index * batch_size:(minibatch_index + 1) * batch_size,
               0] = label
        reallabels[minibatch_index * batch_size:(minibatch_index + 1) *
                   batch_size, 0] = reallabel

    return [outputvectors, labels, reallabels]
Example #22

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(rng=rng,
                                       input=input,
                                       n_in=n_in,
                                       n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out, rng=rng)
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (abs(self.hiddenLayer.W).sum() +
                   abs(self.logRegressionLayer.W).sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = ((self.hiddenLayer.W**2).sum() +
                       (self.logRegressionLayer.W**2).sum())

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood)
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it is
        # made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params

        # keep track of model input
        self.input = input
Example #23

    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(rng=rng, input=input,
                                       n_in=n_in, n_out=n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = abs(self.hiddenLayer.W).sum() \
                + abs(self.logRegressionLayer.W).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
                    + (self.logRegressionLayer.W ** 2).sum()

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors
        self.pp_errors = self.logRegressionLayer.pp_errors
        # the parameters of the model are the parameters of the two layers it is
        # made out of
        
        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = self.logRegressionLayer.p_y_given_x
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        
        self.max_prob = self.p_y_given_x[T.arange(input.shape[0]),self.y_pred]
        
        self.classify = theano.function(inputs=[input], outputs=[self.y_pred, self.max_prob])

        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
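
A usage sketch for the classify function compiled in __init__ above; the network sizes and the dummy batch are illustrative assumptions:

x = T.matrix('x')
rng = numpy.random.RandomState(1234)
mlp = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10)

batch = numpy.zeros((20, 28 * 28), dtype=theano.config.floatX)
preds, confidences = mlp.classify(batch)   # predicted labels and their probabilities
print preds.shape, confidences.shape       # (20,) (20,)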
Example #24

def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, filter_size=[3,5], maxSentLen=100, hidden_size=[300,300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    # fix the random seed so the model generates reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id=load_il6_with_BBN(maxlen=maxSentLen)  # each kept example has at least one label and at least one word in the sentence
    reliefweb_all_sentences, reliefweb_all_masks, reliefweb_all_labels, word2id=load_reliefweb_as_multilabel_dataset(maxSentLen,word2id)
    reliefweb_all_sentences_train=np.asarray(reliefweb_all_sentences[0], dtype='int32')
    reliefweb_all_masks_train=np.asarray(reliefweb_all_masks[0], dtype=theano.config.floatX)
    reliefweb_all_labels_train=np.asarray(reliefweb_all_labels[0], dtype='int32')

    reliefweb_all_sentences_dev=np.asarray(reliefweb_all_sentences[1], dtype='int32')
    reliefweb_all_masks_dev=np.asarray(reliefweb_all_masks[1], dtype=theano.config.floatX)
    reliefweb_all_labels_dev=np.asarray(reliefweb_all_labels[1], dtype='int32')

    reliefweb_all_sentences_test=np.asarray(reliefweb_all_sentences[2], dtype='int32')
    reliefweb_all_masks_test=np.asarray(reliefweb_all_masks[2], dtype=theano.config.floatX)
    reliefweb_all_labels_test=np.asarray(reliefweb_all_labels[2], dtype='int32')

    train_sents=np.asarray(all_sentences[0], dtype='int32')
    train_masks=np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels=np.asarray(all_labels[0], dtype='int32')

    train_sents = np.concatenate((train_sents,reliefweb_all_sentences_train,reliefweb_all_sentences_dev,reliefweb_all_sentences_test),axis=0)
    train_masks = np.concatenate((train_masks,reliefweb_all_masks_train,reliefweb_all_masks_dev,reliefweb_all_masks_test),axis=0)
    train_labels = np.concatenate((train_labels,reliefweb_all_labels_train,reliefweb_all_labels_dev,reliefweb_all_labels_test),axis=0)
    train_size=len(train_labels)

    dev_sents=np.asarray(all_sentences[1], dtype='int32')
    dev_masks=np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels=np.asarray(all_labels[1], dtype='int32')
    dev_size=len(dev_labels)

    test_sents=np.asarray(all_sentences[2], dtype='int32')
    test_masks=np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size=len(test_labels)

    vocab_size=  len(word2id)+1 # add one zero pad index

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   # initialize the embedding matrix from a Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_fasttext_word2vec_given_file([emb_root+'IL6-cca-wiki-lorelei-d40.eng.vec',emb_root+'IL6-cca-wiki-lorelei-d40.IL6.vec'], 40)
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    #now, start to build the input form of the model
    sents_id_matrix=T.imatrix('sents_id_matrix')
    sents_mask=T.fmatrix('sents_mask')
    labels=T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input=embeddings[sents_id_matrix.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input*sents_mask.dimshuffle(0,'x',1),axis=2)
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')



    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    # multiply the mask into conv_out so that features at padding/UNK positions are zeroed
    sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2)    # multiply the mask into conv_out so that features at padding/UNK positions are zeroed
    sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1)
    LR_input_size = hidden_size[0]*2+emb_size
    # classification layer: maps the feature vector of size LR_input_size to the 12 target classes
    U_a = create_ensemble_para(rng, 12, LR_input_size) # the classifier weight matrix for LR_input_size features and 12 classes
    LR_b = theano.shared(value=np.zeros((12,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]
    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
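    # Note (added sketch): the masked log-loss above is the mean element-wise binary
    # cross-entropy over the 12 label columns; assuming Theano's built-in helper,
    # an equivalent formulation would be:
    #     loss = T.mean(T.nnet.binary_crossentropy(score_matrix, labels))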


    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [embeddings]+NN_para+LR_para   # put all model parameters together
    cost=loss+1e-4*((conv_W**2).sum()+(conv_W2**2).sum())
    updates =   Gradient_Cost_Para(cost,params, learning_rate)

    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]


    # max_acc_dev=0.0
    max_meanf1_test=0.0
    max_weightf1_test=0.0
    train_indices = range(train_size)
    cost_i=0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many minibatches have been processed so far across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]

            cost_i+= train_model(
                                train_sents[train_id_batch],
                                train_masks[train_id_batch],
                                train_labels[train_id_batch])

            # every 20 batches, evaluate the model on all test data
            if  iter%20==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()

                error_sum=0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start: # for each test batch
                    pred_labels=test_model(
                                test_sents[test_batch_id:test_batch_id+batch_size],
                                test_masks[test_batch_id:test_batch_id+batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id+batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)


                test_mean_f1, test_weight_f1 =average_f1_two_array_by_col(all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test=test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test=test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test


        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_meanf1_test, max_weightf1_test
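
# A minimal sketch (assumption) of what `average_f1_two_array_by_col` above is
# expected to compute: per-label F1 over the columns of the binary
# prediction/gold matrices, returned as an unweighted mean and a
# support-weighted mean. The helper below is illustrative only and is not
# taken from the original code.
import numpy as np

def per_column_f1_sketch(pred, gold):
    pred, gold = np.asarray(pred), np.asarray(gold)
    f1s, supports = [], []
    for col in range(gold.shape[1]):
        tp = np.sum((pred[:, col] == 1) & (gold[:, col] == 1))
        fp = np.sum((pred[:, col] == 1) & (gold[:, col] == 0))
        fn = np.sum((pred[:, col] == 0) & (gold[:, col] == 1))
        prec = tp / float(tp + fp) if tp + fp > 0 else 0.0
        rec = tp / float(tp + fn) if tp + fn > 0 else 0.0
        f1s.append(2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0)
        supports.append(float(gold[:, col].sum()))
    f1s, supports = np.asarray(f1s), np.asarray(supports)
    weighted = (f1s * supports).sum() / supports.sum() if supports.sum() > 0 else 0.0
    return f1s.mean(), weighted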
Beispiel #25
0
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    # parameterize training data set dimensions
    n_in = train_set_x.get_value(borrow=True).shape[1]
    n_out = max(train_set_y.eval()) - min(train_set_y.eval()) + 1  # number of classes, assuming contiguous integer labels
    # print n_in, n_out

    classifier = LogisticRegression(input=x, n_in=n_in, n_out=n_out)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    #### change learning rate ###
    delta = numpy.random.gamma(1)  # a Gamma(shape=1) draw, i.e. an Exponential(1) random scale
    learning_rate *= delta  # rescale the learning rate once by this random factor
    #### end ###

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.

    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                '''print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )'''

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    '''print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )'''
                    print test_score * 100.

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    '''print(
        (
            'Optimization complete with best validation score of %f %%,'
            'with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))'''
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
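
# A distilled sketch of the patience-based early stopping used above, in plain
# Python (no Theano); `val_losses` stands for the validation loss measured
# after each minibatch update. The helper name and calling convention are
# illustrative assumptions, not part of the tutorial code.
def patience_early_stopping_sketch(val_losses, patience=5000,
                                   patience_increase=2,
                                   improvement_threshold=0.995):
    best = float('inf')
    for it, loss in enumerate(val_losses):
        if loss < best:
            # a significant improvement buys more patience
            if loss < best * improvement_threshold:
                patience = max(patience, it * patience_increase)
            best = loss
        if patience <= it:   # patience exhausted: stop early
            break
    return best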
Beispiel #26
0
class DBN(object):
    """Deep Belief Network
    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """
    def __init__(self,
                 rng,
                 n_in=784,
                 n_hidden=[500, 500],
                 n_out=10,
                 lambda_reg=0.001,
                 alpha_reg=0.001):
        """This class is made to support a variable number of layers.
    
        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                   weights
    
        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_in: int
        :param n_in: dimension of the input to the DBN
    
        :type n_hidden: list of ints
        :param n_hidden: intermediate layers size, must contain
                               at least one value

        :type n_out: int
        :param n_out: dimension of the output of the network
       
        :type lambda_reg: float
        :param lambda_reg: parameter to control the overall strength of the regularization.
         The regularization term is lambda_reg*( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1 ).
         Thus, the larger lambda_reg is, the stronger the regularization.

        :type alpha_reg: float
        :param alpha_reg: parameter from the interval [0,1] to balance the l_1 (sparsity) and
         squared l_2 (smoothness) terms of the regularization above.
         Thus, the larger alpha_reg is, the sparser the weights are; the smaller it is, the smoother they are.
        """

        self.hidden_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(n_hidden)

        assert self.n_layers > 0

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data, each row is a sample
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_in
            else:
                input_size = n_hidden[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.hidden_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=n_hidden[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.hidden_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN; the visible
            # biases in the RBMs are parameters of those RBMs, but not
            # of the DBN
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=rng,
                            theano_rng=None,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=n_hidden[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if self.n_layers > 0:
            self.logRegressionLayer = LogisticRegression(
                input=self.hidden_layers[-1].output,
                n_in=n_hidden[-1],
                n_out=n_out)
        else:
            self.logRegressionLayer = LogisticRegression(input=self.x,
                                                         n_in=input_size,
                                                         n_out=n_out)

        self.params.extend(self.logRegressionLayer.params)

        # regularization
        L1s = []
        L2_sqrs = []
        for i in range(self.n_layers):
            L1s.append(abs(self.hidden_layers[i].W).sum())
            L2_sqrs.append((self.hidden_layers[i].W**2).sum())
        L1s.append(abs(self.logRegressionLayer.W).sum())
        L2_sqrs.append((self.logRegressionLayer.W**2).sum())
        self.L1 = T.sum(L1s)
        self.L2_sqr = T.sum(L2_sqrs)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost = self.negative_log_likelihood + \
            lambda_reg * ((1.0 - alpha_reg) * 0.5 * self.L2_sqr + alpha_reg * self.L1)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred

    def pretraining_functions(self, train_set_x, batch_size, persistent_k=15):
        '''
        Build the symbolic pretraining functions to update the parameters in one iteration.
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('learning_rate')  # learning rate to use
        # number of batches
        #n_batches = int(math.ceil(train_set_x.get_value(borrow=True).shape[0] / batch_size))
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm_layer in self.rbm_layers:
            # get the cost and the updates list
            cost, updates = rbm_layer.get_cost_updates(learning_rate,
                                                       persistent=None,
                                                       k=persistent_k)
            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, train_set_x, train_set_y, valid_set_x,
                                 valid_set_y, batch_size,
                                 learning_rate_shared):
        '''
        Build the symbolic finetuning functions to update the parameters in one iteration. 
        Validation function is also defined.
        '''
        # compute number of minibatches for training, validation and testing
        n_valid_batches = int(
            math.ceil(
                valid_set_x.get_value(borrow=True).shape[0] / batch_size))

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.cost, self.params)
        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate_shared))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='train')

        #        test_score_i = theano.function([index], self.errors,
        #                 givens={
        #                   self.x: test_set_x[index * batch_size:
        #                                      (index + 1) * batch_size],
        #                   self.y: test_set_y[index * batch_size:
        #                                      (index + 1) * batch_size]},
        #                      name='test')

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='valid')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set


#        def test_score():
#            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score

    def build_test_function(self, test_set_x, batch_size):
        """
        Build the symbolic test function.
        """
        n_test_batches = int(
            math.ceil(test_set_x.get_value(borrow=True).shape[0] / batch_size))
        index = T.lscalar('index')  # index to a [mini]batch
        test_score_i = theano.function(
            [index],
            self.y_pred,
            givens={
                self.x: test_set_x[index * batch_size:(index + 1) * batch_size]
            },
            name='test')

        # Create a function that scans the entire test set
        def test_score():
            y_pred = []
            for i in xrange(n_test_batches):
                y_pred.extend(test_score_i(i))
            return y_pred

        return test_score

    def get_params(self):
        return copy.deepcopy(self.params)

    def set_params(self, given_params):
        self.params = given_params

    def print_params(self):
        for param in self.params:
            print param.get_value(borrow=True)

    def save_params(self, filename):
        with open(filename, 'wb') as f:  # overwrite any existing file
            for param in self.params:
                pickle.dump(param.get_value(borrow=True), f)
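
# A small numpy sketch (separate from the class above) of the regularization
# term described in the DBN docstring:
#   lambda_reg * ( (1 - alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1 )
# summed over all weight matrices. `weight_list` is a hypothetical list of
# numpy arrays standing in for the Theano shared weights.
import numpy as np

def elastic_net_penalty(weight_list, lambda_reg=0.001, alpha_reg=0.001):
    l1 = sum(np.abs(W).sum() for W in weight_list)
    l2_sqr = sum((W ** 2).sum() for W in weight_list)
    return lambda_reg * ((1.0 - alpha_reg) * 0.5 * l2_sqr + alpha_reg * l1)

# e.g. elastic_net_penalty([np.ones((3, 2)), np.ones((2, 4))]) penalizes 14
# weights of magnitude 1: 0.001 * (0.4995 * 14 + 0.001 * 14) ~= 0.007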
Beispiel #27
0
    def __init__(self, ds=None, nkerns=[32, 48], batch_size=100, normalized_width=0, distortion=0,
                    params=[None, None,None, None,None, None,None, None]):

        #layers for train (matching how the params are used below):
        #layer0_W, layer0_b on L1-Convolutional Layer (nkerns[0] maps of 26x26, pooled to 13x13)
        #layer1_W, layer1_b on L2-Convolutional Layer (nkerns[1] maps of 9x9, pooled to 3x3)
        #layer2_W, layer2_b on HiddenLayer - fully-connected
        #layer3_W, layer3_b on LogisticRegression - output layer

        layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = params        
        
        rng = numpy.random.RandomState(23455)

        #dataset by param - the load data was executed before on function call
        train_set_x, train_set_y = ds[0]
        valid_set_x, valid_set_y = ds[1]
        test_set_x, test_set_y   = ds[2]

        # compute number of minibatches for training, validation and testing
        self.n_train_batches  = train_set_x.get_value(borrow=True).shape[0]
        self.n_valid_batches  = valid_set_x.get_value(borrow=True).shape[0]
        self.n_test_batches   = test_set_x.get_value(borrow=True).shape[0]
        
        self.n_train_batches /= batch_size
        self.n_valid_batches /= batch_size
        self.n_test_batches  /= batch_size


        index = T.lscalar()
        learning_rate = T.fscalar()

        # start-snippet-1
        x = T.matrix('x')   # the data is presented as rasterized images
        y = T.ivector('y')  # the labels are presented as 1D vector of
                            # [int] labels

        ######################
        # BUILD ACTUAL MODEL #
        ######################

        print '... building the dnn column'

        layer0_input = x.reshape((batch_size, 1, 29, 29))

        # Construct the first convolutional pooling layer
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(batch_size, 1, 29, 29),
            filter_shape=(nkerns[0], 1, 4, 4),
            poolsize=(2, 2),
            W=layer0_W,
            b=layer0_b
        )

        # Construct the second convolutional pooling layer
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(batch_size, nkerns[0], 13, 13),
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            poolsize=(3, 3),
            W=layer1_W,
            b=layer1_b
        )

        layer2_input = layer1.output.flatten(2)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * 3 * 3,
            n_out=150,
            W=layer2_W,
            b=layer2_b,
            activation=T.tanh
        )
        
        # construct the output layer
        # classification of values from fully-connected sigmoidal layer
        layer3 = LogisticRegression(
            input=layer2.output, 
            n_in=150, 
            n_out=10,
            W=layer3_W,
            b=layer3_b
        )

        cost = layer3.negative_log_likelihood(y)

        # compute the mistakes that are made by the model
        self.test_model = theano.function(
            [index],
            layer3.errors(y),
            givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        # compute probabilities of all output classes - on validation set
        self.valid_output_batch = theano.function(
            [index],
            layer3.p_y_given_x,
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size]
            }
        )

        # compute probabilities of all output classes
        self.test_output_batch = theano.function(
            [index],
            layer3.p_y_given_x,
            givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size]
            }
        )

        # compute the mistakes on validate set that are made by model
        self.validate_model = theano.function(
            [index],
            layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        self.params = layer3.params + layer2.params + layer1.params + layer0.params

        # save the params to use for reference on test set
        # those will be use on test_mcdnn
        self.column_params = [nkerns, batch_size, normalized_width, distortion]

        grads  = T.grad(cost, self.params)

        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(self.params, grads)
        ]

        # train the model
        self.train_model = theano.function(
            [index, learning_rate],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )
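
# A tiny helper sketch (not in the original example) for the shape bookkeeping
# in the comments above: the output side length of a 'valid' convolution
# followed by non-overlapping max pooling with the border ignored.
def conv_pool_output_size(input_size, filter_size, pool_size):
    conv_out = input_size - filter_size + 1   # 'valid' convolution
    return conv_out // pool_size              # non-overlapping pooling

# for this column: 29 -> conv 4x4 -> 26 -> pool 2 -> 13,
# then 13 -> conv 5x5 -> 9 -> pool 3 -> 3, hence n_in = nkerns[1] * 3 * 3
assert conv_pool_output_size(29, 4, 2) == 13
assert conv_pool_output_size(13, 5, 3) == 3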
Beispiel #28
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well) During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset="mnist.pkl.gz", nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(4676148)

    ##########################
    # OUR DATA LOADING
    ##########################
    datasets = load_data(dataset)  # assumed: the tutorial's load_data helper returning (train, valid, test)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    width = 300
    height = 200

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print "... building the model"

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, width, height))  # CHANGED

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)

    # so then for us:
    # width-filter+1 and height-filter+1: 300-5+1 = 296, 200-5+1 = 196
    # and then / poolsize of 4: (296/4, 196/4) = (74, 49)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, width, height),  # CHANGED
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(4, 4),  # CHANGED
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)

    # math for us:
    # the 6x6 filters give 74-6+1 = 69 and 49-6+1 = 44,
    # and the (4, 4) pooling then yields the feature-map size that
    # n_in of the hidden layer below must match
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 74, 49),  # CHANGED
        filter_shape=(nkerns[1], nkerns[0], 6, 6),  # CHANGED +1
        poolsize=(4, 4),  # CHANGED
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng, input=layer2_input, n_in=nkerns[1] * 16 * 11, n_out=500, activation=T.tanh  # CHANGED  # CHANGE?
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=4544)  # CHANGED

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print "... training"
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print "training @ iter = ", iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print (
                    "epoch %i, minibatch %i/%i, validation error %f %%"
                    % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0)
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print (
                        ("     epoch %i, minibatch %i/%i, test error of " "best model %f %%")
                        % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.0)
                    )

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print ("Optimization complete.")
    print (
        "Best validation score of %f %% obtained at iteration %i, "
        "with test performance %f %%" % (best_validation_loss * 100.0, best_iter + 1, test_score * 100.0)
    )
    print >> sys.stderr, (
        "The code for file " + os.path.split(__file__)[1] + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    )

class DBN(object):
    """Deep Belief Network

    ADAPTED FROM BELOW BY RYAN SCHORK

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """
    def __init__(self,
                 numpy_rng=None,
                 theano_rng=None,
                 hidden_layers_sizes=[10],
                 finetune_lr=1,
                 pretraining_epochs=30,
                 pretrain_lr=.1,
                 k=1,
                 training_epochs=500,
                 pickle_dataset='numerai.pkl.gz',
                 batch_size=100,
                 L1=0.0000,
                 L2=0.0000,
                 activation=T.nnet.sigmoid,
                 patience=100,
                 patience_increase=20,
                 rand_seed=8675309,
                 verbose=False,
                 enforce_train_supremacy=True):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.rand_seed = rand_seed
        if not numpy_rng:
            self.numpy_rng = numpy.random.RandomState(self.rand_seed)
        if not theano_rng:
            self.theano_rng = MRG_RandomStreams(self.rand_seed)
        self.hidden_layers_sizes = hidden_layers_sizes
        self.finetune_lr = finetune_lr
        self.pretraining_epochs = pretraining_epochs
        self.pretrain_lr = pretrain_lr
        self.k = k
        self.training_epochs = training_epochs
        self.pickle_dataset = pickle_dataset
        self.batch_size = batch_size
        self.L1 = L1
        self.L2 = L2
        self.activation = activation
        self.patience = patience
        self.patience_increase = patience_increase
        self.verbose = verbose
        self.enforce_train_supremacy = enforce_train_supremacy

        self.train_set_x, self.train_set_y, self.valid_set_x, self.valid_set_y, self.test_set_x, self.test_set_y, self.score_set_x = self.load_dataset(
            self.pickle_dataset)

        assert self.n_layers > 0

        self.n_outs = int(max(self.train_set_y.eval()) + 1)
        self.n_ins = int(self.train_set_x.get_value(borrow=True).shape[1])

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
        # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well) During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = self.n_ins
            else:
                input_size = self.hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=self.numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=self.hidden_layers_sizes[i],
                                        activation=self.activation)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=self.numpy_rng,
                            theano_rng=self.theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=self.hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=self.hidden_layers_sizes[-1],
            n_out=self.n_outs)

        self.params.extend(self.logLayer.params)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small

        L1_cost = 0
        L2_cost = 0

        for layer in self.sigmoid_layers:
            L1_cost += abs(layer.W).sum()
            L2_cost += (layer.W ** 2).sum()   # the square is already non-negative

        L1_cost += abs(self.logLayer.W).sum()
        L2_cost += (self.logLayer.W ** 2).sum()

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(
            self.y) + (self.L1 * L1_cost) + (self.L2 * L2_cost)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def load_dataset(self, dataset):
        # Load the dataset
        import gzip
        import cPickle
        f = gzip.open(dataset, 'rb')
        train_set, valid_set, test_set, score_set = cPickle.load(f)
        f.close()

        #train_set, valid_set, test_set format: tuple(input, target)
        #input is a numpy.ndarray of 2 dimensions (a matrix)
        #whose rows each correspond to an example. target is a
        #numpy.ndarray of 1 dimension (a vector) that has the same length as
        #the number of rows in the input. It gives the target
        #for the example with the same index in the input.

        def shared_dataset(data_xy, borrow=True):
            """ Function that loads the dataset into shared variables

            The reason we store our dataset in shared variables is to allow
            Theano to copy it into the GPU memory (when code is run on GPU).
            Since copying data into the GPU is slow, copying a minibatch every
            time it is needed (the default behaviour if the data is not in a shared
            variable) would lead to a large decrease in performance.
            """
            data_x, data_y = data_xy
            shared_x = theano.shared(numpy.asarray(data_x,
                                                   dtype=theano.config.floatX),
                                     borrow=borrow)
            shared_y = theano.shared(numpy.asarray(data_y,
                                                   dtype=theano.config.floatX),
                                     borrow=borrow)
            # When storing data on the GPU it has to be stored as floats
            # therefore we will store the labels as ``floatX`` as well
            # (``shared_y`` does exactly that). But during our computations
            # we need them as ints (we use labels as index, and if they are
            # floats it doesn't make sense) therefore instead of returning
            # ``shared_y`` we will have to cast it to int. This little hack
            # lets us get around this issue
            return shared_x, T.cast(shared_y, 'int32')

        test_set_x, test_set_y = shared_dataset(test_set)
        valid_set_x, valid_set_y = shared_dataset(valid_set)
        train_set_x, train_set_y = shared_dataset(train_set)

        score_set_x = theano.shared(numpy.asarray(score_set,
                                                  dtype=theano.config.floatX),
                                    borrow=True)

        return [
            train_set_x, train_set_y, valid_set_x, valid_set_y, test_set_x,
            test_set_y, score_set_x
        ]

    def pretraining_functions(self):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = self.train_set_x.get_value(
            borrow=True).shape[0] / self.batch_size
        # beginning of a batch, given `index`
        batch_begin = index * self.batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + self.batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None,
                                                 k=self.k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: self.train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        '''

        # compute number of minibatches for training, validation and testing
        n_valid_batches = self.valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= self.batch_size
        n_test_batches = self.test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size
        num_train_batches = self.train_set_x.get_value(borrow=True).shape[0]
        num_train_batches /= self.batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * self.finetune_lr))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                self.train_set_x[index * self.batch_size:(index + 1) *
                                 self.batch_size],
                self.y:
                self.train_set_y[index * self.batch_size:(index + 1) *
                                 self.batch_size]
            })

        train_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                self.train_set_x[index * self.batch_size:(index + 1) *
                                 self.batch_size],
                self.y:
                self.train_set_y[index * self.batch_size:(index + 1) *
                                 self.batch_size]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                self.test_set_x[index * self.batch_size:(index + 1) *
                                self.batch_size],
                self.y:
                self.test_set_y[index * self.batch_size:(index + 1) *
                                self.batch_size]
            })

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                self.valid_set_x[index * self.batch_size:(index + 1) *
                                 self.batch_size],
                self.y:
                self.valid_set_y[index * self.batch_size:(index + 1) *
                                 self.batch_size]
            })

        score_score_raw = theano.function([],
                                          self.logLayer.p_y_given_x,
                                          givens={self.x: self.score_set_x})

        score_score_outcome = theano.function(
            [], self.logLayer.y_pred, givens={self.x: self.score_set_x})

        score_train_raw = theano.function([],
                                          self.logLayer.p_y_given_x,
                                          givens={self.x: self.train_set_x})

        score_train_outcome = theano.function(
            [], self.logLayer.y_pred, givens={self.x: self.train_set_x})

        # Create a function that scans the entire training set
        def train_score():
            return [train_score_i(i) for i in xrange(num_train_batches)]

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        def make_predictions():
            return score_score_raw(), score_score_outcome(), score_train_raw(
            ), score_train_outcome()

        return train_fn, train_score, valid_score, test_score, make_predictions
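    # Note on the returned callables: `train_fn(i)` performs one SGD step on
    # minibatch i; `train_score`, `valid_score` and `test_score` scan their
    # whole split and return a list of per-batch errors; `make_predictions`
    # returns raw probabilities and predicted classes for the score and
    # train sets (see train() below).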

    def save_predictions_raw(self):

        if not isinstance(self.raw_predictions, numpy.ndarray):
            print 'Train the model before saving predictions!!'
        else:
            out_file = ('DBN_' + self.pickle_dataset.strip('.gz').strip('.pkl')
                        + '_' + str(self.rand_seed) + '_' + 'pred_raw.csv')
            print 'saving file ' + out_file
            numpy.savetxt(out_file, self.raw_predictions, delimiter=",")

    def save_predictions_outcome(self):

        if not isinstance(self.outcome_predictions, numpy.ndarray):
            print 'Train the model before saving predictions!!'
        else:
            out_file = ('DBN_' + self.pickle_dataset.strip('.gz').strip('.pkl')
                        + '_' + str(self.rand_seed) + '_' + 'pred_outcome.csv')
            print 'saving file ' + out_file
            numpy.savetxt(out_file, self.outcome_predictions, delimiter=",")

    def train(self):
        """
        Demonstrates how to train and test a Deep Belief Network.

        This is demonstrated on MNIST.

        The hyperparameters (finetune learning rate, number of pretraining
        epochs, pretraining learning rate, number of Gibbs steps k for
        CD/PCD, maximal number of training epochs, path to the pickled
        dataset and minibatch size) are taken from the instance attributes
        set at construction time rather than passed as arguments.
        """

        # compute number of minibatches for training, validation and testing
        n_train_batches = self.train_set_x.get_value(
            borrow=True).shape[0] / self.batch_size

        print '... building the model'

        # start-snippet-2
        #########################
        # PRETRAINING THE MODEL #
        #########################
        print '... getting the pretraining functions'
        pretraining_fns = self.pretraining_functions()

        print '... pre-training the model'
        start_time = timeit.default_timer()
        ## Pre-train layer-wise
        for i in xrange(self.n_layers):
            # go through pretraining epochs
            for epoch in xrange(self.pretraining_epochs):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                lr=self.pretrain_lr))
                if self.verbose:
                    print 'Pre-training layer %i, epoch %d, cost ' % (i,
                                                                      epoch),
                    print numpy.mean(c)

        end_time = timeit.default_timer()
        # end-snippet-2
        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.2fm' %
                              ((end_time - start_time) / 60.))
        ########################
        # FINETUNING THE MODEL #
        ########################

        # get the training, validation and testing function for the model
        print '... getting the finetuning functions'
        train_fn, train_model, validate_model, test_model, score_model = self.build_finetune_functions(
        )

        print '... finetuning the model'
        # early-stopping parameters
        patience = self.patience * n_train_batches  # look at this many examples regardless
        patience_increase = float(
            self.patience_increase)  # wait this much longer when a new best is
        # found
        improvement_threshold = 0.9995  # a relative improvement of this much is
        # considered significant
        validation_frequency = min(n_train_batches, patience / 2)
        # go through this many
        # minibatches before checking the network
        # on the validation set; in this case we
        # check every epoch

        best_validation_loss = numpy.inf
        # initialise the "best so far" bookkeeping so the summary printed at
        # the end is well defined even if no improvement is ever accepted
        corr_training_loss = corr_testing_loss = numpy.inf
        best_epoch = best_iter = 0
        start_time = timeit.default_timer()

        done_looping = False
        epoch = 0
        self.raw_predictions = None
        self.outcome_predictions = None
        self.raw_predictions_train = None
        self.outcome_predictions_train = None

        while (epoch < self.training_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in xrange(n_train_batches):

                minibatch_avg_cost = train_fn(minibatch_index)
                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:

                    train_losses = train_model()
                    this_training_loss = numpy.mean(train_losses)

                    test_losses = test_model()
                    this_testing_loss = numpy.mean(test_losses)

                    validation_losses = validate_model()
                    this_validation_loss = numpy.mean(validation_losses)

                    if self.verbose:

                        print(
                            'epoch {}, minibatch {}/{}, training error {:.2f} %, validation error {:.2f} %, testing error {:.2f} %'
                            .format(epoch, minibatch_index + 1,
                                    n_train_batches, this_training_loss * 100.,
                                    this_validation_loss * 100.,
                                    this_testing_loss * 100.))

                    # if we got the best validation score until now

                    train_gate = True
                    valid_gate = (
                        best_validation_loss - this_validation_loss >
                        0.01) or ((abs(this_validation_loss -
                                       best_validation_loss) < .0000001) and
                                  (this_training_loss < corr_training_loss))
                    if self.enforce_train_supremacy:
                        train_gate = this_training_loss < this_validation_loss

                    if valid_gate and train_gate:
                        #improve patience if loss improvement is good enough
                        if (this_validation_loss < best_validation_loss):
                            patience = max(patience, iter * patience_increase)

                        # save best validation score and iteration number
                        corr_testing_loss = this_testing_loss
                        best_validation_loss = this_validation_loss
                        corr_training_loss = this_training_loss
                        best_epoch = epoch
                        self.raw_predictions, self.outcome_predictions, self.raw_predictions_train, self.outcome_predictions_train = score_model(
                        )
                        best_iter = iter

                        # for element in dbn.params:
                        #     print element.get_value()
                        print(('     epoch %i, minibatch %i/%i, test error of '
                               'best model %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               corr_testing_loss * 100.))

                if patience <= iter:
                    done_looping = True
                    break

        end_time = timeit.default_timer()
        print(('Optimization complete with best validation score of %.2f %%, '
               'obtained at epoch %i iteration %i, '
               'with test performance %.2f %% '
               'with training performance %.2f %% ') %
              (best_validation_loss * 100., best_epoch, best_iter + 1,
               corr_testing_loss * 100., corr_training_loss * 100.))
        print >> sys.stderr, ('The fine tuning code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.2fm' %
                              ((end_time - start_time) / 60.))
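
# A minimal, self-contained sketch (not part of the example above) of the
# patience-based early-stopping pattern that train() adapts: patience is
# extended whenever the validation loss improves, and the loop stops once the
# iteration counter exceeds it. The function name and the toy loss sequence
# below are hypothetical.
import numpy


def early_stopping_sketch(validation_losses, patience=4, patience_increase=2.0):
    best_loss, best_iter = numpy.inf, -1
    for it, loss in enumerate(validation_losses):
        if loss < best_loss:
            # allow at least `patience_increase` times the iterations seen so far
            patience = max(patience, it * patience_increase)
            best_loss, best_iter = loss, it
        if patience <= it:
            break
    return best_loss, best_iter

# early_stopping_sketch([0.9, 0.7, 0.6, 0.61, 0.62, 0.63, 0.64]) -> (0.6, 2)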
Beispiel #32
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels
        # end-snippet-1

        # The SdA is an MLP for which all weights of the intermediate layers
        # are shared with different denoising autoencoders.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        # start-snippet-2
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are only going to declare the parameters of the
            # sigmoid_layers to be parameters of the StackedDAA;
            # the visible biases in the dA are parameters of those
            # dA, but not of the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        # end-snippet-2
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
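
# A minimal sketch (hypothetical, not code from the example above) of why
# passing W=sigmoid_layer.W and bhid=sigmoid_layer.b ties the weights: both the
# MLP layer and the dA hold the *same* Theano shared variable, so a pretraining
# update made through one is immediately visible through the other.
import numpy
import theano

W = theano.shared(numpy.zeros((3, 2), dtype=theano.config.floatX), name='W')
mlp_view, dA_view = W, W                      # two "layers", one storage
new_W = (dA_view.get_value() + 1.0).astype(theano.config.floatX)
dA_view.set_value(new_W)                      # a "pretraining" update
assert (mlp_view.get_value() == 1.0).all()    # the MLP sees the new weights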
Beispiel #33
0
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is dealt with as a normal MLP,
    the dAs are only used to initialize the weights.
    """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels
        # end-snippet-1

        # The SdA is an MLP for which all weights of the intermediate layers
        # are shared with different denoising autoencoders.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        # start-snippet-2
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are only going to declare the parameters of the
            # sigmoid_layers to be parameters of the StackedDAA;
            # the visible biases in the dA are parameters of those
            # dA, but not of the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        # end-snippet-2
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in training the dA corresponding to the layer with the same
        index. The function will require as input the minibatch index, and
        to train a dA you just need to iterate, calling the corresponding
        function on all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(
                inputs=[
                    index,
                    theano.Param(corruption_level, default=0.2),
                    theano.Param(learning_rate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
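    # Usage sketch: thanks to the Param defaults above, both extra inputs are
    # optional keyword arguments, e.g.
    #   pretrain_fns[i](index=batch_index, corruption=corruption_levels[i], lr=pretrain_lr)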

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='train')

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='test')

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='valid')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """
        This class supports a variable number of layers.

        numpy_rng: numpy.random.RandomState, random number generator used to
                   initialize the weights
        theano_rng: theano.tensor.shared_randomstreams.RandomStreams, Theano
                    random generator; if None (the default), one is generated
                    from a seed drawn from `numpy_rng`
        n_ins: int, dimension of the input to the SdA
        hidden_layers_sizes: list of ints, sizes of the intermediate layers,
                             must contain at least one element
        n_outs: int, dimension of the output of the network
        corruption_levels: list of float, corruption level for each layer
        """

        self.sigmoid_layers=[]
        self.dA_layers=[]
        self.params=[]
        self.n_layers=len(hidden_layers_sizes)

        assert self.n_layers > 0  # require at least one hidden layer

        if not theano_rng:
            theano_rng=RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')   # the data, presented as rasterized images
        self.y = T.ivector('y')  # the labels, presented as a 1D vector of [int] labels

        # The SdA is an MLP whose intermediate-layer weights are shared with
        # denoising autoencoders. We first construct the SdA as a deep MLP;
        # while constructing each sigmoid layer we also construct a denoising
        # autoencoder that shares weights with it. Pretraining trains these
        # autoencoders (which also changes the MLP weights); finetuning then
        # finishes training the SdA with stochastic gradient descent on the MLP.

        # construct the sigmoid layers
        for i in xrange(self.n_layers):
            # the input size is the number of hidden units of the layer below,
            # or the network input size if this is the first layer
            if i==0:
                input_size=n_ins
            else:
                input_size=hidden_layers_sizes[i-1]

            # the input to this layer is the activation of the hidden layer
            # below, or the input of the SdA if this is the first layer
            if i==0:
                layer_input=self.x
            else:
                layer_input=self.sigmoid_layers[-1].output

            # define the sigmoid layer
            sigmoid_layer=HiddenLayer(rng=numpy_rng,
                                      input=layer_input,
                                      n_in=input_size,
                                      n_out=hidden_layers_sizes[i],
                                      activation=T.nnet.sigmoid)
            # add the sigmoid layer to the list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we only declare the parameters of the sigmoid_layers to be
            # parameters of the SdA; the visible biases of the dA belong to
            # the dA, not to the SdA
            self.params.extend(sigmoid_layer.params)

            # construct a denoising autoencoder that shares weights with this layer
            dA_layer=dA(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        bhid=sigmoid_layer.b
                        )
            self.dA_layers.append(dA_layer)

        # add a logistic regression layer on top of the MLP
        self.logLayer=LogisticRegression(
                        input=self.sigmoid_layers[-1].output,
                        n_in=hidden_layers_sizes[-1],n_out=n_outs)
        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetuning:
        # the cost for the second phase of training is the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable for the number of errors made on the minibatch
        # given by self.x and self.y
        self.errors=self.logLayer.errors(self.y)
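
# A minimal, self-contained sketch (an assumption, not code from the example
# above) of the "corruption" idea behind corruption_levels: a denoising
# autoencoder randomly zeroes a fraction of its inputs and is trained to
# reconstruct the clean version. The helper name below is hypothetical.
import numpy


def corrupt(x, corruption_level, rng=numpy.random):
    # keep each entry with probability 1 - corruption_level, zero it otherwise
    mask = rng.binomial(n=1, p=1.0 - corruption_level, size=x.shape)
    return x * mask
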
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed; makes the model generate reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    dev_sents, dev_sent_masks, dev_sent_labels, dev_claims, dev_claim_mask, dev_sent_names, dev_ground_names, dev_labels, word2id = load_fever_test(
        sent_len, claim_len, cand_size, word2id)
    dev_3th_sents, dev_3th_sent_masks, dev_3th_sent_labels, dev_3th_claims, dev_3th_claim_mask, dev_3th_labels, word2id = load_fever_test_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))
    dev_sents = np.asarray(dev_sents, dtype='int32')
    dev_3th_sents = np.asarray(dev_3th_sents, dtype='int32')
    joint_dev_sents = np.concatenate((dev_sents, dev_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))
    dev_sent_masks = np.asarray(dev_sent_masks, dtype=theano.config.floatX)
    dev_3th_sent_masks = np.asarray(dev_3th_sent_masks,
                                    dtype=theano.config.floatX)
    joint_dev_sent_masks = np.concatenate((dev_sent_masks, dev_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))
    dev_sent_labels = np.asarray(dev_sent_labels, dtype='int32')
    dev_3th_sent_labels = np.asarray(dev_3th_sent_labels, dtype='int32')
    joint_dev_sent_labels = np.concatenate(
        (dev_sent_labels, dev_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))
    dev_claims = np.asarray(dev_claims, dtype='int32')
    dev_3th_claims = np.asarray(dev_3th_claims, dtype='int32')
    joint_dev_claims = np.concatenate((dev_claims, dev_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))
    dev_claim_mask = np.asarray(dev_claim_mask, dtype=theano.config.floatX)
    dev_3th_claim_mask = np.asarray(dev_3th_claim_mask,
                                    dtype=theano.config.floatX)
    joint_dev_claim_mask = np.concatenate((dev_claim_mask, dev_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))
    dev_labels = np.asarray(dev_labels, dtype='int32')
    dev_3th_labels = np.asarray(dev_3th_labels, dtype='int32')
    joint_dev_labels = np.concatenate((dev_labels, dev_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    joint_dev_size = len(joint_dev_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    dev_size = len(dev_claims)
    test_3th_size = len(test_3th_claims)
    dev_3th_size = len(dev_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_dev size: ', joint_test_size, ' joint_test size: ', joint_dev_size
    print 'train size: ', train_size, ' dev size: ', test_size, ' test size: ', dev_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # the input format can be adapted into CNN or GRU or LSTM
    embed_input_sents = init_embeddings[sents_ids.flatten()].reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with the conv output to zero out features at UNK/padded positions
    sent_embeddings = conv_model_sents.maxpool_vec  # (batch_size*cand_size, hidden_size): each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with the conv output to zero out features at UNK/padded positions
    claim_embeddings = conv_model_claims.maxpool_vec  # (batch_size, hidden_size): each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  # one relevance score per candidate sentence; reshaped to (batch_size, cand_size) below
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
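    # soft (differentiable) F1 over the candidate sentences of each example:
    #   overlap   = predicted mass on the gold sentences
    #   recall    = overlap / number of gold sentences
    #   precision = overlap / total predicted mass
    #   f1        = 2 * precision * recall / (precision + recall);  loss = -mean(log f1)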
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    # the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten()].reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    joint_embed_input_claim = init_embeddings[joint_claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with the conv output to zero out features at UNK/padded positions
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  # (batch_size*cand_size, hidden_size): each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with the conv output to zero out features at UNK/padded positions
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  # (batch_size, hidden_size): each claim then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  # for classification tasks we usually use negative log likelihood as the loss; the lower the better
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')
    dev_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                allow_input_downcast=True,
                                on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_dev_3th_batches = dev_3th_size / batch_size
    dev_3th_batch_start = list(np.arange(n_dev_3th_batches) *
                               batch_size) + [dev_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_test_acc = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  #shuffle the training set for each new epoch; this usually helps performance, but is not guaranteed to
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across all epochs
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                ideally we should use the full ranking instead of a binary decision;
                                the threshold "triple[0] > 0.5" controls recall and therefore the strict accuracy
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)
                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                if f1 > max_test_f1 or strict_score > max_test_acc:
                    if f1 > max_test_f1:
                        max_test_f1 = f1
                    if strict_score > max_test_acc:
                        max_test_acc = strict_score
                    # a new best on test; now evaluate on the dev set
                    print '....................\n'
                    f1_sum = 0.0
                    error_sum = 0.0
                    full_evi = 0
                    predictions = []
                    for dev_batch_id in dev_batch_start:  # for each dev batch
                        batch_prob, error_i, pred_i = dev_model(
                            dev_sents[dev_batch_id:dev_batch_id + batch_size],
                            dev_sent_masks[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_sent_labels[dev_batch_id:dev_batch_id +
                                            batch_size],
                            dev_claims[dev_batch_id:dev_batch_id + batch_size],
                            dev_claim_mask[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_labels[dev_batch_id:dev_batch_id + batch_size])
                        error_sum += error_i
                        batch_sent_labels = dev_sent_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_sent_names = dev_sent_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_names = dev_ground_names[
                            dev_batch_id:dev_batch_id + batch_size]
                        batch_ground_labels = dev_labels[
                            dev_batch_id:dev_batch_id + batch_size]
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(
                                batch_ground_labels[i])
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            pred_sent_names = []
                            gold_sent_names = batch_ground_names[i]
                            zipped = [(batch_prob[i,
                                                  k], batch_sent_labels[i][k],
                                       batch_sent_names[i][k])
                                      for k in range(cand_size)]
                            sorted_zip = sorted(zipped,
                                                key=lambda x: x[0],
                                                reverse=True)
                            for j in range(cand_size):
                                triple = sorted_zip[j]
                                if triple[1] == 1.0:
                                    '''
                                    ideally we should use the full ranking instead of a binary decision;
                                    the threshold "triple[0] > 0.5" controls recall and therefore the strict accuracy
                                    '''
                                    if triple[0] > 0.5:
                                        # pred_sent_names.append(batch_sent_names[i][j])
                                        pred_sent_names.append(triple[2])
                                    # if len(pred_sent_names) == max_pred_pick:
                                    #     break
                            instance_i['predicted_evidence'] = pred_sent_names
                            # print 'pred_sent_names:',pred_sent_names
                            # print 'gold_sent_names:',gold_sent_names
                            new_gold_names = []
                            for gold_name in gold_sent_names:
                                new_gold_names.append([None, None] + gold_name)
                            instance_i['evidence'] = [new_gold_names]
                            predictions.append(instance_i)
                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                    # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                    for dev_batch_id in dev_3th_batch_start:  # for each dev batch
                        _, error_i, pred_i = dev_model(
                            dev_3th_sents[dev_batch_id:dev_batch_id +
                                          batch_size],
                            dev_3th_sent_masks[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_sent_labels[dev_batch_id:dev_batch_id +
                                                batch_size],
                            dev_3th_claims[dev_batch_id:dev_batch_id +
                                           batch_size],
                            dev_3th_claim_mask[dev_batch_id:dev_batch_id +
                                               batch_size],
                            dev_3th_labels[dev_batch_id:dev_batch_id +
                                           batch_size])
                        for i in range(batch_size):
                            instance_i = {}
                            instance_i['label'] = pred_id2label.get(2)
                            instance_i['predicted_label'] = pred_id2label.get(
                                pred_i[i])
                            instance_i['predicted_evidence'] = []
                            instance_i['evidence'] = []
                            predictions.append(instance_i)

                    strict_score, label_accuracy, precision, recall, f1 = fever_score(
                        predictions)
                    print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_test_acc
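
A minimal standalone sketch of the evidence-selection step used in the evaluation loops above: for each claim, the candidate sentences are ranked by predicted probability and those above the 0.5 threshold are kept as predicted evidence (the actual loop additionally checks the gold sentence label). The names select_evidence, cand_probs and cand_names are illustrative only and do not appear in the code above.

import numpy as np

def select_evidence(cand_probs, cand_names, threshold=0.5):
    # rank the candidate sentences of one claim, highest probability first
    order = np.argsort(-np.asarray(cand_probs))
    # keep every candidate whose probability exceeds the threshold
    return [cand_names[k] for k in order if cand_probs[k] > threshold]

print select_evidence([0.9, 0.2, 0.7],
                      [['PageA', 3], ['PageB', 0], ['PageC', 5]])
# -> [['PageA', 3], ['PageC', 5]]
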
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 14], batch_size=70, useAllSamples=0, kmax=30, ktop=4, filter_size=[7,5],
                    hidden_units=50, L2_weight=0.000005, dropout_p=0.2, useEmb=1, task=2):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2='/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    rng = numpy.random.RandomState(23455)
    datasets, embedding_size, embeddings=read_data_HK(root+str(task)+'classes/train.txt', root+str(task)+'classes/dev.txt', root+str(task)+'classes/test.txt', embeddingPath,60, useEmb)
    #print embeddings.get_value(borrow=True)[:4]
    #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60)
    #datasets = load_data(dataset)
    indices_train, trainY, trainLengths = datasets[0]
    indices_dev, devY, devLengths = datasets[1]
    indices_test, testY, testLengths = datasets[2]
    n_train_batches=indices_train.shape[0]/batch_size
    n_valid_batches=indices_dev.shape[0]/batch_size
    n_test_batches=indices_test.shape[0]/batch_size
    
    train_batch_start=[]
    dev_batch_start=[]
    test_batch_start=[]
    if useAllSamples:
        train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[indices_train.shape[0]-batch_size]
        dev_batch_start=list(numpy.arange(n_valid_batches)*batch_size)+[indices_dev.shape[0]-batch_size]
        test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[indices_test.shape[0]-batch_size]
        n_train_batches=n_train_batches+1
        n_valid_batches=n_valid_batches+1
        n_test_batches=n_test_batches+1
    else:
        train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
        dev_batch_start=list(numpy.arange(n_valid_batches)*batch_size)
        test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    indices_train_theano=theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True)
    indices_dev_theano=theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
    indices_test_theano=theano.shared(numpy.asarray(indices_test, dtype=theano.config.floatX), borrow=True)
    indices_train_theano=T.cast(indices_train_theano, 'int32')
    indices_dev_theano=T.cast(indices_dev_theano, 'int32')
    indices_test_theano=T.cast(indices_test_theano, 'int32')
    '''
    indices_train_theano=theano.shared(indices_train, borrow=True)
    indices_dev_theano=theano.shared(indices_dev, borrow=True)
    indices_test_theano=theano.shared(indices_test, borrow=True)
    '''
    

    
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix('x_index')   # now, x is the index matrix, must be integer
    y = T.ivector('y')  
    z = T.ivector('z') # sentence lengths

    
    x=embeddings[x_index.flatten()].reshape((batch_size,60, embedding_size)).transpose(0, 2, 1).flatten()#flatten(1) means keep col, then scan rows
    #x_print=theano.printing.Print('x')(x[:,1500:])
    ishape = (embedding_size, 60)  # the input "image" of a sentence: embedding_size x max sentence length (60)
    filter_size1=(1,filter_size[0])
    filter_size2=(1,filter_size[1])
    poolsize1=(1, ishape[1]-filter_size1[1]+1)
    

    poolsize2=(1, kmax-filter_size2[1]+1) #(1,6)
    dynamic_lengths=T.maximum(ktop,z/2+1)  # dynamic k-max pooling
    
    
    
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the flattened sentence matrix of shape (batch_size, embedding_size*60)
    # to a 4D tensor, compatible with the convolutional pooling layer
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))
    #layer0_input_print=theano.printing.Print('input')(layer0_input[:,:,:,30:])
    # Construct the first convolutional pooling layer:
    # a (1, filter_size[0]) filter convolves along the word positions of every embedding row,
    # then dynamic k-max pooling keeps the k strongest positions per row (k = dynamic_lengths),
    # padded to a unified width kmax, so the output is (batch_size, nkerns[0], embedding_size, kmax)
    '''
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1)
    '''
    layer0 = Conv_DynamicK_PoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, sentLengths=z, k=dynamic_lengths, unifiedWidth=kmax)
    
    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    '''
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0], kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop)
    '''
    '''
    layer1 = ConvFoldPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0], kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop)
    '''
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features), so the conv-pool output is flattened first
    layer2_input = layer0.output.flatten(2)
    #wenpeng2=theano.printing.Print('layer2_input')(layer2_input)
    # construct a fully-connected tanh layer on top of the nkerns[0]*embedding_size*kmax pooled features
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * embedding_size*kmax,
                         n_out=hidden_units, activation=T.tanh)

    dropout=dropout_from_layer(rng, layer2.output, dropout_p)               #dropout
    layer3 = LogisticRegression(rng, input=dropout, n_in=hidden_units, n_out=task)
    
    #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2)
    # the cost we minimize during training is the NLL of the model
    
    #L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer0.W).sum()+abs(embeddings).sum()
    L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum() +(layer0.W** 2).sum()+(embeddings**2).sum()
    cost = layer3.negative_log_likelihood(y)+L2_weight*L2_reg
    
    #cost = layer3.negative_log_likelihood(y)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x_index: indices_test_theano[index: index + batch_size],
                y: testY[index: index + batch_size],
                z: testLengths[index: index + batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x_index: indices_dev_theano[index: index + batch_size],
                y: devY[index: index + batch_size],
                z: devLengths[index: index + batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer0.params+[embeddings]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    # train_model is a function that updates the model parameters (AdaGrad is used
    # below; plain SGD is kept in the commented-out block). Since this model has many
    # parameters, it would be tedious to manually create an update rule for each one,
    # so we build the updates list by looping over all (params[i], grads[i]) pairs.

    '''   
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
    
    '''
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(embedding_size)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))   

    
    
    train_model = theano.function([index], [cost,layer3.errors(y)], updates=updates,
          givens={
            x_index: indices_train_theano[index: index + batch_size],
            y: trainY[index: index + batch_size],
            z: trainLengths[index: index + batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        for batch_start in train_batch_start: 
            # iter counts how many batches have been processed so far, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            minibatch_index=minibatch_index+1
            
            cost_ij, error_ij = train_model(batch_start)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' cost: '+str(cost_ij)+' error: '+str(error_ij)
            if iter % validation_frequency == 0:

                # compute zero-one loss on validation set
                #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))


            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
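
The Conv_DynamicK_PoolLayer above applies dynamic k-max pooling: every sentence gets its own k = max(ktop, len/2 + 1) (dynamic_lengths), and the k largest activations of each feature row are kept in their original order, padded to a unified width of kmax. Below is a minimal NumPy sketch of order-preserving k-max pooling for a single row; kmax_pooling_row is an illustrative name, not the layer's actual implementation.

import numpy

def kmax_pooling_row(row, k):
    # keep the k largest activations of one feature row, preserving their original order
    if k >= len(row):
        return row
    idx = numpy.sort(numpy.argsort(row)[-k:])   # positions of the k largest values
    return row[idx]

print kmax_pooling_row(numpy.array([0.1, 0.9, 0.3, 0.7, 0.2]), 3)   # -> [ 0.9  0.3  0.7]
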
Beispiel #37
0
    def __init__(self,
                 rng,
                 n_in=784,
                 n_hidden=[500, 500],
                 n_out=10,
                 lambda_reg=0.001,
                 alpha_reg=0.001):
        """This class is made to support a variable number of layers.
    
        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                   weights
    
        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_in: int
        :param n_in: dimension of the input to the DBN
    
        :type n_hidden: list of ints
        :param n_hidden: intermediate layers size, must contain
                               at least one value

        :type n_out: int
        :param n_out: dimension of the output of the network
       
        :type lambda_reg: float
        :param lambda_reg: parameter to control the overall strength of the regularization.
         The regularization term is lambda_reg*( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1 ).
         Thus, the larger lambda_reg is, the more strongly the weights are penalized.

        :type alpha_reg: float
        :param alpha_reg: parameter from the interval [0,1] that trades off the l_1 (sparsity)
         and squared l_2 (smoothness) terms in the same regularization expression.
         Thus, the smaller alpha_reg is, the smoother the weights are.
        """

        self.hidden_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(n_hidden)

        assert self.n_layers > 0

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data, each row is a sample
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_in
            else:
                input_size = n_hidden[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.hidden_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=n_hidden[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.hidden_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid layers are parameters of this stacked network;
            # the visible biases of the RBMs are parameters of those
            # RBMs, but not of the network
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=rng,
                            theano_rng=None,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=n_hidden[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if self.n_layers > 0:
            self.logRegressionLayer = LogisticRegression(
                input=self.hidden_layers[-1].output,
                n_in=n_hidden[-1],
                n_out=n_out)
        else:
            self.logRegressionLayer = LogisticRegression(input=self.x,
                                                         n_in=input_size,
                                                         n_out=n_out)

        self.params.extend(self.logRegressionLayer.params)

        # regularization
        L1s = []
        L2_sqrs = []
        for i in range(self.n_layers):
            L1s.append(abs(self.hidden_layers[i].W).sum())
            L2_sqrs.append((self.hidden_layers[i].W**2).sum())
        L1s.append(abs(self.logRegressionLayer.W).sum())
        L2_sqrs.append((self.logRegressionLayer.W**2).sum())
        self.L1 = T.sum(L1s)
        self.L2_sqr = T.sum(L2_sqrs)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost=self.negative_log_likelihood + \
        lambda_reg * ( (1.0-alpha_reg)*0.5* self.L2_sqr +  alpha_reg*self.L1)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred
imageSize2 = (imageSize1 - W1.shape[2] + 1)/poolSize

# construct a fully-connected sigmoidal layer
n_in = nkerns[1] * imageSize2 * imageSize2

print n_in, "n_in"

upLayer2 = HiddenLayer(rng, input=upLayer2_input, n_in=nkerns[1] * imageSize2 * imageSize2,
                     n_out=n_out_layer2)
                                        
upLayer2.W.set_value(W2,borrow=True)
upLayer2.b.set_value(b2,borrow=True) 


upLayer3 = LogisticRegression(input=upLayer2.out(), n_in=n_out_layer2, n_out=7)

upLayer3.W.set_value(W3,borrow=True)
upLayer3.b.set_value(b3,borrow=True)

downLayer3 = LogisticRegression(input=upLayer3.out(), n_in=7, n_out=n_out_layer2)
#
##TODO: transpose or inverse
W3Down = numpy.transpose(W3)
W3Down = W3Down[::-1,::-1]

#
#
downLayer3.W.set_value(W3Down,borrow=True)
downLayer3.b.set_value(b3,borrow=True)
#
Beispiel #39
0
    def __init__(self,
                 rng,
                 input,
                 layer_sizes,
                 temperature,
                 activations,
                 use_bias=True):

        #rectified_linear_activation = lambda x: T.maximum(0.0, x)

        # Set up all the hidden layers
        weight_matrix_sizes = zip(layer_sizes, layer_sizes[1:])
        self.layers = []
        self.dropout_layers = []
        next_layer_input = input
        #first_layer = True
        # dropout the input
        #next_dropout_layer_input = _bobakout_from_layer(rng, input, temperature)
        next_dropout_layer_input = input
        layer_counter = 0
        for n_in, n_out in weight_matrix_sizes[:-1]:
            #next_dropout_layer = DropoutHiddenLayer(rng=rng,
            next_dropout_layer = BobakoutHiddenLayer(
                rng=rng,
                input=next_dropout_layer_input,
                # activation=activations[layer_counter],
                activation=None,
                n_in=n_in,
                n_out=n_out,
                use_bias=use_bias,
                temperature=temperature)
            self.dropout_layers.append(next_dropout_layer)
            next_dropout_layer_input = next_dropout_layer.output

            # Reuse the paramters from the dropout layer here, in a different
            # path through the graph.
            next_layer = HiddenLayer(
                rng=rng,
                input=next_layer_input,
                activation=activations[layer_counter],
                # scale the weight matrix W with (1-p)
                W=next_dropout_layer.W,
                b=next_dropout_layer.b,
                n_in=n_in,
                n_out=n_out,
                use_bias=use_bias)
            self.layers.append(next_layer)
            next_layer_input = next_layer.output
            #first_layer = False
            layer_counter += 1

        # Set up the output layer
        n_in, n_out = weight_matrix_sizes[-1]
        dropout_output_layer = LogisticRegression(
            input=next_dropout_layer_input, n_in=n_in, n_out=n_out)
        self.dropout_layers.append(dropout_output_layer)

        # Again, reuse paramters in the dropout output.
        output_layer = LogisticRegression(
            input=next_layer_input,
            # scale the weight matrix W with (1-p)
            W=dropout_output_layer.W,
            b=dropout_output_layer.b,
            n_in=n_in,
            n_out=n_out)
        self.layers.append(output_layer)

        # Use the negative log likelihood of the logistic regression layer as
        # the objective.
        self.dropout_negative_log_likelihood = self.dropout_layers[
            -1].negative_log_likelihood
        self.dropout_errors = self.dropout_layers[-1].errors

        self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
        self.errors = self.layers[-1].errors

        # Grab all the parameters together.
        self.params = [
            param for layer in self.dropout_layers for param in layer.params
        ]
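
The comments above ("scale the weight matrix W with (1-p)") refer to the standard dropout convention: units are zeroed stochastically along the training path, while the deterministic path reuses the same weights scaled by (1-p) at prediction time. A minimal NumPy sketch of that convention follows; it is illustrative only, and the temperature-based BobakoutHiddenLayer above is assumed to handle its own variant internally.

import numpy

rng = numpy.random.RandomState(0)
h = rng.rand(4, 3)          # toy activations: batch of 4 samples, 3 units
W = rng.rand(3, 2)          # toy weight matrix shared by both paths
p = 0.5                     # dropout probability

# training path: zero each unit with probability p, then apply the shared weights
mask = rng.binomial(n=1, p=1.0 - p, size=h.shape)
train_out = numpy.dot(h * mask, W)

# prediction path: no sampling, but the shared weights are scaled by (1 - p)
test_out = numpy.dot(h, (1.0 - p) * W)
print train_out.shape, test_out.shape   # -> (4, 2) (4, 2)
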
Beispiel #40
0
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is dealt with as a normal MLP,
    the dAs are only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with different denoising autoencoders.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During fine-tuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of fine-tuning

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.recall = self.logLayer.recall(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in training the dA corresponding to the layer with same index.
        The function will require as input the minibatch index, and to train
        a dA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(inputs=[index,
                              theano.Param(corruption_level, default=0.2),
                              theano.Param(learning_rate, default=0.1)],
                                 outputs=cost,
                                 updates=updates,
                                 givens={self.x: train_set_x[batch_begin:
                                                             batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(inputs=[index],
              outputs=self.finetune_cost,
              updates=updates,
              givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]},
              name='train')

        test_score_i = theano.function([index], self.errors,
                 givens={
                   self.x: test_set_x[index * batch_size:
                                      (index + 1) * batch_size],
                   self.y: test_set_y[index * batch_size:
                                      (index + 1) * batch_size]},
                      name='test')

        valid_score_i = theano.function([index], self.errors,
              givens={
                 self.x: valid_set_x[index * batch_size:
                                     (index + 1) * batch_size],
                 self.y: valid_set_y[index * batch_size:
                                     (index + 1) * batch_size]},
                      name='valid')

        test_recall_i = theano.function([index], self.recall,
                 givens={
                   self.x: test_set_x[index * batch_size:
                                      (index + 1) * batch_size],
                   self.y: test_set_y[index * batch_size:
                                      (index + 1) * batch_size]},
                      name='test_recall')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        # Create a function that scans the entire test set for recall
        def test_recall():
            return [test_recall_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score, test_recall
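
A hedged usage sketch for the pretraining functions above. The toy random data, layer sizes, batch size and epoch counts are illustrative only and chosen so the snippet is self-contained (it still assumes the same imports as the class: theano, numpy and the dA/HiddenLayer/LogisticRegression modules); the keyword names 'corruption' and 'lr' match the theano.Param inputs defined in pretraining_functions.

import numpy
import theano

numpy_rng = numpy.random.RandomState(89677)
# toy data: 200 samples of dimension 784, just to make the sketch runnable
train_set_x = theano.shared(
    numpy.asarray(numpy_rng.rand(200, 784), dtype=theano.config.floatX),
    borrow=True)
batch_size = 20
n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

sda = SdA(numpy_rng=numpy_rng, n_ins=784,
          hidden_layers_sizes=[100, 100], n_outs=10)
pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                            batch_size=batch_size)
# greedy layer-wise pretraining: each dA is trained over all minibatches
for i in xrange(sda.n_layers):
    for epoch in xrange(2):
        c = [pretraining_fns[i](index=b, corruption=0.2, lr=0.001)
             for b in xrange(n_train_batches)]
        print 'Pre-training layer %i, epoch %d, cost %f' % (i, epoch, numpy.mean(c))
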
def perturb_random(params, shape, oldoutput, nkerns=[20, 50], batch_size=500):

    print '... building the model'
    rng = numpy.random.RandomState(23455)

    x = T.tensor4()
    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    f = theano.function(inputs=[x], outputs=[layer2.output, layer3.y_pred])

    layer3.W.set_value(params[-1][0])
    layer3.b.set_value(params[-1][1])
    layer2.W.set_value(params[-1][2])
    layer2.b.set_value(params[-1][3])
    layer1.W.set_value(params[-1][4])
    layer1.b.set_value(params[-1][5])
    layer0.W.set_value(params[-1][6])
    layer0.b.set_value(params[-1][7])

    #    at each of ptimes iterations, draw 500 random perturbations of the shape
    perturbed = numpy.tile(shape[0], (500, 1))
    oldoutputs = numpy.tile(oldoutput, (500, 1))
    label = shape[1]
    ptimes = 500
    imagelength = numpy.sqrt(numpy.sum(shape[0]**2))
    outputlength = numpy.sqrt(numpy.sum(oldoutput**2))
    p = []
    s = []
    for i in range(ptimes):
        print 'perturbing ' + str(i) + ' ......'
        perturbation = numpy.random.normal(0, 0.15, perturbed.shape)
        perturblength = numpy.sqrt(numpy.sum(perturbation**2, axis=1))
        shapes = perturbed + perturbation
        outputs, labels = f(shapes.reshape(500, 1, 28, 28))
        distances = numpy.sum((outputs - oldoutputs)**2, axis=1)
        pos = numpy.argmax(distances)
        print 'distance ' + str(numpy.sqrt(distances[pos]))
        pert = {}
        pert['perturbation'] = perturbation[pos]
        pert['plength'] = perturblength[pos]
        pert['ilength'] = imagelength
        pert['olength'] = outputlength
        pert['distance'] = numpy.sqrt(distances[pos])
        pert['output'] = outputs[pos]
        pert['label'] = labels[pos]
        p.append(pert)
        if len(numpy.nonzero(labels != label)[0]) != 0:
            print 'success!' + str(label) + ' '
            pos = numpy.nonzero(labels != label)[0][0]
            print labels[pos]
            pert = {}
            pert['perturbation'] = perturbation[pos]
            pert['plength'] = perturblength[pos]
            pert['ilength'] = imagelength
            pert['olength'] = outputlength
            pert['distance'] = numpy.sqrt(distances[pos])
            pert['output'] = outputs[pos]
            pert['label'] = labels[pos]
            s.append(pert)
    return p, s
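
The "success" test in perturb_random above simply checks whether at least one of the 500 perturbed copies is assigned a label different from the original shape's label. A tiny self-contained illustration of that check, with toy labels only:

import numpy

labels = numpy.array([3, 3, 5, 3])   # toy predicted labels of the perturbed copies
label = 3                            # label of the unperturbed shape
flipped = numpy.nonzero(labels != label)[0]
if len(flipped) != 0:
    print 'success!', labels[flipped[0]]   # -> success! 5
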
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fixed random seed so the model generates the same results across runs

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_wordnet_hyper_dataset(
        maxlen=maxSentLen
    )  #each example carries one label and at least one word per sentence; maxlen caps the sentence length
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_EVAlution_hyper_vs_all(
        maxSentLen, word2id)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    #word2vec vectors are loaded below to overwrite the random initialization of known words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
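    # embed_input maps word ids to a (batch_size, emb_size, maxSentLen) tensor:
    # embeddings are looked up, reshaped, and the axes permuted so the embedding
    # dimension comes before the sentence-length dimension.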

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #this format can be fed into a CNN, GRU, or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    hidden_size[0],
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate(
        [attentive_sent_embeddings_l, attentive_sent_embeddings_r], axis=1)
    LR_input_size = 2 * hidden_size[1]
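    # the classifier feature is the concatenation of the left and right attentive
    # max-pooled sentence vectors, hence a width of 2 * hidden_size[1]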

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weight matrix of shape (LR_input_size, 2)
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b
    )  #essentially a linear transform of the feature vector followed by a softmax
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification we use the negative log likelihood as the loss; lower is better

    params = [init_embeddings] + NN_para + LR_para

    cost = loss

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
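    # the extra final start index (train_size - batch_size) adds one possibly
    # overlapping batch so the tail of the training set is still visited when
    # train_size is not divisible by batch_size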
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)
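    # when test_size is not divisible by batch_size, the last start index overlaps
    # the previous batch; the duplicated predictions are trimmed after testing
    # (see the slicing of `probs` below)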

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)
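
    # A minimal local sketch (never called below) of what the imported `apk`
    # helper is assumed to compute: average precision at k over the ranked
    # positive-class probabilities.  The real `apk` may differ in detail.
    def _apk_sketch(gold, scores, k):
        # rank indices by descending score and keep the top k
        order = sorted(range(len(scores)), key=lambda i: -scores[i])[:k]
        hits = 0
        precisions = []
        for rank, idx in enumerate(order, 1):
            if gold[idx] == 1:
                hits += 1
                precisions.append(float(hits) / rank)
        denom = min(k, sum(gold))
        return sum(precisions) / denom if denom > 0 else 0.0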

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set each epoch; this usually helps but is not guaranteed to

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #every 100 batches, evaluate the model on the full test set
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_ap_test
Beispiel #43
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=10,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_noMT_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_labels, word2id = load_il9_NI_test(
        word2id, maxSentLen)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    test_labels = np.asarray(test_labels, dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-SWA-multicca.d300.SWA.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the NumPy array "rand_values" into a Theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #(batch_size, emb_size, maxSentLen); this format can be fed into a CNN, GRU, or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
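    # masked sum over token positions: a bag-of-words sentence embedding of
    # shape (batch_size, emb_size)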
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out so that features at padded (UNK) positions are zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size): each sentence now has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out so that features at padded (UNK) positions are zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size): each sentence now has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain the three GRU gate matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)
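    # summary: for every (sentence, type) pair, average the 30 largest word-word
    # similarity scores and squash with a sigmoid (assuming
    # normalize_tensor3_colwise L2-normalizes the word vectors, these dot
    # products are cosine similarities)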

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
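    # breakdown of the feature width: 4 score matrices of width type_size (two
    # dataless-DNN dot products, the BOW cosine, and the top-k cosine), 5
    # sentence vectors of width hidden_size[0] (2 CNN, 1 GRU, 2 attentive-CNN),
    # plus the BOW embedding of width emb_size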
    #classification layer: maps the feature vector to scores for the 12 SF types
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #essentially a linear transform of the feature vector; scores are squashed with a sigmoid below
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))
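    # i.e. an element-wise binary log loss: for each (example, type) pair take
    # the sigmoid score if the gold label is 1, otherwise 1 - score, then average
    # the negative logs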

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))
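    # the fancy indexing above picks, for every example and each of its 4
    # "other" fields, the softmax probability assigned to the gold class; the
    # loss is the mean negative log of those probabilities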

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
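    # threshold the per-type sigmoid scores at 0.3 to obtain multi-hot (0/1)
    # type predictions for each test sentence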
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]

    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #every 20 batches, evaluate the model on the full test set
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Beispiel #44
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(930508)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.

    # Infer the (square) image side length from the number of input features
    n_feature = train_set_x.get_value().shape[1]
    matrix_dim = int(numpy.sqrt(n_feature))

    layer0_input = x.reshape((batch_size, 1, matrix_dim, matrix_dim))
    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape = (batch_size, 1, matrix_dim, matrix_dim),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    temp1 = (matrix_dim-5+1)/2
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape = (batch_size, nkerns[0],temp1,temp1),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with tanh activation
    temp2 = (temp1-5+1)/2
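    # e.g. with 28x28 MNIST inputs: matrix_dim = 28, temp1 = 12, temp2 = 4,
    # so the hidden layer sees nkerns[1] * 4 * 4 inputs per example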
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * temp2 * temp2,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    n_out = max(train_set_y.eval()) - min(train_set_y.eval()) + 1
    # print n_out
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995 # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # if iter % 10 == 0:
                # print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                '''print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))'''

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    '''print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))'''
                    print test_score * 100.

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    '''print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score *
          100.))'''
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Beispiel #45
0
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well) During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
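
# A minimal usage sketch for the DBN class above (hypothetical values; assumes
# a load_data helper like the one used in the other examples here):
#
#   numpy_rng = numpy.random.RandomState(123)
#   dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
#             hidden_layers_sizes=[500, 500], n_outs=10)
#   datasets = load_data('mnist.pkl.gz')
#   pretrain_fns = dbn.pretraining_functions(train_set_x=datasets[0][0],
#                                            batch_size=10, k=1)
#   train_fn, valid_score, test_score = dbn.build_finetune_functions(
#       datasets, batch_size=10, learning_rate=0.1)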
Beispiel #46
0
def evaluate_lenet5(datasets,
                    imgh,
                    imgw,
                    nclass,
                    learning_rate=0.01,
                    d=0.0003,
                    n_epochs=500,
                    nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :rtype : object
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type datasets: list
    :param datasets: the (train, test) dataset pairs used for training / testing

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    # x = T.matrix('x')   # the data is presented as rasterized images
    x = T.tensor4('x')
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    # layer0_input = x.reshape((batch_size, 3, 60, 40))
    layer0_input = x.reshape((batch_size, 3, imgh, imgw))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (60-5+1 , 40-5+1) = (56, 36)
    # maxpooling reduces this further to (56/2, 36/2) = (28, 18)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 28, 18)
    #     image_shape=(batch_size, 3, 60, 40),

    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 3, imgh, imgw),
                                filter_shape=(nkerns[0], 3, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (28-5+1, 18-5+1) = (24, 14)
    # maxpooling reduces this further to (24/2, 14/2) = (12, 7)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 12, 7)
    #     image_shape=(batch_size, nkerns[0], 28, 18),

    lh1 = (imgh - 5 + 1) / 2
    lw1 = (imgw - 5 + 1) / 2

    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], lh1, lw1),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 12 * 7),
    # or (500, 50 * 12 * 7) = (500, 3360) with the default values.
    lh2 = (lh1 - 5 + 1) / 2
    lw2 = (lw1 - 5 + 1) / 2
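    # e.g. for 60x40 inputs (as in the commented-out reshape above):
    # lh1, lw1 = 28, 18 and lh2, lw2 = 12, 7, so n_in = nkerns[1] * 12 * 7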

    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with tanh activation
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * lh2 * lw2,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=nclass)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    # the following code is modified to suit with the small test set size
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # theano expression to decay the learning rate across epoch
    current_rate = theano.tensor.fscalar('current_rate')

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - current_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index, current_rate],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50  # look at least at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    test_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the test set; in this case we
    # check every epoch

    best_test_loss = numpy.inf
    learning_rate = numpy.float32(learning_rate)
    best_iter = 0
    start_time = time.clock()

    epoch = 0
    done_looping = False
    test_error = []

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        learning_rate = learning_rate / (1 + d * (epoch - 1))
        print "learning rate is %f" % learning_rate

        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index,
                                  numpy.float32(learning_rate))

            if (iter + 1) % test_frequency == 0:

                # compute zero-one loss on the test set
                test_losses = [test_model(i) for i in xrange(n_test_batches)]
                this_test_loss = numpy.mean(test_losses)

                test_error.append(this_test_loss)

                print('epoch %i, minibatch %i/%i, test error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_test_loss * 100.))

                # if we got the best test score until now
                if this_test_loss < best_test_loss:

                    #improve patience if loss improvement is good enough
                    if this_test_loss < best_test_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_test_loss = this_test_loss
                    best_iter = iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print(
        'Best test score of %f %% obtained at iteration %i' %
        (best_test_loss * 100., best_iter + 1))
    print 'The code ran for %.2fm' % ((end_time - start_time) / 60.)

    return params, test_error
Beispiel #47
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in range(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
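
The construction loop above only needs the chain of (input size, output size) pairs per layer. The small sketch below traces that chain in plain Python for the default sizes from the signature, with the logistic layer appended on top; no Theano is involved and the values are purely illustrative.

# Plain-Python trace of how the loop above chains layer sizes.
n_ins = 784
hidden_layers_sizes = [500, 500]
n_outs = 10

shapes = []
for i in range(len(hidden_layers_sizes)):
    input_size = n_ins if i == 0 else hidden_layers_sizes[i - 1]
    shapes.append((input_size, hidden_layers_sizes[i]))
shapes.append((hidden_layers_sizes[-1], n_outs))   # logistic layer on top

print(shapes)   # [(784, 500), (500, 500), (500, 10)]
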
class SdA(object):
    """栈式自编码类(SdA)
    栈式自编码模型是由若干dAs堆栈组成。第i层的dA的隐层变成第i+1层的输入。
    第一层dA的输入是SdA的输入,最后一层dA的输出的SdA的输出、预训练后,
    SdA的运行类似普通的MLP,dAs只是用来初始化权重。
    """
    def __init__(self,numpy_rng,theano_rng=None,n_ins=784,
                 hidden_layers_sizes=[500,500],n_outs=10,
                 corruption_levels=[0.1,0.1]):
        """
        该类可以构造可变层数的网络
        numpy_rng:numpy.random.RandomState  用于初始化权重的随机数

        theano_rng: theano.tensor.shared_randomstreams.RandomStreams
                    Theano随机生成数,如果,默认值为None, 则是由'rng'
                    生成的随机种子
        n_ins: int  SdA输入的维度
        hidden_layers_sizes: lists of ints 中间层的层数列表,最少一个元素
        n_out: int 网路输出量的维度
        corruption_levels: list of float 每一层的corruption level
        """

        self.sigmoid_layers=[]
        self.dA_layers=[]
        self.params=[]
        self.n_layers=len(hidden_layers_sizes)

        assert self.n_layers>0  # require at least one hidden layer

        if not theano_rng:
            theano_rng=RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x=T.matrix('x')  # the data is presented as rasterized images
        self.y=T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

        # The SdA is an MLP whose intermediate-layer weights are shared with
        # denoising autoencoders. We first construct the SdA as a deep MLP,
        # and while constructing each sigmoid layer we also construct a
        # denoising autoencoder that shares weights with it. Pretraining
        # trains these autoencoders (which also changes the MLP weights);
        # during finetuning the SdA is trained by stochastic gradient descent
        # on the MLP.

        # construct the sigmoid layers
        for i in xrange(self.n_layers):
            # the size of the input is the number of hidden units of the
            # layer below, or n_ins if this is the first layer
            if i==0:
                input_size=n_ins
            else:
                input_size=hidden_layers_sizes[i-1]

            # the input to this layer is the activation of the hidden layer
            # below, or the input of the SdA if this is the first layer
            if i==0:
                layer_input=self.x
            else:
                layer_input=self.sigmoid_layers[-1].output

            # define the sigmoid layer
            sigmoid_layer=HiddenLayer(rng=numpy_rng,
                                      input=layer_input,
                                      n_in=input_size,
                                      n_out=hidden_layers_sizes[i],
                                      activation=T.nnet.sigmoid)
            # add the sigmoid layer to the list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question... but we only declare
            # the parameters of the sigmoid_layers to be parameters of the
            # SdA; the visible biases of the dA are parameters of that dA,
            # not of the SdA
            self.params.extend(sigmoid_layer.params)

            # construct a denoising autoencoder that shares weights with this layer
            dA_layer=dA(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        bhid=sigmoid_layer.b
                        )
            self.dA_layers.append(dA_layer)

        # add a logistic layer on top of the MLP
        self.logLayer=LogisticRegression(
                        input=self.sigmoid_layers[-1].output,
                        n_in=hidden_layers_sizes[-1],n_out=n_outs)
        self.params.extend(self.logLayer.params)
        # the cost for the second phase of training (finetuning): the
        # negative log likelihood of the logistic (output) layer
        self.finetune_cost=self.logLayer.negative_log_likelihood(self.y)
        # symbolic variable for the number of errors made on a minibatch
        # given self.x and self.y
        self.errors=self.logLayer.errors(self.y)

    def pretraining_function(self,train_set_x,batch_size):
        '''
        Generates a list of functions, one per layer, each performing one
        step of training of that layer's dA; returns the list of pretraining
        functions. Each function takes a minibatch index as input, so the
        same training step can be run on any minibatch.
        train_set_x: theano.tensor.TensorType  shared variable containing the
                     datapoints used for training the dAs
        batch_size: int  size of a [mini]batch

        '''
        # index to a [mini]batch
        index=T.lscalar('index')
        corruption_level=T.scalar('corruption')  # percentage of corruption
        learning_rate=T.scalar('lr')  # learning rate
        # number of batches
        n_batches=train_set_x.get_value(borrow=True).shape[0]/batch_size
        # beginning of a batch, given `index`
        batch_begin=index*batch_size
        # end of a batch, given `index`
        batch_end=batch_begin+batch_size

        pretrain_fns=[]
        for dA in self.dA_layers:  # iterate over the dA layers
            # get the cost and the updates list
            cost,updates=dA.get_cost_updates(corruption_level,
                                            learning_rate)
            # compile the theano function
            fn=theano.function(inputs=[index,
                            theano.Param(corruption_level,default=0.2),
                            theano.Param(learning_rate,default=0.1)],
                                outputs=cost,
                                updates=updates,
                                givens={self.x:train_set_x[batch_begin:
                                                           batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self,datasets,batch_size,learning_rate):
        '''
        Creates a "train" function that performs one step of finetuning, a
        "validate" function that computes the error on a batch from the
        validation set, and a "test" function that computes the error on a
        batch from the test set.

        :param datasets: list of pairs of theano.tensor.TensorType
                         a list containing all datasets; it must contain the
                         three pairs 'train', 'valid', 'test' in this order,
                         each consisting of two Theano variables: the data
                         features and the labels
        :param batch_size: int  size of a minibatch
        :param learning_rate: float  learning rate used during the finetune stage
        :return:
        '''

        (train_set_x,train_set_y)=datasets[0]
        (valid_set_x,valid_set_y)=datasets[1]
        (test_set_x,test_set_y)=datasets[2]

        # compute the number of minibatches for validation and testing
        n_valid_batches=valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches/=batch_size
        n_test_batches=test_set_x.get_value(borrow=True).shape[0]
        n_test_batches/=batch_size

        index=T.lscalar('index')  # index to a [mini]batch
        # compute the gradients with respect to the model parameters
        gparams=T.grad(self.finetune_cost,self.params)
        # compute the list of fine-tuning updates
        updates=[]
        for param,gparam in zip(self.params,gparams):
            updates.append((param,param-gparam*learning_rate))

        train_fn=theano.function(inputs=[index],
                                 outputs=self.finetune_cost,
                                 updates=updates,
                                 givens={
                                     self.x:train_set_x[index*batch_size:
                                                        (index+1)*batch_size],
                                     self.y:train_set_y[index*batch_size:
                                                        (index+1)*batch_size]},
                                 name='train')
        test_score_i=theano.function([index],self.errors,
                                     givens={
                                         self.x:test_set_x[index*batch_size:
                                                         (index+1)*batch_size],
                                         self.y:test_set_y[index*batch_size:
                                                        (index+1)*batch_size]},
                                     name='test')
        valid_score_i=theano.function([index],self.errors,
                                      givens={
                                          self.x:valid_set_x[index*batch_size:
                                                            (index+1)*batch_size],
                                          self.y:valid_set_y[index*batch_size:
                                                            (index+1)*batch_size]},
                                      name='valid')
        # create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]
        # create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]
        return train_fn, valid_score, test_score
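
A hedged usage sketch for the SdA defined above: greedy layer-wise pretraining followed by building the finetuning functions. It assumes a `load_data` helper (as in the Theano tutorials) that returns shared-variable datasets and that the SdA class and its layer classes are importable; batch size, epoch counts and learning rates are illustrative.

import numpy

numpy_rng = numpy.random.RandomState(89677)
datasets = load_data('mnist.pkl.gz')        # assumed helper returning shared variables
train_set_x, train_set_y = datasets[0]
batch_size = 20
n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

sda = SdA(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[500, 500], n_outs=10,
          corruption_levels=[0.1, 0.2])

# greedy layer-wise pretraining: one function per dA layer
pretrain_fns = sda.pretraining_function(train_set_x=train_set_x,
                                        batch_size=batch_size)
for i in xrange(sda.n_layers):
    for epoch in xrange(15):
        c = [pretrain_fns[i](index=bi, corruption=0.1, lr=0.001)
             for bi in xrange(n_train_batches)]
        print('pretraining layer %i, epoch %i, cost %f' % (i, epoch, numpy.mean(c)))

# finetuning functions (train step, validation scan, test scan)
train_fn, valid_score, test_score = sda.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)
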
Beispiel #49
0
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset=DataSet,
                    nkerns=[cls1, cls2],
                    batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print(type(train_set_x))

    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])

    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    #n_test_batches = (n_test_batches/batch_size) + (n_test_batches % batch_size > 0)

    print(n_test_batches)
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    Alr = T.scalar('Alr')
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (nFB, nFs)  # size of each input feature map

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    dFeatureV = iFMs * nFB * nFs
    xinp = x[:, :dFeatureV]

    #    print (x.shahpe)

    layer0_input = xinp.reshape((batch_size, iFMs, nFB, nFs))
    layer1H_input = x[:, dFeatureV:]
    # Construct the first convolutional pooling layer:
    # filtering reduces each feature map to (nFB-fsx+1, nFs-fsy+1)
    # maxpooling reduces this further by a factor of p in each dimension
    # 4D output tensor is thus of shape (batch_size, nkerns[0], cl2x, cl2y)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, iFMs, nFB, nFs),
                                filter_shape=(nkerns[0], iFMs, fsx, fsy),
                                poolsize=(p, p))
    cl2x = (nFB - fsx + 1) / p
    cl2y = (nFs - fsy + 1) / p
    layer1H = HiddenLayer(rng,
                          input=layer1H_input,
                          n_in=27,
                          n_out=nhu1 / 4,
                          activation=T.tanh)
    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)

    #layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
    #        image_shape=(batch_size, nkerns[0], cl2x, cl2y),
    #        filter_shape=(nkerns[1], nkerns[0], fsx, 1), poolsize=(p2, 1))
    #hl1 = (cl2x - fsx + 1)/p2
    hl1 = cl2x * cl2y
    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer0.output.flatten(2)
    #layer2_inputT = T.concatenate([layer2_input,x[:,dFeatureV:]],axis = 1)
    layer2_inputT = T.concatenate([layer2_input, layer1H.output], axis=1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_inputT,
                         n_in=(nkerns[0] * hl1 * 1) + nhu1 / 4,
                         n_out=nhu1 * 2,
                         activation=T.tanh)

    layer22 = HiddenLayer(rng,
                          input=layer2.output,
                          n_in=nhu1 * 2,
                          n_out=nhu1,
                          activation=T.tanh)

    layer23 = HiddenLayer(rng,
                          input=layer22.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    layer24 = HiddenLayer(rng,
                          input=layer23.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    layer25 = HiddenLayer(rng,
                          input=layer24.output,
                          n_in=nhu1,
                          n_out=nhu1,
                          activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer25.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    #params = layer3.params + layer22.params + layer2.params + layer1.params + layer0.params
    params = layer3.params + layer25.params + layer24.params + layer23.params + layer22.params + layer2.params + layer1H.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        #updates.append((param_i, param_i - learning_rate * grad_i))
        updates.append((param_i, param_i - Alr * grad_i))

    train_model = theano.function(
        [index, Alr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    #best_params = None
    best_params = []
    best_validation_loss = numpy.inf
    prev_validation_loss = 200

    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    Alrc = 0.1
    AlrE = 0.00001
    epochC = 0
    epoch = 0
    done_looping = False
    for param in params:
        best_params.append(param.get_value())
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, Alrc)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                lossratio = (this_validation_loss -
                             prev_validation_loss) / (prev_validation_loss + 1)
                print(lossratio)
                print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100., Alrc))

                # if we got the best validation score until now
                #if this_validation_loss < best_validation_loss:
                if lossratio <= 0.0:
                    for i in range(len(params)):
                        best_params[i] = params[i].get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    prev_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm =  test_model(0)

                    yP = numpy.asarray([])
                    test_losses = [
                        test_model(i)[0] for i in xrange(n_test_batches)
                    ]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP, test_model(i)[1]))
                    print(yP.shape)
                    test_score = numpy.mean(test_losses)

                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
                    #y = test_set_y.owner.inputs[0].get_value()[:2300]
                    # NOTE: with y = yP the index sets below compare the
                    # predictions against themselves; the commented-out line
                    # above suggests the true test labels were intended here
                    y = yP

                    print(yP.shape)
                    print(y.shape)
                    I1 = numpy.nonzero(y == 0.0)
                    I2 = numpy.nonzero(y == 1.0)
                    I3 = numpy.nonzero(y == 2.0)
                    I4 = numpy.nonzero(y == 3.0)

                    print(I1[0].shape)
                    print(I2[0].shape)
                    print(I3[0].shape)
                    print(I4[0].shape)
                    I11 = numpy.nonzero(yP[I1[0]] == 0)
                    I12 = numpy.nonzero(yP[I1[0]] == 1)
                    I13 = numpy.nonzero(yP[I1[0]] == 2)
                    I14 = numpy.nonzero(yP[I1[0]] == 3)
                    I21 = numpy.nonzero(yP[I2[0]] == 0)
                    I22 = numpy.nonzero(yP[I2[0]] == 1)
                    I23 = numpy.nonzero(yP[I2[0]] == 2)
                    I24 = numpy.nonzero(yP[I2[0]] == 3)
                    I31 = numpy.nonzero(yP[I3[0]] == 0)
                    I32 = numpy.nonzero(yP[I3[0]] == 1)
                    I33 = numpy.nonzero(yP[I3[0]] == 2)
                    I34 = numpy.nonzero(yP[I3[0]] == 3)
                    I41 = numpy.nonzero(yP[I4[0]] == 0)
                    I42 = numpy.nonzero(yP[I4[0]] == 1)
                    I43 = numpy.nonzero(yP[I4[0]] == 2)
                    I44 = numpy.nonzero(yP[I4[0]] == 3)

                    acc1 = 100  #float(float(I11[0].size)/float(I1[0].size))
                    acc2 = 100  #float(float(I22[0].size)/float(I2[0].size))
                    if n_out == 3:
                        acc3 = 100  #float(float(I33[0].size)/float(I3[0].size))
                        acc4 = 0
                    elif n_out == 4:
                        acc3 = float(float(I33[0].size) / float(I3[0].size))
                        acc4 = float(float(I44[0].size) / float(I4[0].size))
                    else:
                        acc3 = 0
                        acc4 = 0
                    print((
                        '     epoch %i, minibatch %i/%i, test error of '
                        'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, acc4 = %f, I11 = %i, I12 = %i, I13 = %i, I14 = %i, I21 = %i, I22 = %i, I23 = %i, I24 = %i, I31 = %i, I32 = %i, I33 = %i, I34 = %i, I41 = %i, I42 = %i, I43 = %i, I44 = %i %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100., acc1 * 100., acc2 * 100.,
                         acc3 * 100, acc4 * 100, I11[0].size, I12[0].size,
                         I13[0].size, I14[0].size, I21[0].size, I22[0].size,
                         I23[0].size, I24[0].size, I31[0].size, I32[0].size,
                         I33[0].size, I34[0].size, I41[0].size, I42[0].size,
                         I43[0].size, I44[0].size))

                    #print(('     epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))
                else:
                    if Alrc <= AlrE:
                        done_looping = True
                        break
                    elif epochC > 40:
                        Alrc = Alrc / 2
                        for param, best_param in zip(params, best_params):
                            param.set_value(best_param)
                        epochC = 0
            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    #print >> sys.stderr, ('The code for file ' +
    #                      os.path.split(__file__)[1] +
    #                      ' ran for %.2fm' % ((end_time - start_time) / 60.))
    OF = open(outFile, 'a')
    print(5,
          5,
          5,
          5,
          5,
          DataSet,
          n_out,
          fsx,
          fsy,
          p,
          cls1,
          cls2,
          nhu1,
          nFB,
          nFs,
          iFMs,
          nhus,
          batch_size,
          test_score * 100.,
          acc1 * 100.,
          acc2 * 100.,
          acc3 * 100,
          acc4 * 100,
          I11[0].size,
          I12[0].size,
          I13[0].size,
          I14[0].size,
          I21[0].size,
          I22[0].size,
          I23[0].size,
          I24[0].size,
          I31[0].size,
          I32[0].size,
          I33[0].size,
          I34[0].size,
          I41[0].size,
          I42[0].size,
          I43[0].size,
          I44[0].size,
          file=OF)

    OF.close()
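
The I11..I44 index sets computed above are the cells of a 4x4 confusion matrix built by hand. Below is a compact numpy sketch of the same bookkeeping on made-up label arrays; per-class accuracy is only meaningful when y_true holds the ground-truth labels rather than a copy of the predictions.

import numpy

# made-up ground-truth and predicted labels for four classes
y_true = numpy.array([0, 1, 2, 3, 0, 1, 2, 3, 1, 2])
y_pred = numpy.array([0, 1, 2, 2, 0, 3, 2, 3, 1, 0])

n_classes = 4
conf = numpy.zeros((n_classes, n_classes), dtype=int)
for t, p in zip(y_true, y_pred):
    conf[t, p] += 1          # rows: true class, columns: predicted class

row_sums = conf.sum(axis=1)
per_class_acc = conf.diagonal() / numpy.maximum(row_sums, 1).astype(float)
print(conf)
print(per_class_acc)
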
def perturb_bfgs(perturbation,
                 params,
                 shape,
                 oldoutput,
                 c=1,
                 nkerns=[20, 50],
                 batch_size=1):

    #print '... building the model'
    rng = numpy.random.RandomState(23455)

    x = T.tensor4()
    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3)

    f = theano.function(inputs=[x], outputs=[layer2.output, layer3.y_pred])

    layer3.W.set_value(params[-1][0])
    layer3.b.set_value(params[-1][1])
    layer2.W.set_value(params[-1][2])
    layer2.b.set_value(params[-1][3])
    layer1.W.set_value(params[-1][4])
    layer1.b.set_value(params[-1][5])
    layer0.W.set_value(params[-1][6])
    layer0.b.set_value(params[-1][7])

    perturbed = shape
    oldoutputs = oldoutput
    distances = 0
    perturblength = numpy.sqrt(numpy.sum(perturbation**2))
    shapes = perturbed + perturbation
    outputs, labels = f(shapes.reshape(1, 1, 28, 28))
    print labels
    for o in oldoutputs:
        distances += numpy.sqrt(numpy.sum((outputs - o)**2))
    distances /= len(oldoutputs)
    return c * perturblength + distances
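
perturb_bfgs returns the scalar objective c * ||perturbation|| plus the mean Euclidean distance between the perturbed network's hidden representation and a set of stored old outputs. The numpy sketch below evaluates the same expression on random stand-in arrays; all shapes and values are illustrative.

import numpy

rng = numpy.random.RandomState(0)
perturbation = 0.01 * rng.randn(28 * 28)          # candidate perturbation
new_output = rng.randn(500)                       # stands in for layer2.output
old_outputs = [rng.randn(500) for _ in range(3)]  # stored reference outputs
c = 1.0                                           # weight on the perturbation norm

perturb_length = numpy.sqrt(numpy.sum(perturbation ** 2))
mean_distance = numpy.mean([numpy.sqrt(numpy.sum((new_output - o) ** 2))
                            for o in old_outputs])
objective = c * perturb_length + mean_distance
print(objective)
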
Beispiel #51
0
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as an MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions, for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None, k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x[batch_begin:batch_end]
                }
            )
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x: train_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: train_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: test_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: test_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x: valid_set_x[
                    index * batch_size: (index + 1) * batch_size
                ],
                self.y: valid_set_y[
                    index * batch_size: (index + 1) * batch_size
                ]
            }
        )

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
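
A hedged usage sketch for the DBN above: CD-1 pretraining of each RBM via pretraining_functions, then a short finetuning loop over the functions returned by build_finetune_functions. It assumes a `load_data` helper returning shared-variable datasets, as in the Theano tutorials; all counts and rates are illustrative.

import numpy

numpy_rng = numpy.random.RandomState(123)
datasets = load_data('mnist.pkl.gz')        # assumed helper returning shared variables
train_set_x, train_set_y = datasets[0]
batch_size = 10
n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

dbn = DBN(numpy_rng=numpy_rng, n_ins=28 * 28,
          hidden_layers_sizes=[500, 500], n_outs=10)

# greedy layer-wise pretraining with CD-1
pretrain_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                         batch_size=batch_size, k=1)
for i in xrange(dbn.n_layers):
    for epoch in xrange(10):
        c = [pretrain_fns[i](index=bi, lr=0.01) for bi in xrange(n_train_batches)]
        print('pretraining layer %i, epoch %i, cost %f' % (i, epoch, numpy.mean(c)))

# supervised finetuning of the whole stack
train_fn, valid_score, test_score = dbn.build_finetune_functions(
    datasets=datasets, batch_size=batch_size, learning_rate=0.1)
for epoch in xrange(5):
    for bi in xrange(n_train_batches):
        train_fn(bi)
    print('epoch %i, validation error %f' % (epoch, numpy.mean(valid_score())))
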
Beispiel #52
0
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50],
                    batch_size=50):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    # datasets = input_data_LIDC.load_data(
    #     "../data/LIDC/resized_images-236x236/LIDC-IDRI-0003/",    # "../data/LIDC/resized_images/"
    #     "*.tiff",
    #     '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv',
    #     '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/',
    #     "imageSOP_UID-filePath-dicominfo-LIDC-IDRI-0003.csv")   # "*.csv"
    # # img_px_len_x = 32 # 236
    # # img_px_len_y = 32 # 236
    # img_px_len_x = 236
    # img_px_len_y = 236
    # lidc_n_out_malignancy = 5

    # pickle_file_name = "Evans-MacBook-Pro.local-resized_images-236x236-first30.theano.pickle"
    # if os.path.isfile(pickle_file_name):
    # input_data_LIDC.esprint("Unpickling: " + pickle_file_name)
    # with open(pickle_file_name, "rb") as pickle_file:
    #   datasets = pickle.load(pickle_file)
    # else:
    #     datasets = input_data_LIDC.load_data(
    #         "../data/LIDC/resized_images-236x236-first30/",    # "../data/LIDC/resized_images/"
    #         "*.tiff",
    #         '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv',
    #         '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/',
    #         "*.csv")   # "*.csv"
    # input_data_LIDC.esprint("Pickling: " + pickle_file_name)
    # with open(pickle_file_name, "wb") as pickle_file:
    #     pickle.dump(datasets, pickle_file)
    # img_px_len_x = 32 # 236
    # img_px_len_y = 32 # 236
    # # img_px_len_x = 236
    # # img_px_len_y = 236
    # lidc_n_out_malignancy = 5

    pickle_file_name = "Evans-MacBook-Pro.local-resized_images-32x32.theano.pickle"
    if os.path.isfile(pickle_file_name):
        input_data_LIDC.esprint("Unpickling: " + pickle_file_name)
        with open(pickle_file_name, "rb") as pickle_file:
            datasets = pickle.load(pickle_file)
    else:
        datasets = input_data_LIDC.load_data(
            "../data/LIDC/resized_images-32x32/",  # "../data/LIDC/resized_images/"
            "*.tiff",
            '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/master_join4.csv',
            '/Users/estory/Documents/syncable/School/DePaul/research/LIDC_Complete_20141106/Extracts/DICOM_metadata_extracts/',
            "*.csv")  # "*.csv"
        input_data_LIDC.esprint("Pickling: " + pickle_file_name)
        with open(pickle_file_name, "wb") as pickle_file:
            pickle.dump(datasets, pickle_file)
    img_px_len_x = 32  # 236
    img_px_len_y = 32  # 236
    # img_px_len_x = 236
    # img_px_len_y = 236
    lidc_n_out_malignancy = 5

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # train_set_y.flatten()
    # valid_set_y.flatten()
    # test_set_y.flatten()

    # input_data_LIDC.esprint("type: " + str(type(test_set_y)))
    # input_data_LIDC.esprint("test_set_x.shape: " + str(test_set_x.shape))
    # input_data_LIDC.esprint("test_set_y.shape: " + str(test_set_y.shape))
    # input_data_LIDC.esprint(theano.printing.Print('test_set_y')(test_set_y))

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    input_data_LIDC.esprint("n_train_batches = " + str(n_train_batches))
    input_data_LIDC.esprint("n_valid_batches = " + str(n_valid_batches))
    input_data_LIDC.esprint("n_test_batches = " + str(n_test_batches))

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    theano.pp(x)
    theano.pp(y)
    # input_data_LIDC.esprint("Set lengths" + str([len(train_set_x), len(train_set_y), len(valid_set_x), len(valid_set_y), len(test_set_x), len(test_set_y)]))
    # input_data_LIDC.esprint("Set lengths: " + str(len(x) + ", " + str(len(y))))

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape
    # (batch_size, img_px_len_x * img_px_len_y) to a 4D tensor, compatible
    # with our LeNetConvPoolLayer; the images here are resized LIDC slices of
    # size (img_px_len_x, img_px_len_y).
    layer0_input = x.reshape((batch_size, 1, img_px_len_x, img_px_len_y))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, img_px_len_x,
                                             img_px_len_y),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=batch_size,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output,
                                n_in=batch_size,
                                n_out=lidc_n_out_malignancy)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
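
The shape comments in the model above are inherited from the 28x28 MNIST version. The short sketch below redoes the valid-convolution / 2x2-pooling arithmetic for both 28x28 and the 32x32 LIDC images used here; for 32x32 input the two stages come out at 14x14 and 5x5, so the hard-coded (..., 12, 12) image_shape and nkerns[1] * 4 * 4 fan-in would need adjusting if the 32x32 images are fed in directly.

# Valid-mode 5x5 convolution followed by 2x2 max pooling, per stage.
def conv_pool_out(size, filter_size=5, pool=2):
    return (size - filter_size + 1) // pool

for img in (28, 32):
    s1 = conv_pool_out(img)   # after layer0
    s2 = conv_pool_out(s1)    # after layer1
    print('input %dx%d -> layer0 output %dx%d -> layer1 output %dx%d'
          % (img, img, s1, s1, s2, s2))
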
Beispiel #53
0
    def __init__(
        self,
        W_sda=[None,None],  # load previously saved parameters if available
        b_sda=[None,None],
        nn_structure=[600,500,500,10],  # input size, hidden layer sizes, number of output classes
        dropout_rates=[0.5,0.5],
        L1_reg=0,L2_reg=0,
        activation=T.nnet.sigmoid,
        non_static=False,
        wordvec=None
    ):
        #print activation
        self.activation=activation
        self.sigmoid_layers = []  # list of hidden (sigmoid) layers
        self.dA_layers = []  # list of dA layers
        self.params = []  # parameters of the sigmoid layers (dA-only parameters not included)
        self.n_layers = len(nn_structure)-2  # number of hidden layers
        
        #random num
        numpy_rng = numpy.random.RandomState(89677)
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.matrix('x')#data
        self.y = T.ivector('y')#label
        #print wordvec
        self.Words=theano.shared(value=wordvec,name='Words')
        layer0_input= self.Words[T.cast(self.x.flatten(),dtype="int32")].\
                      reshape((self.x.shape[0],self.x.shape[1]*self.Words.shape[1]))
        #n=layer0_input.shape[1] / numpy.shape(wordvec)[0]
        '''
        # alternative: sum the word vectors...
        avg_vec = numpy.zeros(numpy.shape(wordvec)[0],dtype=theano.config.floatX)
        for i in xrange(5):
            sub_vec=layer0_input[:,i*200:(i+1)*200]
            avg_vec=T.add(sub_vec,avg_vec)
        #avg_vec=avg_vec/5.
        layer0_input=avg_vec
        '''
        # the dA layers share weights with the MLP layers
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = nn_structure[0]  # input_size: number of input units of this hidden layer
            else:
                input_size = nn_structure[i]

            if i == 0:
                layer_input = layer0_input  # the input data (embedding lookup above)
            else:
                layer_input = self.sigmoid_layers[i-1].output  # each later layer takes the previous layer's output as input
                
            # dropout layer; W and b are randomly initialized unless loaded from W_sda/b_sda
            sigmoid_layer = dpLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    W=W_sda[i],
                                    b=b_sda[i],
                                    dropout_rate=dropout_rates[i],
                                    n_out=nn_structure[i+1],
                                    activation=self.activation)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)  # add to the list of hidden layers

            self.params.extend(sigmoid_layer.params)  # MLP parameters W, b

            # shares weights with the hidden layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=nn_structure[i+1],
                          W=sigmoid_layer.W, # share weights with sigmoid layer
                                             # W is a shared variable updated by dA pretraining
                          bhid=sigmoid_layer.b,
                          activation=self.activation)
            self.dA_layers.append(dA_layer)

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=nn_structure[-2],
            n_out=nn_structure[-1]
        )

        self.params.extend(self.logLayer.params)  # logistic layer parameters W, b
    

        L1=0.
        L2=0.
        for layer in self.sigmoid_layers:
            L1+=abs(layer.W).sum()
            L2+=(layer.W ** 2).sum()
            
        self.L1=(L1+abs(self.logLayer.W).sum())
        self.L2_sqr=L2+(self.logLayer.W ** 2).sum()
        
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) \
                            + L1_reg * self.L1 + L2_reg * self.L2_sqr 
                            
        if non_static:
            self.params.extend([self.Words])

        self.errors = self.logLayer.errors(self.y)
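
# Illustrative sketch (not part of the original example): how the L1/L2
# penalties accumulated above are formed, reproduced with plain numpy arrays
# standing in for the layer weights (shapes follow nn_structure=[600,500,500,10];
# the regularisation strengths below are hypothetical non-zero settings).
import numpy

_rng = numpy.random.RandomState(0)
_weights = [_rng.randn(600, 500), _rng.randn(500, 500), _rng.randn(500, 10)]
_L1 = sum(abs(W).sum() for W in _weights)        # sum of absolute weight values
_L2_sqr = sum((W ** 2).sum() for W in _weights)  # sum of squared weight values
_L1_reg, _L2_reg = 0.0001, 0.0001                # hypothetical settings (defaults above are 0)
_penalty = _L1_reg * _L1 + _L2_reg * _L2_sqr     # added to the negative log-likelihood in finetune_cost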
Beispiel #54
0
class DeepAutoencoder(object):

    def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                 hidden_layers_sizes = [50,60], n_outs = 10,
                 reconstruction_cost = 'cross_entropy',\
                 supervised_training = 'russ',
                 tied_weights = False):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type n_layers_sizes: list of ints
        :param n_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.reconstruction_layers = []
        self.rbm_layers     = []
        self.params         = []
        self.n_layers       = len(hidden_layers_sizes)
        self.tied_weights   = tied_weights

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x  = T.matrix('x')  # the data is presented as rasterized images
        self.y  = T.ivector('y') # the labels are presented as 1D vector of
                                 # [int] labels

        # The DBN is an MLP, for which all weights of intermediate layers are shared with a
        # different RBM.  We will first construct the DBN as a deep multilayer perceptron, and
        # when constructing each sigmoidal layer we also construct an RBM that shares weights
        # with that layer. During pretraining we will train these RBMs (which will lead
        # to changing the weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the MLP.

        for i in xrange( self.n_layers ):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of the layer below or
            # the input size if we are on the first layer
            if i == 0 :
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i-1]

            # the input to this layer is either the activation of the hidden layer below or the
            # input of the DBN if you are on the first layer
            if i == 0 :
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            # TODO(dumitru): this is temporary, get rid of this when done
            if i != self.n_layers-1:
               activation = T.nnet.sigmoid
            else:
               activation = None

            sigmoid_layer = HiddenLayer(rng   = numpy_rng,
                                           input = layer_input,
                                           n_in  = input_size,
                                           n_out = hidden_layers_sizes[i],
                                           activation = activation)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are going to only declare that
            # the parameters of the sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng,
                          input = layer_input,
                          n_visible = input_size,
                          n_hidden  = hidden_layers_sizes[i],
                          W = sigmoid_layer.W,
                          hbias = sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # Creating the reconstruction layers
        for i in xrange(self.n_layers,0,-1):

            if i == self.n_layers:
                layer_input = self.sigmoid_layers[-1].output
            else:
                layer_input = self.reconstruction_layers[-1].output

            if self.tied_weights:
               W = self.sigmoid_layers[i-1].W.T
               b = self.rbm_layers[i-1].vbias
            else:
               W = b = None

            # the output size if we are on the first layer
            if i == 1 :
                output_size = n_ins
                activation = None
            else:
                output_size = hidden_layers_sizes[i-2]
                activation = T.nnet.sigmoid

            input_size = hidden_layers_sizes[i-1]

            reconstruction_layer = HiddenLayer(numpy_rng, layer_input, input_size, output_size, W, b, activation)

            if not self.tied_weights:
               self.params.extend(reconstruction_layer.params)

            # add the layer to our list of layers
            self.reconstruction_layers.append(reconstruction_layer)

        self.xhat = T.nnet.sigmoid( self.reconstruction_layers[-1].output )

        # otherwise we'll end up with a bunch of extra params and theano will complain
        self.reconstruction_params = list(self.params)

        self.global_cross_entropy =  T.mean(-T.sum( self.x*T.log(self.xhat) + (1-self.x)*T.log(1-self.xhat), axis=1 ))
        self.global_mse = T.mean(T.sum((self.x - self.xhat)**2.0,axis=1))

        if reconstruction_cost == 'cross_entropy':
           self.global_pretraining_cost = self.global_cross_entropy
        elif reconstruction_cost == 'mse':
            self.global_pretraining_cost = self.global_mse
        else:
            raise NotImplementedError('Invalid reconstruction error specified')


        if supervised_training == 'standard':
           # We now need to add a logistic layer on top of the MLP
           self.logLayer = LogisticRegression(\
                         input = self.sigmoid_layers[-1].output,\
                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
           self.params.extend(self.logLayer.params)

           # compute the cost for second phase of training, defined as the
           # negative log likelihood of the logistic regression (output) layer
           self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

           # compute the gradients with respect to the model parameters
           # symbolic variable that points to the number of errors made on the
           # minibatch given by self.x and self.y
           self.errors = self.logLayer.errors(self.y)

           # compute prediction as class whose probability is maximal in
           # symbolic form
           self.y_pred=T.argmax(self.logLayer.p_y_given_x, axis=1)


        elif supervised_training == 'russ':

           # compute vector of class-membership probabilities in symbolic form
           p_y_given_x = T.nnet.softmax(self.sigmoid_layers[-1].output)
           self.y_pred=T.argmax(p_y_given_x, axis=1)
           self.finetune_cost = -T.mean(T.log(p_y_given_x[T.arange(self.y.shape[0]),self.y]))
           self.errors = T.mean(T.neq(self.y_pred, self.y))

        else:
           print 'Unsupported supervised training method', supervised_training
           raise NotImplementedError
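
    # Note on the two supervised modes above: 'standard' adds a separate
    # LogisticRegression output layer (its own W and b are trained), while
    # 'russ' applies a softmax directly to the output of the last sigmoid
    # layer, so no extra output-layer parameters are introduced.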

    def build_pretraining_functions(self, train_set_x, batch_size,k):
        ''' Generates a list of functions, for performing one step of gradient descent at a
        given layer. The function will require as input the minibatch index, and to train an
        RBM you just need to iterate, calling the corresponding function on all minibatch
        indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k
        '''

        # index to a [mini]batch
        index            = T.lscalar('index')   # index to a minibatch
        learning_rate    = T.scalar('lr')    # learning rate to use
        weight_decay     = T.scalar('weight_decay')    # weight decay to use


        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin+batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost,updates = rbm.get_cost_updates(learning_rate, persistent=None, k = k, weight_decay = weight_decay)

            # compile the theano function
            fn = theano.function(inputs = [index,
                              theano.Param(learning_rate, default = 0.1),
                              theano.Param(weight_decay, default = 0.0002)],
                    outputs = cost,
                    updates = updates,
                    givens  = {self.x :train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
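
    # Typical use of the layer-wise pretraining functions above (illustrative;
    # assumes a DeepAutoencoder instance `dae`, a shared variable
    # `train_set_x`, and `n_train_batches` / `pretraining_epochs` defined by
    # the calling script):
    #
    #     pretrain_fns = dae.build_pretraining_functions(train_set_x, batch_size=20, k=1)
    #     for layer_fn in pretrain_fns:                  # one function per RBM layer
    #         for epoch in xrange(pretraining_epochs):
    #             layer_costs = [layer_fn(i) for i in xrange(n_train_batches)]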

    def build_global_pretraining_functions_sgd(self, datasets, train_batch_size, learning_rate):

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x , test_set_y ) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / train_batch_size
        n_test_batches  = test_set_x.get_value(borrow=True).shape[0]  / train_batch_size

        index   = T.lscalar('index')    # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.global_pretraining_cost, self.reconstruction_params)

        # compute list of updates
        updates = {}
        for param, gparam in zip(self.reconstruction_params, gparams):
            updates[param] = param - gparam*learning_rate

        costs = [self.global_pretraining_cost,self.global_mse]

        train_fn = build_score_fn([index],index,costs,train_set_x,train_set_y,train_batch_size,updates,self)
        test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,train_batch_size,[],self)
        valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,train_batch_size, [],self)

        # Create a function that scans the entire validation set
        def valid_fn():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_fn():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_fn, test_fn


    def build_finetune_functions(self, datasets, train_batch_size, learning_rate):
        '''Generates a function `train` that implements one step of finetuning, a function
        `validate` that computes the error on a batch from the validation set, and a function
        `test` that computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets; it has to contain three
        pairs, `train`, `valid`, `test` in this order, where each pair is formed of two Theano
        variables, one for the datapoints, the other for the labels
        :type train_batch_size: int
        :param train_batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x , test_set_y ) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / train_batch_size
        n_test_batches  = test_set_x.get_value(borrow=True).shape[0]  / train_batch_size

        index   = T.lscalar('index')    # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = {}
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam*learning_rate

        costs = self.finetune_cost

        train_fn = build_score_fn([index],index,costs,train_set_x,train_set_y,train_batch_size,updates,self)
        test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,train_batch_size,[],self)
        valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,train_batch_size,[],self)

        # Create a function that scans the entire validation set
        def valid_fn():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_fn():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_fn, test_fn
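
    # Typical finetuning loop (illustrative; assumes `dae`, `datasets`,
    # `n_train_batches` and `finetune_epochs` as in the docstring above, and
    # that build_score_fn compiles a per-minibatch Theano function):
    #
    #     train_fn, valid_fn, test_fn = dae.build_finetune_functions(datasets, 20, 0.1)
    #     for epoch in xrange(finetune_epochs):
    #         train_costs = [train_fn(i) for i in xrange(n_train_batches)]
    #         valid_costs = valid_fn()    # scans the whole validation set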

    def build_global_pretraining_functions_hf(self, datasets, train_batch_size,
                                                 preconditioner, ridge, maxiterations):
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x , test_set_y ) = datasets[2]

        # compute number of minibatches for training, validation and testing
        index   = T.lscalar('index')    # index to a [mini]batch

        valid_batch_size = valid_set_x.get_value(borrow=True).shape[0]
        test_batch_size = test_set_x.get_value(borrow=True).shape[0]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / valid_batch_size
        n_test_batches  = test_set_x.get_value(borrow=True).shape[0]  / test_batch_size
        n_train_batches = train_set_x.get_value(borrow=True).shape[0] / train_batch_size

        costs = [self.global_pretraining_cost,self.global_mse]

        def train_fn():

            givens = {
                self.x:train_set_x[index*train_batch_size:(index+1)*train_batch_size],
                self.y:train_set_y[index*train_batch_size:(index+1)*train_batch_size]}

            error = truncated_newton([index], self.reconstruction_layers[-1].output,
                                     costs, self.reconstruction_params, givens,
                                     maxiterations, ridge, preconditioner, n_train_batches)
            return error


        test_score_i = build_score_fn([index],index,costs,test_set_x,test_set_y,test_batch_size,[],self)
        valid_score_i = build_score_fn([index],index,costs,valid_set_x,valid_set_y,valid_batch_size,[],self)

        def valid_fn():
             return [valid_score_i(i) for i in range(n_valid_batches)]

        def test_fn():
             return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_fn, test_fn

    def initialize_reconstruction_weights(self):
        if not self.tied_weights:
            updates = []
            for i in xrange(self.n_layers,0,-1):
               updates.append((self.reconstruction_layers[self.n_layers-i].W, self.sigmoid_layers[i-1].W.T))
               updates.append((self.reconstruction_layers[self.n_layers-i].b, self.rbm_layers[i-1].vbias))

            f = theano.function([],[],updates = updates)
            f()

    def save_rbm_weights(self, filename):
        param_list = []
        for i in range(self.n_layers):
           params = (self.rbm_layers[i].W.get_value(), self.rbm_layers[i].vbias.get_value(), self.rbm_layers[i].hbias.get_value())
           param_list.append(params)
        import cPickle
        cPickle.dump(param_list,open(filename,'w'), -1)

    def load_rbm_weights(self, filename):
        import cPickle
        param_list = cPickle.load(open(filename,'r'))
        for i in range(self.n_layers):
           W,vbias,hbias = param_list[i]
           self.rbm_layers[i].W.set_value(numpy.array(W,dtype=theano.config.floatX))
           self.rbm_layers[i].vbias.set_value(numpy.array(vbias,dtype=theano.config.floatX))
           self.rbm_layers[i].hbias.set_value(numpy.array(hbias,dtype=theano.config.floatX))
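
    # Illustrative round-trip of the RBM weight (de)serialisation above
    # (hypothetical instance and file name):
    #
    #     dae.save_rbm_weights('rbm_weights.pkl')
    #     dae.load_rbm_weights('rbm_weights.pkl')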
#layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
#            image_shape=(batch_size, nkerns[0], 12, 12),
#            filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))

## the HiddenLayer being fully-connected, it operates on 2D matrices of
## shape (batch_size,num_pixels) (i.e matrix of rasterized images).
## This will generate a matrix of shape (20,32*4*4) = (20,512)
#layer2_input = layer1.output.flatten(2)
layer2_input = layer0.output.flatten(2)

# construct a fully-connected sigmoidal layer
layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * vectorsize * 1,
                         n_out=500, activation=T.tanh)

# classify the values of the fully-connected sigmoidal layer
layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

# the cost we minimize during training is the NLL of the model
cost = layer3.negative_log_likelihood(y)

# create a function to compute the mistakes that are made by the model
test_model = theano.function([index], layer3.errors(y),
         givens={
             x: test_set_x[index * batch_size: (index + 1) * batch_size],
             y: test_set_y[index * batch_size: (index + 1) * batch_size]})

test_model_confidence = theano.function([index], layer3.results(y),
         givens={
             x: test_set_x[index * batch_size: (index + 1) * batch_size]})

# load parameters
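# (Note: the snippet above is a fragment; it assumes that rng, x, y, index,
# batch_size, nkerns, vectorsize, layer0 and test_set_x/test_set_y were
# defined earlier in its source file.)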
Beispiel #56
0
    def __init__(self, numpy_rng, theano_rng = None, n_ins = 784,
                 hidden_layers_sizes = [50,60], n_outs = 10,
                 reconstruction_cost = 'cross_entropy',\
                 supervised_training = 'russ',
                 tied_weights = False):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type n_layers_sizes: list of ints
        :param n_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.reconstruction_layers = []
        self.rbm_layers     = []
        self.params         = []
        self.n_layers       = len(hidden_layers_sizes)
        self.tied_weights   = tied_weights

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x  = T.matrix('x')  # the data is presented as rasterized images
        self.y  = T.ivector('y') # the labels are presented as 1D vector of
                                 # [int] labels

        # The DBN is an MLP, for which all weights of intermediate layers are shared with a
        # different RBM.  We will first construct the DBN as a deep multilayer perceptron, and
        # when constructing each sigmoidal layer we also construct an RBM that shares weights
        # with that layer. During pretraining we will train these RBMs (which will lead
        # to changing the weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the MLP.

        for i in xrange( self.n_layers ):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of the layer below or
            # the input size if we are on the first layer
            if i == 0 :
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i-1]

            # the input to this layer is either the activation of the hidden layer below or the
            # input of the DBN if you are on the first layer
            if i == 0 :
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            # TODO(dumitru): this is temporary, get rid of this when done
            if i != self.n_layers-1:
               activation = T.nnet.sigmoid
            else:
               activation = None

            sigmoid_layer = HiddenLayer(rng   = numpy_rng,
                                           input = layer_input,
                                           n_in  = input_size,
                                           n_out = hidden_layers_sizes[i],
                                           activation = activation)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are going to only declare that
            # the parameters of the sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng = numpy_rng, theano_rng = theano_rng,
                          input = layer_input,
                          n_visible = input_size,
                          n_hidden  = hidden_layers_sizes[i],
                          W = sigmoid_layer.W,
                          hbias = sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # Creating the reconstruction layers
        for i in xrange(self.n_layers,0,-1):

            if i == self.n_layers:
                layer_input = self.sigmoid_layers[-1].output
            else:
                layer_input = self.reconstruction_layers[-1].output

            if self.tied_weights:
               W = self.sigmoid_layers[i-1].W.T
               b = self.rbm_layers[i-1].vbias
            else:
               W = b = None

            # the output size if we are on the first layer
            if i == 1 :
                output_size = n_ins
                activation = None
            else:
                output_size = hidden_layers_sizes[i-2]
                activation = T.nnet.sigmoid

            input_size = hidden_layers_sizes[i-1]

            reconstruction_layer = HiddenLayer(numpy_rng, layer_input, input_size, output_size, W, b, activation)

            if not self.tied_weights:
               self.params.extend(reconstruction_layer.params)

            # add the layer to our list of layers
            self.reconstruction_layers.append(reconstruction_layer)

        self.xhat = T.nnet.sigmoid( self.reconstruction_layers[-1].output )

        # otherwise we'll end up with a bunch of extra params and theano will complain
        self.reconstruction_params = list(self.params)

        self.global_cross_entropy =  T.mean(-T.sum( self.x*T.log(self.xhat) + (1-self.x)*T.log(1-self.xhat), axis=1 ))
        self.global_mse = T.mean(T.sum((self.x - self.xhat)**2.0,axis=1))

        if reconstruction_cost == 'cross_entropy':
           self.global_pretraining_cost = self.global_cross_entropy
        elif reconstruction_cost == 'mse':
            self.global_pretraining_cost = self.global_mse
        else:
            raise NotImplementedError('Invalid reconstruction error specified')


        if supervised_training == 'standard':
           # We now need to add a logistic layer on top of the MLP
           self.logLayer = LogisticRegression(\
                         input = self.sigmoid_layers[-1].output,\
                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
           self.params.extend(self.logLayer.params)

           # compute the cost for second phase of training, defined as the
           # negative log likelihood of the logistic regression (output) layer
           self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

           # compute the gradients with respect to the model parameters
           # symbolic variable that points to the number of errors made on the
           # minibatch given by self.x and self.y
           self.errors = self.logLayer.errors(self.y)

           # compute prediction as class whose probability is maximal in
           # symbolic form
           self.y_pred=T.argmax(self.logLayer.p_y_given_x, axis=1)


        elif supervised_training == 'russ':

           # compute vector of class-membership probabilities in symbolic form
           p_y_given_x = T.nnet.softmax(self.sigmoid_layers[-1].output)
           self.y_pred=T.argmax(p_y_given_x, axis=1)
           self.finetune_cost = -T.mean(T.log(p_y_given_x[T.arange(self.y.shape[0]),self.y]))
           self.errors = T.mean(T.neq(self.y_pred, self.y))

        else:
           print 'Unsupported supervised training method', supervised_training
           raise NotImplementedError
Beispiel #57
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='../data/mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, 28, 28),
            filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], 12, 12),
            filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2))

    # the TanhLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    temp_model = theano.function([index], layer3.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
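    # (plain SGD: param_i <- param_i - learning_rate * d(cost)/d(param_i))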
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function([index], cost, updates=updates,
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [temp_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
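
# Illustrative entry point (assuming the MNIST pickle is available at the
# default `dataset` path above):
#
#     if __name__ == '__main__':
#         evaluate_lenet5(learning_rate=0.1, n_epochs=200, batch_size=500)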
Beispiel #58
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=10,
                    hidden_size=10,
                    L2_weight=0.0001,
                    para_len_limit=400,
                    q_len_limit=40,
                    max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #     id2word = {y:x for x,y in overall_word2id.iteritems()}
    #     word2vec=load_word2vec()
    #     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    norm_extraF = normalize_matrix(extraF)

    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features
    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras

    load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate(
        [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    #     #LSTM
    #     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    #     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    #     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)

    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    #     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor = questions_model.output_tensor

    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #     #LSTM for questions
    #     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    #     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    #     questions_reps_tensor=questions_model.output_tensor

    #use CNN for question modeling
    #     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
    #     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
    #     Q_conv_para=[conv_W, conv_b]
    #     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
    #             image_shape=(batch_size, 1, emb_size, q_len_limit),
    #             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
    #     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
    #     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
    #     masked_conv_output=conv_output*gru_mask
    #     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))

    #     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
    #     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
    #     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
    #     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
    #     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
    #     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
    #     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix,
                                   q_matrix)  #(para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
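        # each paragraph position receives a softmax-weighted summary of the
        # question-word vectors, giving a (hidden, para_len) matrix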
        return T.dot(q_matrix, norm_interaction_matrix.T)  #(len, para_len)

    batch_q_reps, updates = theano.scan(
        fn=example_in_batch,
        outputs_info=None,
        sequences=[para_reps, questions_reps_tensor
                   ])  #batch_q_reps (batch, hidden, para_len)

    #attention distributions

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    transformed_para_reps = T.maximum(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0)  #relu
    transformed_q_reps = T.maximum(
        T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = transformed_para_reps + transformed_q_reps

    #     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
    #     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
    #     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b]
    #     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)
    #     accu_both=accumu_model.output_tensor.transpose((0,2,1))

    prior_att = T.concatenate([add_both, norm_extraF], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    #     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+ConvGRU_1.error#

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
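    # AdaGrad: accumulate squared gradients per parameter and divide each step
    # by sqrt(accumulated) + 1e-8, so frequently and strongly updated
    # parameters take progressively smaller steps.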
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray([
                    train_para_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_Q_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_label_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_para_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_feature_matrixlist[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list

                    Q_list_inword = test_Q_list_word[
                        test_para_id:test_para_id + batch_size]
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set = para_gold_ansset_list[q]
                        #                         print test_para_wordlist_list[q]
                        #                         print Q_list_inword[q]
                        #                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match += 1
                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        F1_match += F1


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc = F1_match / q_amount
                exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(
                            rootPath + 'Best_Paras_conv_' + str(max_exact_acc),
                            params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Beispiel #59
0
    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoder.
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are only going to declare the parameters of the
            # sigmoid_layers to be parameters of the SdA;
            # the visible biases in the dA are parameters of those
            # dAs, but not of the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetuning

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.recall = self.logLayer.recall(self.y)
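
# The dA layers built in the constructor above are normally pretrained one at a
# time before fine-tuning.  The method below is a hedged sketch of how such
# pretraining functions are typically compiled, assuming the enclosing class is
# the SdA above, the usual module-level imports (theano, theano.tensor as T),
# and a dA class that exposes get_cost_updates(corruption_level, learning_rate)
# as in the standard Theano tutorial; none of this is confirmed by the snippet.
    def pretraining_functions(self, train_set_x, batch_size):
        index = T.lscalar('index')              # minibatch index
        corruption_level = T.scalar('corruption')
        learning_rate = T.scalar('lr')
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA_layer in self.dA_layers:
            # cost and parameter updates for one denoising autoencoder
            cost, updates = dA_layer.get_cost_updates(corruption_level,
                                                      learning_rate)
            fn = theano.function(
                inputs=[index,
                        theano.Param(corruption_level, default=0.2),
                        theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            pretrain_fns.append(fn)
        return pretrain_fns
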
Beispiel #60
0
    def build_network(self):
        # allocate symbolic variables for the data
        self.x = dtensor5Float('x')          # symbolic 5D float tensor for the input data
        self.y = dtensor5IntVector('y')      # symbolic variable for the corresponding labels

        self.training_data = theano.shared(np.zeros(self.input_shape,
                                                    dtype=floatX),
                                           borrow=True)
        self.training_data_labels = theano.shared(np.zeros(
            (self.input_shape[0], ), dtype=dtensor5IntVector),
                                                  borrow=True)
        self.testing_data = theano.shared(np.zeros(self.input_shape,
                                                   dtype=floatX),
                                          borrow=True)
        self.testing_data_labels = theano.shared(np.zeros(
            (self.input_shape[0], ), dtype=dtensor5IntVector),
                                                 borrow=True)

        input_to_layer = self.x
        input_shape_to_layer = self.input_shape

        for i in range(0, len(self.convolution_layer_shapes)):
            print("Setting input layer shape to")
            print(input_shape_to_layer)
            layer = LeNetConvPoolLayer(
                self.rng,
                input=input_to_layer,
                image_shape=input_shape_to_layer,
                filter_shape=self.convolution_layer_shapes[i],
                poolsize=self.pooling_layer_shapes[i])
            self.layers.append(layer)
            input_to_layer = layer.output
            input_shape_to_layer = layer.output_shape()

        layer2_input = input_to_layer.flatten(2)

        # construct a fully-connected sigmoidal layer
        self.layer2 = HiddenLayer(self.rng,
                                  input=layer2_input,
                                  n_in=np.prod(input_shape_to_layer[1:]),
                                  n_out=500,
                                  activation=T.tanh)

        # classify the values of the fully-connected sigmoidal layer
        self.layer3 = LogisticRegression(input=self.layer2.output,
                                         n_in=500,
                                         n_out=self.n_classes)

        #self.vy = dtensor5IntVector('vy')
        self.batch_x = dtensor5Float('batch_x')      # symbolic per-batch input (not used further in this method)
        self.batch_y = dtensor5IntVector('batch_y')  # symbolic per-batch labels (not used further in this method)

        self.validate_model = theano.function(
            inputs=[],
            outputs=self.layer3.errors(self.y),
            givens={
                self.x: self.testing_data,
                self.y: self.testing_data_labels
            })

        # create a list of all model parameters to be fit by gradient descent
        self.params = self.layer3.params + self.layer2.params
        for i in self.layers:
            self.params += i.params
        #self.params = self.layers[0].params

        # the cost we minimize during training is the NLL of the model
        self.cost = self.layer3.negative_log_likelihood(self.y)

        # create a list of gradients for all model parameters
        self.grads = T.grad(self.cost, self.params)
        # train_model is a function that updates the model parameters by
        # SGD Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i], grads[i]) pairs.
        self.updates = [(param_i, param_i - self.learning_rate * grad_i)
                        for param_i, grad_i in zip(self.params, self.grads)]

        self.train_model = theano.function(inputs=[],
                                           outputs=self.cost,
                                           updates=self.updates,
                                           givens={
                                               self.x: self.training_data,
                                               self.y:
                                               self.training_data_labels
                                           })
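
# train_model and validate_model are compiled with empty input lists, so
# training is driven purely through the shared variables filled in
# build_network.  A hedged usage sketch, assuming `network` is an instance of
# the (unnamed here) class that defines build_network and that the numpy arrays
# already match self.input_shape and the label dtype; the function name and
# arguments are illustrative only.
def run_training(network, x_train, y_train, x_test, y_test, n_epochs=10):
    network.training_data.set_value(x_train)
    network.training_data_labels.set_value(y_train)
    network.testing_data.set_value(x_test)
    network.testing_data_labels.set_value(y_test)

    for epoch in range(n_epochs):
        cost = network.train_model()        # one gradient step over the stored training data
        err = network.validate_model()      # error rate on the testing data
        print 'epoch %d: cost %f, test error %f' % (epoch, cost, err)
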