예제 #1
0
    def __init__(self, U, sentence_dim=200, wv_dim = 100, ngram_filters=[3, 4, 5], dropout=[0.5], hidden=[100, 1], activations=[ReLU], batch_size=50):
        """Build the symbolic graph for a sentence CNN with parallel n-gram filters.

        Parameters
        ----------
        U : array
            Initial word-vector matrix, U.shape = (size_vocab, wv_dim).
        sentence_dim : int
            The maximum sentence length (height of the sentence "image").
        wv_dim : int
            Width of each word vector.
        ngram_filters : sequence of int
            Filter heights; one convolution/pooling layer is built per entry.
        dropout : sequence of float
            Forwarded to MLPDropout as ``dropout_rates``.
        hidden : sequence of int
            ``hidden[0]`` is the number of feature maps per filter size.  The
            list handed to MLPDropout as ``layer_sizes`` has its first entry
            replaced by ``hidden[0] * len(ngram_filters)``.
        activations : sequence
            Forwarded to MLPDropout as ``activations``.
        batch_size : int
            Minibatch size; it is baked into the convolution ``image_shape``.

        The list arguments are copied, so neither the caller's lists nor the
        shared default lists are ever modified by this constructor.
        """
        super(SentenceCNN, self).__init__()

        self.rng = np.random.RandomState(3435)

        # -- U is a matrix with U.shape = (size_vocab, wv_dim)
        self.U = U

        # -- the maximum sentence length...
        self.sentence_dim = sentence_dim
        self.wv_dim = wv_dim

        # -- copy the list arguments: hidden[0] is overwritten further down,
        # -- and writing through to the argument would corrupt the shared
        # -- default list (each later default-constructed model would start
        # -- from an already multiplied feature-map count).
        self.ngram_filters = list(ngram_filters)
        self.dropout = list(dropout)
        self.hidden = list(hidden)
        self.batch_size = batch_size

        # -- number of feature maps produced for each filter size
        self.feature_maps = self.hidden[0]

        # -- define the model architecture

        # -- this is the index of the dataset...
        self.index = T.lscalar()

        # -- this is the matrix of indices for words in a sentence...
        self.x = T.matrix('x')

        # -- this is the vector of target values...
        self.y = T.ivector('y')

        # -- initialize our wordvectors!
        self.Words = theano.shared(value=self.U, name="Words")

        # -- compiled helper that overwrites row 0 of Words with the vector
        # -- it is given (zero_vec is the all-zero vector meant for it)
        self.zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(wv_dim).astype('float32')
        self.set_zero = theano.function(
            [self.zero_vec_tensor],
            updates=[(self.Words,
                      T.set_subtensor(self.Words[0, :], self.zero_vec_tensor))])

        # -- make the actual image from the word vectors: look up one row of
        # -- Words per index in x and reshape to (rows of x, 1, cols of x, wv width)
        word_ix = T.cast(self.x.flatten(), dtype="int32")
        self.sentence_image = self.Words[word_ix].reshape(
            (self.x.shape[0], 1, self.x.shape[1], self.Words.shape[1]))

        # -- make our model split: one conv/pool layer per ngram size
        self.conv_layers = []
        self.conv_output_buffer = []

        CONVOLUTION_NONLINEARITY = 'relu'

        STATIC_WV = False

        for ngram in self.ngram_filters:
            conv_layer = LeNetConvPoolLayer(
                rng=self.rng,
                input=self.sentence_image,
                image_shape=(self.batch_size, 1, self.sentence_dim, self.wv_dim),
                # -- we want to look at (ngram, wv_dim) sized patches...
                filter_shape=(self.feature_maps, 1, ngram, wv_dim),
                # -- pool window sized (sentence_dim - ngram + 1, wv_dim - wv_dim + 1)
                poolsize=(sentence_dim - ngram + 1, 1),
                non_linear=CONVOLUTION_NONLINEARITY)

            # -- keep the layer and its output flattened to 2 dimensions
            self.conv_layers.append(conv_layer)
            self.conv_output_buffer.append(conv_layer.output.flatten(2))

        # -- convert the parallel outputs into a tensor!
        self.conv_outputs = T.concatenate(self.conv_output_buffer, 1)

        # -- the first MLP layer size is the width of the concatenated outputs
        self.hidden[0] = self.feature_maps * len(self.ngram_filters)

        self.fully_connected = MLPDropout(self.rng,
                                          input=self.conv_outputs,
                                          layer_sizes=self.hidden,
                                          activations=list(activations),
                                          dropout_rates=self.dropout,
                                          classifier=True)

        # -- define parameters of the model and update functions using adadelta.
        # -- Start from a copy so the MLP's own params list is not extended
        # -- in place with the conv and word-vector parameters.
        self.params = list(self.fully_connected.params)
        for conv_layer in self.conv_layers:
            self.params += conv_layer.params
        if not STATIC_WV:
            # if word vectors are allowed to change, add them as model parameters
            self.params.append(self.Words)

        lr_decay = 0.95
        sqr_norm_lim = 9

        # -- cost is kept for reporting; the updates are derived from dropout_cost
        self.cost = self.fully_connected.cost(self.y)
        self.dropout_cost = self.fully_connected.dropout_cost(self.y)
        self.grad_updates = sgd_updates_adadelta(self.params, self.dropout_cost, lr_decay, 1e-6, sqr_norm_lim)
예제 #2
0
class SentenceCNN(object):
    """Sentence CNN: parallel n-gram convolution/pooling layers over a
    word-vector "image" of the sentence, concatenated and fed to an
    MLPDropout network.

    The constructor builds the symbolic graph; `fit` compiles the Theano
    functions and runs minibatch training.
    """
    def __init__(self, U, sentence_dim=200, wv_dim = 100, ngram_filters=[3, 4, 5], dropout=[0.5], hidden=[100, 1], activations=[ReLU], batch_size=50):
        """Build the symbolic graph for a sentence CNN with parallel n-gram filters.

        Parameters
        ----------
        U : array
            Initial word-vector matrix, U.shape = (size_vocab, wv_dim).
        sentence_dim : int
            The maximum sentence length (height of the sentence "image").
        wv_dim : int
            Width of each word vector.
        ngram_filters : sequence of int
            Filter heights; one convolution/pooling layer is built per entry.
        dropout : sequence of float
            Forwarded to MLPDropout as ``dropout_rates``.
        hidden : sequence of int
            ``hidden[0]`` is the number of feature maps per filter size.  The
            list handed to MLPDropout as ``layer_sizes`` has its first entry
            replaced by ``hidden[0] * len(ngram_filters)``.
        activations : sequence
            Forwarded to MLPDropout as ``activations``.
        batch_size : int
            Minibatch size; it is baked into the convolution ``image_shape``.

        The list arguments are copied, so neither the caller's lists nor the
        shared default lists are ever modified by this constructor.
        """
        super(SentenceCNN, self).__init__()

        self.rng = np.random.RandomState(3435)

        # -- U is a matrix with U.shape = (size_vocab, wv_dim)
        self.U = U

        # -- the maximum sentence length...
        self.sentence_dim = sentence_dim
        self.wv_dim = wv_dim

        # -- copy the list arguments: hidden[0] is overwritten further down,
        # -- and writing through to the argument would corrupt the shared
        # -- default list (each later default-constructed model would start
        # -- from an already multiplied feature-map count).
        self.ngram_filters = list(ngram_filters)
        self.dropout = list(dropout)
        self.hidden = list(hidden)
        self.batch_size = batch_size

        # -- number of feature maps produced for each filter size
        self.feature_maps = self.hidden[0]

        # -- define the model architecture

        # -- this is the index of the dataset (selects a minibatch in fit)
        self.index = T.lscalar()

        # -- this is the matrix of indices for words in a sentence...
        self.x = T.matrix('x')

        # -- this is the vector of target values...
        self.y = T.ivector('y')

        # -- initialize our wordvectors!
        self.Words = theano.shared(value=self.U, name="Words")

        # -- compiled helper that overwrites row 0 of Words with the vector
        # -- it is given (fit calls it with zero_vec after every update)
        self.zero_vec_tensor = T.vector()
        self.zero_vec = np.zeros(wv_dim).astype('float32')
        self.set_zero = theano.function(
            [self.zero_vec_tensor],
            updates=[(self.Words,
                      T.set_subtensor(self.Words[0, :], self.zero_vec_tensor))])

        # -- make the actual image from the word vectors: look up one row of
        # -- Words per index in x and reshape to (rows of x, 1, cols of x, wv width)
        word_ix = T.cast(self.x.flatten(), dtype="int32")
        self.sentence_image = self.Words[word_ix].reshape(
            (self.x.shape[0], 1, self.x.shape[1], self.Words.shape[1]))

        # -- make our model split: one conv/pool layer per ngram size
        self.conv_layers = []
        self.conv_output_buffer = []

        CONVOLUTION_NONLINEARITY = 'relu'

        STATIC_WV = False

        for ngram in self.ngram_filters:
            conv_layer = LeNetConvPoolLayer(
                rng=self.rng,
                input=self.sentence_image,
                image_shape=(self.batch_size, 1, self.sentence_dim, self.wv_dim),
                # -- we want to look at (ngram, wv_dim) sized patches...
                filter_shape=(self.feature_maps, 1, ngram, wv_dim),
                # -- pool window sized (sentence_dim - ngram + 1, wv_dim - wv_dim + 1)
                poolsize=(sentence_dim - ngram + 1, 1),
                non_linear=CONVOLUTION_NONLINEARITY)

            # -- keep the layer and its output flattened to 2 dimensions
            self.conv_layers.append(conv_layer)
            self.conv_output_buffer.append(conv_layer.output.flatten(2))

        # -- convert the parallel outputs into a tensor!
        self.conv_outputs = T.concatenate(self.conv_output_buffer, 1)

        # -- the first MLP layer size is the width of the concatenated outputs
        self.hidden[0] = self.feature_maps * len(self.ngram_filters)

        self.fully_connected = MLPDropout(self.rng,
                                          input=self.conv_outputs,
                                          layer_sizes=self.hidden,
                                          activations=list(activations),
                                          dropout_rates=self.dropout,
                                          classifier=True)

        # -- define parameters of the model and update functions using adadelta.
        # -- Start from a copy so the MLP's own params list is not extended
        # -- in place with the conv and word-vector parameters.
        self.params = list(self.fully_connected.params)
        for conv_layer in self.conv_layers:
            self.params += conv_layer.params
        if not STATIC_WV:
            # if word vectors are allowed to change, add them as model parameters
            self.params.append(self.Words)

        lr_decay = 0.95
        sqr_norm_lim = 9

        # -- cost is what train_model reports; the updates come from dropout_cost
        self.cost = self.fully_connected.cost(self.y)
        self.dropout_cost = self.fully_connected.dropout_cost(self.y)
        self.grad_updates = sgd_updates_adadelta(self.params, self.dropout_cost, lr_decay, 1e-6, sqr_norm_lim)

    def fit(self, X, y, validation, n_epochs=5, shuffle_batch=True):
        '''
        Train the model on (X, y) with minibatch updates.

        The training data is padded with randomly re-sampled rows up to a
        multiple of the batch size, shuffled, and split by batch count
        (about 90% / 10%) into a training part and a dev part.  After each
        epoch the mean error over the training and dev batches is printed.

        X is a 2-d array with one row per example and y holds one target
        per row of X.

        `validation` is a *mandatory* tuple of
        length 2, with elements (X_val, y_val)

        `n_epochs` is the number of passes over the training batches.

        `shuffle_batch` selects whether the minibatches are visited in a
        fresh random order every epoch (the default) or in index order.

        Returns the error measured on the `validation` set after the most
        recent epoch whose dev error was no worse than the best seen so far
        (0 if no epoch qualified, e.g. when there are no dev batches).
        '''

        sys.stdout.write('Building datasets...')
        X_val, y_val = validation

        # -- NOTE: this reseeds numpy's global RNG
        np.random.seed(3435)

        batch_size = self.batch_size

        # -- pad with randomly chosen duplicates so that the number of rows
        # -- is a whole number of batches
        remainder = X.shape[0] % batch_size
        if remainder > 0:
            extra_data_num = batch_size - remainder

            # -- get random ix from first dimension
            pad_ix = np.random.permutation(X.shape[0])[:extra_data_num]

            X_training = np.append(X, X[pad_ix], axis=0)
            y_training = np.append(y, y[pad_ix], axis=0)
        else:
            X_training = X
            y_training = y

        # -- shuffle the training data.  The permutation must span the padded
        # -- array; sizing it by X.shape[0] would drop the padding rows again.
        shuffle_ix = np.random.permutation(X_training.shape[0])

        X_training = X_training[shuffle_ix]
        y_training = y_training[shuffle_ix]

        # -- find the number of batches we can train on (integer division so
        # -- the counts stay ints under both Python 2 and Python 3)
        n_batches = X_training.shape[0] // batch_size
        n_train_batches = int(np.round(n_batches * 0.9))
        n_dev_batches = n_batches - n_train_batches
        n_train_rows = n_train_batches * batch_size

        val_set_x = X_val
        val_set_y = y_val

        sys.stdout.write('done.\nCopying to GPU/CPU shared env...')

        # -- get our training and dev sets...
        train_set_x, train_set_y = shared_dataset((
                X_training[:n_train_rows, :],
                y_training[:n_train_rows]
            ))

        dev_set_x, dev_set_y = shared_dataset((
                X_training[n_train_rows:, :],
                y_training[n_train_rows:]
            ))

        # compile theano functions to get train/val/test errors
        sys.stdout.write('done.\nCompiling symbolic graph...')

        batch_start = self.index * batch_size
        batch_stop = (self.index + 1) * batch_size

        # -- gets the error on the dev set...
        validate_model = theano.function([self.index], self.fully_connected.errors(self.y),
                givens = {
                    self.x: dev_set_x[batch_start:batch_stop],
                    self.y: dev_set_y[batch_start:batch_stop]
                }
            )

        # -- gets the error on the training set...
        train_error_model = theano.function([self.index], self.fully_connected.errors(self.y),
                givens = {
                    self.x: train_set_x[batch_start:batch_stop],
                    self.y: train_set_y[batch_start:batch_stop]
                }
            )

        # -- actually trains the model!
        train_model = theano.function([self.index], self.cost, updates=self.grad_updates,
                givens={
                    self.x: train_set_x[batch_start:batch_stop],
                    self.y: train_set_y[batch_start:batch_stop]
                }
            )

        sys.stdout.write('done.\nBuilding predictive model...')

        # -- separate graph sized for the whole validation set; the training
        # -- graph's conv layers were built with image_shape fixed to batch_size
        test_size = val_set_x.shape[0]
        test_layer0_input = self.Words[T.cast(self.x.flatten(), dtype="int32")].reshape(
            (test_size, 1, self.sentence_dim, self.Words.shape[1]))

        test_pred_layers = [
            conv_layer.predict(test_layer0_input, test_size).flatten(2)
            for conv_layer in self.conv_layers
        ]
        test_layer1_input = T.concatenate(test_pred_layers, 1)
        test_y_pred = self.fully_connected.predict(test_layer1_input)

        # -- score the prediction graph built above (it was previously built
        # -- and then ignored in favour of the fixed-batch training cost).
        # -- NOTE(review): assumes predict() yields one label per example that
        # -- is comparable to y, mirroring errors() -- confirm against MLPDropout.
        test_error = T.mean(T.neq(test_y_pred, self.y))

        # -- function to test model.
        test_model_all = theano.function([self.x, self.y], test_error)
        sys.stdout.write('done.\n')

        # start training over mini-batches
        print('Starting training...')

        # -- lower is better (the selection below uses <=), so start from +inf;
        # -- starting from 0 meant the branch only fired on a dev error of 0
        best_val_perf = np.inf
        test_perf = 0

        for epoch in range(1, n_epochs + 1):
            if shuffle_batch:
                batch_order = np.random.permutation(range(n_train_batches))
            else:
                batch_order = range(n_train_batches)

            for minibatch_index in batch_order:
                train_model(minibatch_index)
                # -- reset row 0 of the word vectors after every update
                self.set_zero(self.zero_vec)

            train_perf = np.mean([train_error_model(i) for i in range(n_train_batches)])
            val_perf = np.mean([validate_model(i) for i in range(n_dev_batches)])
            print('epoch %i, train perf %f %%, val perf %f' % (epoch, train_perf, val_perf))

            if val_perf <= best_val_perf:
                best_val_perf = val_perf
                test_perf = test_model_all(val_set_x, val_set_y)

        return test_perf