Example #1
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy):
        """ This function is to build finetune functions and to update gradients

        :param train_shared_xy: theano shared variable for input and output training data
        :type train_shared_xy: tuple of shared variable
        :param valid_shared_xy: theano shared variable for input and output development data
        :type valid_shared_xy: tuple of shared variable
        :returns: finetune functions for training and development

        """

        logger = logging.getLogger("DNN initialization")

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        lr = T.scalar('lr', dtype=theano.config.floatX)
        mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

        cost = self.finetune_cost  #+ self.L2_reg * self.L2_sqr

        gparams = T.grad(cost, self.params)

        # use optimizer
        if self.optimizer == 'sgd':
            # zip pairs each parameter with its corresponding gradient
            updates = OrderedDict()

            for param, gparam in zip(self.params, gparams):
                weight_update = self.updates[param]
                upd = mom * weight_update - lr * gparam
                updates[weight_update] = upd
                updates[param] = param + upd
        elif self.optimizer == 'adam':
            updates = compile_ADAM_train_function(self,
                                                  gparams,
                                                  learning_rate=lr)
        elif self.optimizer == 'rprop':
            updates = compile_RPROP_train_function(self, gparams)
        else:
            logger.critical(
                "This optimizer: %s is not supported right now! Please use one of the following: sgd, adam, rprop"
                % (self.optimizer))
            sys.exit(1)

        train_model = theano.function(
            inputs=[lr, mom],  #index, batch_size
            outputs=self.errors,
            updates=updates,
            givens={
                self.x:
                train_set_x,  #[index*batch_size:(index + 1)*batch_size]
                self.y: train_set_y,
                self.is_train: np.cast['int32'](1)
            },
            on_unused_input='ignore')

        valid_model = theano.function(inputs=[],
                                      outputs=self.errors,
                                      givens={
                                          self.x: valid_set_x,
                                          self.y: valid_set_y,
                                          self.is_train: np.cast['int32'](0)
                                      },
                                      on_unused_input='ignore')

        return train_model, valid_model
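
The two compiled functions are typically driven from an outer training loop. Below is a minimal, framework-free sketch of that calling pattern, assuming a DNN instance whose build_finetune_functions has already been run; the stand-in lambdas, epoch count and learning-rate schedule are hypothetical, not part of the original code.

import random

# Stand-ins for the two compiled Theano functions returned above; in real use they
# would be train_model(lr, mom) and valid_model() obtained from the DNN instance.
train_model = lambda lr, mom: random.random()   # hypothetical stand-in
valid_model = lambda: random.random()           # hypothetical stand-in

n_epochs, lr, momentum = 10, 0.002, 0.9         # hypothetical schedule values
for epoch in range(1, n_epochs + 1):
    train_error = train_model(lr, momentum)     # one full-batch update (is_train = 1)
    valid_error = valid_model()                 # validation error (is_train = 0)
    print('epoch %d: train %.4f, valid %.4f' % (epoch, train_error, valid_error))
    if epoch > n_epochs // 2:                   # simple halving schedule after warm-up
        lr *= 0.5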
Example #2
    def build_finetune_functions(self,
                                 train_shared_xy,
                                 valid_shared_xy,
                                 use_lhuc=False,
                                 layer_index=0):
        """ This function is to build finetune functions and to update gradients

        :param train_shared_xy: theano shared variable for input and output training data
        :type train_shared_xy: tuple of shared variable
        :param valid_shared_xy: theano shared variable for input and output development data
        :type valid_shared_xy: tuple of shared variable
        :returns: finetune functions for training and development

        """

        logger = logging.getLogger("DNN initialization")

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        lr = T.scalar('lr', dtype=theano.config.floatX)
        mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

        cost = self.finetune_cost  # + self.L2_reg * self.L2_sqr

        ## added for LHUC
        if use_lhuc:
            # LHUC adapts only the scaling parameters, which are named 'c'
            self.lhuc_params = []
            for p in self.params:
                if p.name == 'c':
                    self.lhuc_params.append(p)
            params = self.lhuc_params
            gparams = T.grad(cost, params)
        else:
            params = self.params
            gparams = T.grad(cost, params)

        # count the parameters in the first layer_index layers; these layers stay frozen
        freeze_params = 0
        for layer in range(layer_index):
            freeze_params += len(self.rnn_layers[layer].params)

        # use optimizer
        if self.optimizer == 'sgd':
            # zip pairs each parameter with its corresponding gradient
            updates = OrderedDict()

            for i, (param, gparam) in enumerate(zip(params, gparams)):
                weight_update = self.updates[param]
                upd = mom * weight_update - lr * gparam
                updates[weight_update] = upd

                # only parameters beyond the frozen layers receive a weight update
                if i >= freeze_params:
                    updates[param] = param + upd

        elif self.optimizer == 'adam':
            updates = compile_ADAM_train_function(self,
                                                  gparams,
                                                  learning_rate=lr)
        elif self.optimizer == 'rprop':
            updates = compile_RPROP_train_function(self, gparams)
        else:
            logger.critical(
                "This optimizer: %s is not supported right now! Please use one of the following: sgd, adam, rprop"
                % (self.optimizer))
            sys.exit(1)

        train_model = theano.function(
            inputs=[lr, mom],  # index, batch_size
            outputs=self.errors,
            updates=updates,
            givens={
                self.x:
                train_set_x,  # [index*batch_size:(index + 1)*batch_size]
                self.y: train_set_y,
                self.is_train: np.cast['int32'](1)
            },
            on_unused_input='ignore')

        valid_model = theano.function(inputs=[],
                                      outputs=self.errors,
                                      givens={
                                          self.x: valid_set_x,
                                          self.y: valid_set_y,
                                          self.is_train: np.cast['int32'](0)
                                      },
                                      on_unused_input='ignore')

        return train_model, valid_model
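
A small, framework-free sketch of the freezing logic above, using made-up layer parameter lists only to show how freeze_params is counted and which parameter indices receive an update; rnn_layer_params is a hypothetical stand-in for self.rnn_layers[k].params.

# Hypothetical per-layer parameter name lists, standing in for self.rnn_layers[k].params.
rnn_layer_params = [['W_1', 'b_1'], ['W_2', 'b_2'], ['W_out', 'b_out']]
params = [p for layer in rnn_layer_params for p in layer]

layer_index = 1   # freeze the first layer, i.e. build_finetune_functions(..., layer_index=1)
freeze_params = sum(len(layer) for layer in rnn_layer_params[:layer_index])

for i, param in enumerate(params):
    # gradients exist for every parameter, but only indices >= freeze_params get an update
    status = 'updated' if i >= freeze_params else 'frozen'
    print('%-6s %s' % (param, status))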
Example #3
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size  # integer number of validation minibatches

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        layer_size = len(self.params)
        lr_list = []
        for i in range(layer_size):
            lr_list.append(learning_rate)

        ##top 2 layers use a smaller learning rate
        if layer_size > 4:
            for i in range(layer_size-4, layer_size):
                lr_list[i] = learning_rate * 0.5

        # compute list of fine-tuning updates
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        if self.use_rprop == 0:

            updates = OrderedDict()
            layer_index = 0
            for dparam, gparam in zip(self.delta_params, gparams):
                updates[dparam] = momentum * dparam - gparam * lr_list[layer_index]
                layer_index += 1

            for dparam, param in zip(self.delta_params, self.params):
                updates[param] = param + updates[dparam]

            train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
                  theano.Param(momentum, default = 0.5)],
                  outputs=self.errors,
                  updates=updates,
                  on_unused_input='ignore',
                  givens={self.x: train_set_x[index * batch_size:
                                              (index + 1) * batch_size],
                          self.y: train_set_y[index * batch_size:
                                              (index + 1) * batch_size]})

        elif self.use_rprop:
            updates = compile_RPROP_train_function(self, gparams)

            ## Retain learning rate and momentum so the interface stays backwards compatible.
            ## They are unused with RPROP, which is why on_unused_input='warn' is needed here;
            ## otherwise the function is identical to the non-RPROP one and this block could move outside the if clause.
            train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
                  theano.Param(momentum, default = 0.5)],
                  outputs=self.errors,
                  updates=updates,
                  on_unused_input='warn',
                  givens={self.x: train_set_x[index * batch_size:
                                              (index + 1) * batch_size],
                          self.y: train_set_y[index * batch_size:
                                              (index + 1) * batch_size]})

        valid_fn = theano.function([],
              outputs=self.errors,
              on_unused_input='ignore',
              givens={self.x: valid_set_x,
                      self.y: valid_set_y})

        valid_score_i = theano.function([index],
              outputs=self.errors,
              on_unused_input='ignore',
              givens={self.x: valid_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: valid_set_y[index * batch_size:
                                          (index + 1) * batch_size]})
        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        return train_fn, valid_fn
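
Since this variant compiles a per-minibatch train_fn(index, learning_rate, momentum), the caller iterates over minibatch indices within each epoch. A minimal sketch of that loop follows; the stand-in functions and data sizes are hypothetical.

# Hypothetical stand-ins for the compiled functions; real ones come from build_finetune_functions.
train_fn = lambda index, learning_rate=0.0001, momentum=0.5: 0.0
valid_fn = lambda: 0.0

n_train_frames, batch_size = 100000, 256        # hypothetical sizes
n_train_batches = n_train_frames // batch_size

for epoch in range(5):
    train_errors = [train_fn(i, learning_rate=0.001, momentum=0.5)
                    for i in range(n_train_batches)]   # one SGD step per minibatch
    print('epoch %d: train %.4f, valid %.4f'
          % (epoch, sum(train_errors) / len(train_errors), valid_fn()))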
Example #4
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_x_proj, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_x_proj, valid_set_y) = valid_shared_xy

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate') ## osw temp
        momentum = T.fscalar('momentum')           ## osw temp
        ##proj_learning_rate = T.dscalar('proj_learning_rate') ## osw temp
        
        layer_size = len(self.params)
        lr_list = []
        for i in xrange(layer_size):
            lr_list.append(learning_rate)

        ##top 2 layers use a smaller learning rate
        if layer_size > 4:
            for i in range(layer_size-4, layer_size):
                lr_list[i] = learning_rate * 0.5

        # compute list of fine-tuning updates
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        def make_updates_plain(param_list, delta_param_list, gparam_list, lr_list, params_to_update):
            # momentum SGD updates, restricted to the parameter indices listed in params_to_update
            updates = OrderedDict()
            for i, (dparam, gparam, lrate) in enumerate(zip(delta_param_list, gparam_list, lr_list)):
                if i in params_to_update:
                    updates[dparam] = momentum * dparam - gparam * lrate
            for i, (dparam, param) in enumerate(zip(delta_param_list, param_list)):
                if i in params_to_update:
                    updates[param] = param + updates[dparam]
            return updates
            
    
        ## Define updates over various subsets of model parameters. These will be used
        ## in various compiled training/inference functions.
        
        ## As a guide to the structure of params, params for 2 hidden layers, projection, 
        ## first split layer, will look like this:
        # i:   0       1    2     3    4   5    6    7
        ##   [W_proj; W_1a, W_1b, b_1; W_2 b_2; W_o, b_o]

        '''
        updates -- all params
        subword_updates: exclude parameters at 0 and 2  -- proj. weights and proj. half of split layer
        word_updates: exclude all but the word half of the split layer, and bias of that layer, and projection
        projection_updates: exclude all but parameters at 0 -- projection layer
        '''
        
        all_params = range(len(self.params))
        subword_params = [i for i in all_params if i not in [0,2]]
        word_params = [0,2,3]
        projection_params = [0]
        

        if self.use_rprop:
            print '========USING RPROP ========='
            updates = compile_RPROP_train_function(self, gparams)
            subword_updates = compile_RPROP_train_function(self, gparams, params_to_update=subword_params)
            word_updates = compile_RPROP_train_function(self, gparams, params_to_update=word_params)
            projection_updates = compile_RPROP_train_function(self, gparams, params_to_update=projection_params)
            on_unused_input_value = 'warn'
             
        else:    
            print '========NOT USING RPROP ========='            
            updates = make_updates_plain(self.params, self.delta_params, gparams, lr_list, all_params)
            subword_updates = make_updates_plain(self.params, self.delta_params, gparams, lr_list, subword_params)
            word_updates = make_updates_plain(self.params, self.delta_params, gparams, lr_list, word_params)
            projection_updates = make_updates_plain(self.params, self.delta_params, gparams, lr_list, projection_params)
            on_unused_input_value = 'raise'  ## Theano's default 
            
        ##### OLDER VERSION:--
        '''
        ## All updates:
        updates = OrderedDict()
        layer_index = 0
        for dparam, gparam in zip(self.delta_params, gparams):
            updates[dparam] = momentum * dparam - gparam * lr_list[layer_index]
            layer_index += 1

        for dparam, param in zip(self.delta_params, self.params):
            updates[param] = param + updates[dparam]

        ## These updates exclude parameters at 0 and 2  -- proj. weights and proj. half of split layer
        subword_updates = OrderedDict()
        for (i, (dparam, gparam)) in enumerate(zip(self.delta_params, gparams)):
            if i not in [0,2]:  ## proj weights and proj half of split layer
                subword_updates[dparam] = momentum * dparam - gparam * lr_list[i]

        for (i, (dparam, param)) in enumerate(zip(self.delta_params, self.params)):
            if i not in [0,2]:  ## proj weights and proj half of split layer
                subword_updates[param] = param + subword_updates[dparam]

        ## These updates exclude parameters at 1 -- subword half of split layer
        ### NO!!! -- just the word half of the split layer, and bias of that layer
        word_updates = OrderedDict()
        for (i, (dparam, gparam)) in enumerate(zip(self.delta_params, gparams)):
            if i in [0,2,3]:  
                word_updates[dparam] = momentum * dparam - gparam * lr_list[i]

        for (i, (dparam, param)) in enumerate(zip(self.delta_params, self.params)):
            if i in [0,2,3]: 
                word_updates[param] = param + word_updates[dparam]


        ## These updates exclude all but parameters at 0 -- projection layer
        projection_updates = OrderedDict()
        for (i, (dparam, gparam)) in enumerate(zip(self.delta_params, gparams)):
            if i == 0: 
                projection_updates[dparam] = momentum * dparam - gparam * lr_list[i]

        for (i, (dparam, param)) in enumerate(zip(self.delta_params, self.params)):
            if i == 0: 
                projection_updates[param] = param + projection_updates[dparam]
        '''


        ## Update all params -- maybe never used:
        print 'compile train_all_fn'
        train_all_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
              theano.Param(momentum, default = 0.5)],
              outputs=self.errors,
              updates=updates,
              on_unused_input=on_unused_input_value,
              givens={self.x: train_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.x_proj: train_set_x_proj[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: train_set_y[index * batch_size:
                                          (index + 1) * batch_size]})
                     
        ## Update all but word-projection part of split first hidden layer and projection weights  
        print 'compile train_subword_fn'                        
        train_subword_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
              theano.Param(momentum, default = 0.5)],
              outputs=self.errors,
              updates=subword_updates,
              on_unused_input=on_unused_input_value,              
              givens={self.x: train_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.x_proj: train_set_x_proj[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: train_set_y[index * batch_size:
                                          (index + 1) * batch_size]})

        print 'compile train_word_fn' 
        train_word_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
              theano.Param(momentum, default = 0.5)],
              outputs=self.errors,
              updates=word_updates,
              on_unused_input=on_unused_input_value,              
              givens={self.x: train_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.x_proj: train_set_x_proj[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: train_set_y[index * batch_size:
                                          (index + 1) * batch_size]})                                          

        print 'compile infer_projections_fn -- NB: to operate by default on validation set' 
        infer_projections_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
              theano.Param(momentum, default = 0.5)],
              outputs=self.errors,
              updates=projection_updates,
              on_unused_input=on_unused_input_value,              
              givens={self.x: valid_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.x_proj: valid_set_x_proj[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: valid_set_y[index * batch_size:
                                          (index + 1) * batch_size]})                                                                               
                                  
                           
                                          
        valid_fn = theano.function([], 
              outputs=self.errors,
              givens={self.x: valid_set_x,
                      self.x_proj: valid_set_x_proj,
                      self.y: valid_set_y})

        valid_score_i = theano.function([index], 
              outputs=self.errors,
              givens={self.x: valid_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.x_proj: valid_set_x_proj[index * batch_size:
                                          (index + 1) * batch_size],                                        
                      self.y: valid_set_y[index * batch_size:
                                          (index + 1) * batch_size]})
        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        print 'finished Theano function compilation'  
        return train_all_fn, train_subword_fn, train_word_fn, infer_projections_fn, valid_fn, valid_score_i
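
One plausible way to drive the six returned callables is sketched below; the alternating schedule, stand-in functions and minibatch counts are purely illustrative and are not taken from the original calling recipe.

# Hypothetical stand-ins for the six callables returned above; the real ones come from
# build_finetune_functions(train_shared_xy, valid_shared_xy, batch_size).
stub = lambda *args, **kwargs: 0.0
train_all_fn = train_subword_fn = train_word_fn = stub
infer_projections_fn = valid_fn = valid_score_i = stub

n_train_batches, n_valid_batches = 400, 40      # hypothetical minibatch counts
lr, mom = 0.001, 0.5

# One illustrative pass over the training data, alternating parameter groups...
for i in range(n_train_batches):
    train_subword_fn(i, lr, mom)   # all params except indices 0 and 2 (projection weights, projection half of split layer)
    train_word_fn(i, lr, mom)      # only params at indices 0, 2 and 3

# ...then re-estimate projections; this function is wired to the validation set by default.
for i in range(n_valid_batches):
    infer_projections_fn(i, lr, mom)
print('validation error: %.4f' % valid_fn())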
Example #5
    def build_finetune_functions_S2SPF(self,
                                       train_shared_xydf,
                                       valid_shared_xydf,
                                       layer_index=6):
        """ This function is to build finetune functions and to update gradients
        
        :param train_shared_xy: theano shared variable for input and output training data 
        :type train_shared_xy: tuple of shared variable
        :param valid_shared_xy: theano shared variable for input and output development data
        :type valid_shared_xy: tuple of shared variable
        :returns: finetune functions for training and development
        
        """

        (train_set_x, train_set_y, train_set_d,
         train_set_f) = train_shared_xydf
        (valid_set_x, valid_set_y, valid_set_d,
         valid_set_f) = valid_shared_xydf

        lr = T.scalar('lr', dtype=theano.config.floatX)
        mom = T.scalar('mom', dtype=theano.config.floatX)  # momentum

        cost = self.finetune_cost  # + self.L2_reg * self.L2_sqr

        params = self.params
        gparams = T.grad(cost, params)

        # count the parameters belonging to the first layer_index (encoder) layers
        encoder_params = 0
        for layer in range(layer_index):
            encoder_params += len(self.rnn_layers[layer].params)

        # use optimizer
        if self.optimizer == 'sgd':
            # zip pairs each parameter with its corresponding gradient
            updates = OrderedDict()

            for i, (param, gparam) in enumerate(zip(params, gparams)):
                weight_update = self.updates[param]
                if i >= encoder_params:
                    upd = mom * weight_update - lr * gparam
                else:
                    # parameters in the encoder layers are trained with a doubled learning rate
                    upd = mom * weight_update - (lr * 2) * gparam
                updates[weight_update] = upd
                updates[param] = param + upd

        elif self.optimizer == 'adam':
            updates = compile_ADAM_train_function(self,
                                                  gparams,
                                                  learning_rate=lr)
        elif self.optimizer == 'rprop':
            updates = compile_RPROP_train_function(self, gparams)
        else:
            logger.critical(
                "This optimizer: %s is not supported right now! Please use one of the following: sgd, adam, rprop"
                % (self.optimizer))
            sys.exit(1)

        train_model = theano.function(inputs=[lr, mom],
                                      outputs=self.errors,
                                      updates=updates,
                                      givens={
                                          self.x: train_set_x,
                                          self.y: train_set_y,
                                          self.d: train_set_d,
                                          self.f: train_set_f,
                                          self.is_train: np.cast['int32'](1)
                                      },
                                      on_unused_input='ignore')

        valid_model = theano.function(inputs=[],
                                      outputs=self.errors,
                                      givens={
                                          self.x: valid_set_x,
                                          self.y: valid_set_y,
                                          self.d: valid_set_d,
                                          self.f: valid_set_f,
                                          self.is_train: np.cast['int32'](0)
                                      },
                                      on_unused_input='ignore')

        return train_model, valid_model
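
A framework-free numeric sketch of the SGD branch above, only to make the encoder/decoder learning-rate split concrete; all values below are made up.

# Toy numbers: gradients for a model whose first two parameters sit in encoder layers.
grads = [0.10, -0.20, 0.05, 0.30]   # hypothetical gradients
prev_updates = [0.0] * len(grads)   # plays the role of self.updates[param]
encoder_params = 2                  # parameters 0..1 belong to the first layer_index layers
lr, mom = 0.01, 0.9

new_updates = []
for i, (g, prev) in enumerate(zip(grads, prev_updates)):
    step = lr if i >= encoder_params else lr * 2   # encoder parameters get a doubled learning rate
    new_updates.append(mom * prev - step * g)
print(new_updates)   # approximately [-0.002, 0.004, -0.0005, -0.003]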
Example #6
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')         

        layer_size = len(self.params)
        lr_list = []
        for i in xrange(layer_size):
            lr_list.append(learning_rate)

        ##top 2 layers use a smaller learning rate
        if layer_size > 4:
            for i in range(layer_size-4, layer_size):
                lr_list[i] = learning_rate * 0.5

        # compute list of fine-tuning updates
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        if self.use_rprop == 0:
        
            updates = OrderedDict()
            layer_index = 0
            for dparam, gparam in zip(self.delta_params, gparams):
                updates[dparam] = momentum * dparam - gparam * lr_list[layer_index]
                layer_index += 1

            for dparam, param in zip(self.delta_params, self.params):
                updates[param] = param + updates[dparam]

            train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
                  theano.Param(momentum, default = 0.5)],
                  outputs=self.errors,
                  updates=updates,
                  on_unused_input='ignore',
                  givens={self.x: train_set_x[index * batch_size:
                                              (index + 1) * batch_size],
                          self.y: train_set_y[index * batch_size:
                                              (index + 1) * batch_size]})

        elif self.use_rprop:        
            updates = compile_RPROP_train_function(self, gparams)
            
            ## Retain learning rate and momentum so the interface stays backwards compatible.
            ## They are unused with RPROP, which is why on_unused_input='warn' is needed here;
            ## otherwise the function is identical to the non-RPROP one and this block could move outside the if clause.
            train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001),
                  theano.Param(momentum, default = 0.5)],
                  outputs=self.errors,
                  updates=updates,
                  on_unused_input='warn',
                  givens={self.x: train_set_x[index * batch_size:
                                              (index + 1) * batch_size],
                          self.y: train_set_y[index * batch_size:
                                              (index + 1) * batch_size]})   
                                                                                
        valid_fn = theano.function([], 
              outputs=self.errors,
              on_unused_input='ignore',              
              givens={self.x: valid_set_x,
                      self.y: valid_set_y})

        valid_score_i = theano.function([index], 
              outputs=self.errors,
              on_unused_input='ignore',              
              givens={self.x: valid_set_x[index * batch_size:
                                          (index + 1) * batch_size],
                      self.y: valid_set_y[index * batch_size:
                                          (index + 1) * batch_size]})
        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        return train_fn, valid_fn
Example #7
    def build_finetune_functions_S2SPF(self, train_shared_xydf, valid_shared_xydf, layer_index=6):
        """ This function is to build finetune functions and to update gradients
        
        :param train_shared_xy: theano shared variable for input and output training data 
        :type train_shared_xy: tuple of shared variable
        :param valid_shared_xy: theano shared variable for input and output development data
        :type valid_shared_xy: tuple of shared variable
        :returns: finetune functions for training and development
        
        """

        (train_set_x, train_set_y, train_set_d, train_set_f) = train_shared_xydf
        (valid_set_x, valid_set_y, valid_set_d, valid_set_f) = valid_shared_xydf

        lr = T.scalar('lr', dtype = theano.config.floatX)
        mom = T.scalar('mom', dtype = theano.config.floatX)  # momentum

        cost = self.finetune_cost #+ self.L2_reg * self.L2_sqr

        params = self.params
        gparams = T.grad(cost, params)

        encoder_params = 0
        for layer in range(layer_index):
            encoder_params += len(self.rnn_layers[layer].params)

        # use optimizer
        if self.optimizer=='sgd':
            # zip pairs each parameter with its corresponding gradient
            updates = OrderedDict()

            for i, (param, gparam) in enumerate(zip(params, gparams)):
                weight_update = self.updates[param]
                if i >= encoder_params:
                    upd = mom * weight_update - lr * gparam
                else:
                    upd = mom * weight_update - (lr*2) * gparam
                updates[weight_update] = upd
                updates[param] = param + upd

        elif self.optimizer=='adam':
            updates = compile_ADAM_train_function(self, gparams, learning_rate=lr)
        elif self.optimizer=='rprop':
            updates = compile_RPROP_train_function(self, gparams)
        else: 
            logger.critical("This optimizer: %s is not supported right now! \n Please use one of the following: sgd, adam, rprop\n" %(self.optimizer))
            sys.exit(1)


        train_model = theano.function(inputs = [lr, mom],  
                                      outputs = self.errors,
                                      updates = updates,
                                      givens = {self.x: train_set_x, 
                                                self.y: train_set_y,
                                                self.d: train_set_d,
                                                self.f: train_set_f,
                                                self.is_train: np.cast['int32'](1)}, on_unused_input='ignore')


        valid_model = theano.function(inputs = [],
                                      outputs = self.errors,
                                      givens = {self.x: valid_set_x,
                                                self.y: valid_set_y,
                                                self.d: valid_set_d,
                                                self.f: valid_set_f,
                                                self.is_train: np.cast['int32'](0)}, on_unused_input='ignore')
 
        return  train_model, valid_model
Example #8
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, use_lhuc=False, layer_index=0):
        """ This function is to build finetune functions and to update gradients

        :param train_shared_xy: theano shared variable for input and output training data
        :type train_shared_xy: tuple of shared variable
        :param valid_shared_xy: theano shared variable for input and output development data
        :type valid_shared_xy: tuple of shared variable
        :returns: finetune functions for training and development

        """

        logger = logging.getLogger("DNN initialization")

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        lr = T.scalar('lr', dtype = theano.config.floatX)
        mom = T.scalar('mom', dtype = theano.config.floatX)  # momentum

        cost = self.finetune_cost #+ self.L2_reg * self.L2_sqr
        
        ## added for LHUC
        if use_lhuc:
            # LHUC adapts only the scaling parameters, which are named 'c'
            self.lhuc_params = []
            for p in self.params:
                if p.name == 'c':
                    self.lhuc_params.append(p)
            params = self.lhuc_params
            gparams = T.grad(cost, params)
        else:
            params = self.params
            gparams = T.grad(cost, params)


        freeze_params = 0
        for layer in range(layer_index):
            freeze_params += len(self.rnn_layers[layer].params)

        # use optimizer
        if self.optimizer=='sgd':
            # zip pairs each parameter with its corresponding gradient
            updates = OrderedDict()

            for i, (param, gparam) in enumerate(zip(params, gparams)):
                weight_update = self.updates[param]
                upd = mom * weight_update - lr * gparam
                updates[weight_update] = upd

                # only parameters beyond the frozen layers receive a weight update
                if i >= freeze_params:
                    updates[param] = param + upd

        elif self.optimizer=='adam':
            updates = compile_ADAM_train_function(self, gparams, learning_rate=lr)
        elif self.optimizer=='rprop':
            updates = compile_RPROP_train_function(self, gparams)
        else: 
            logger.critical("This optimizer: %s is not supported right now! \n Please use one of the following: sgd, adam, rprop\n" %(self.optimizer))
            sys.exit(1)

        train_model = theano.function(inputs = [lr, mom],  #index, batch_size
                                      outputs = self.errors,
                                      updates = updates,
                                      givens = {self.x: train_set_x, #[index*batch_size:(index + 1)*batch_size]
                                                self.y: train_set_y,
                                                self.is_train: np.cast['int32'](1)}, on_unused_input='ignore')


        valid_model = theano.function(inputs = [],
                                      outputs = self.errors,
                                      givens = {self.x: valid_set_x,
                                                self.y: valid_set_y,
                                                self.is_train: np.cast['int32'](0)}, on_unused_input='ignore')

        return  train_model, valid_model
Example #9
    def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size, \
                                                     return_valid_score_i=False):

        (train_set_x, train_set_y) = train_shared_xy
        (valid_set_x, valid_set_y) = valid_shared_xy

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.fscalar('learning_rate')
        momentum = T.fscalar('momentum')

        layer_size = len(self.params)
        lr_list = []
        for i in xrange(layer_size):
            lr_list.append(learning_rate)

        ## top 2 layers use a smaller learning rate (hard-coded for now)
        if layer_size > 4:
            for i in range(layer_size - 4, layer_size):
                lr_list[i] = learning_rate * 0.5

        # compute list of fine-tuning updates
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        if self.use_rprop == 0:

            updates = theano.compat.python2x.OrderedDict()
            layer_index = 0
            for dparam, gparam in zip(self.delta_params, gparams):
                updates[dparam] = momentum * dparam - gparam * lr_list[layer_index]
                layer_index += 1

            for dparam, param in zip(self.delta_params, self.params):
                updates[param] = param + updates[dparam]

            on_unused_input_value = 'raise'  ## Theano's default

        elif self.use_rprop:
            updates = compile_RPROP_train_function(self, gparams)
            on_unused_input_value = 'warn'

        ## Retain learning rate and momentum so the interface stays backwards compatible,
        ## even with RPROP, where they are unused; that is why on_unused_input='warn' is needed there.

        train_fn = theano.function(
            inputs=[
                index,
                theano.Param(learning_rate, default=0.125),
                theano.Param(momentum, default=0.5)
            ],
            outputs=self.errors,
            updates=updates,
            on_unused_input=on_unused_input_value,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        valid_fn = theano.function([],
                                   outputs=self.errors,
                                   givens={
                                       self.x: valid_set_x,
                                       self.y: valid_set_y
                                   })

        valid_score_i = theano.function(
            [index],
            outputs=self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        if return_valid_score_i:
            return train_fn, valid_fn, valid_score_i
        else:
            return train_fn, valid_fn
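
With return_valid_score_i=True the caller can score the validation set minibatch by minibatch, as the internal valid_score() helper does. A minimal sketch with hypothetical stand-ins for the compiled functions:

# Hypothetical stand-ins for the compiled functions returned with return_valid_score_i=True.
train_fn = lambda index, learning_rate=0.125, momentum=0.5: 0.0
valid_fn = lambda: 0.0
valid_score_i = lambda index: float(index % 3)   # per-minibatch validation error (made-up values)

n_valid_batches = 6
per_batch = [valid_score_i(i) for i in range(n_valid_batches)]   # mirrors the inner valid_score() helper
print('per-batch errors: %s, mean: %.3f' % (per_batch, sum(per_batch) / len(per_batch)))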