def update_weights(self, weight_updates, restriction, restriction_typ):
    ''' This function updates the weight parameters.

    :Parameters:
        weight_updates:  Update for the weight parameter.
                        -type: numpy array [input dim, output dim]

        restriction:     If a scalar is given, the weights are forced after an update not to
                         exceed this value. restriction_typ controls how the values are restricted.
                        -type: scalar or None

        restriction_typ: If a value for the restriction is given, this parameter determines the
                         restriction type: 'Cols', 'Rows', 'Mat' or 'Val' to restrict the column
                         norms, row norms, matrix norm or the absolute values of the matrix.
                        -type: string

    '''
    # Update weights
    self.orginalW += weight_updates
    # Restrict the updated weights
    if numx.isscalar(restriction):
        if restriction > 0:
            if restriction_typ == 'Cols':
                self.orginalW = numxExt.restrict_norms(self.orginalW, restriction, 0)
            if restriction_typ == 'Rows':
                self.orginalW = numxExt.restrict_norms(self.orginalW, restriction, 1)
            if restriction_typ == 'Mat':
                self.orginalW = numxExt.restrict_norms(self.orginalW, restriction, None)
            if restriction_typ == 'Val':
                numx.clip(self.orginalW, -restriction, restriction, out=self.orginalW)
    self.weights = self._convolve(self.orginalW, self.mask)
def update_biases(self, bias_updates, restriction, restriction_typ):
    ''' This function updates the bias parameter.

    :Parameters:
        bias_updates:    Update for the bias parameter.
                        -type: numpy array [1, input dim]

        restriction:     If a scalar is given, the bias update is restricted not to exceed
                         this value.
                        -type: scalar or None

        restriction_typ: Determines the restriction type: 'Cols', 'Rows' and 'Mat' restrict
                         the norm of the update, 'Val' clips its absolute values.
                        -type: string

    '''
    # Restrict the update
    if numx.isscalar(restriction):
        if restriction > 0:
            if restriction_typ in ('Cols', 'Rows', 'Mat'):
                bias_updates = numxExt.restrict_norms(bias_updates, restriction)
            if restriction_typ == 'Val':
                numx.clip(bias_updates, -restriction, restriction, out=bias_updates)
    self.bias += bias_updates
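# --- Illustration (not part of the original classes) -------------------------
# A minimal, standalone numpy sketch of the 'Cols' restriction that
# update_weights() and update_biases() delegate to numxExt.restrict_norms:
# columns whose L2 norm exceeds the threshold are rescaled onto the threshold.
# The helper name below is hypothetical and the exact behaviour of
# numxExt.restrict_norms is an assumption for illustration only.
def _example_restrict_column_norms(matrix, max_norm):
    """ Rescale columns of matrix whose L2 norm exceeds max_norm (sketch). """
    import numpy as np
    norms = np.sqrt(np.sum(matrix ** 2, axis=0))               # one L2 norm per column
    scale = np.minimum(1.0, max_norm / np.maximum(norms, 1e-12))
    return matrix * scale                                       # shrink only the violating columns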
def _adapt_gradient(self, pos_gradients, neg_gradients, batch_size, epsilon, momentum, regL1Norm, regL2Norm,
                    regSparseness, desired_sparseness, mean_hidden_activity, visible_offsets, hidden_offsets,
                    use_centered_gradient, restrict_gradient, restriction_norm):
    ''' This function updates the parameter gradients.

    :Parameters:
        pos_gradients:          Positive gradients.
                               -type: numpy array [parameter index, parameter shape]

        neg_gradients:          Negative gradients.
                               -type: numpy array [parameter index, parameter shape]

        batch_size:             The batch size of the data.
                               -type: float

        epsilon:                The learning rate.
                               -type: numpy array [num parameters]

        momentum:               The momentum term.
                               -type: numpy array [num parameters]

        regL1Norm:              The parameter for the L1 regularization.
                               -type: float

        regL2Norm:              The parameter for the L2 regularization, also known as weight decay.
                               -type: float

        regSparseness:          The parameter for the sparseness regularization.
                               -type: None or float

        desired_sparseness:     Desired average hidden activation or None for no regularization.
                               -type: None or float

        mean_hidden_activity:   Average hidden activation <P(h_i=1|x)>_h_i
                               -type: numpy array [num samples]

        visible_offsets:        If not zero the gradient is centered around this value.
                               -type: float

        hidden_offsets:         If not zero the gradient is centered around this value.
                               -type: float

        use_centered_gradient:  Uses the centered gradient instead of centering.
                               -type: bool

        restrict_gradient:      If a scalar is given the norm of the weight gradient (along the input dim)
                                is restricted to stay below this value.
                               -type: None or float

        restriction_norm:       Restricts the column norm, row norm or matrix norm.
                               -type: string: 'Cols', 'Rows', 'Mat'

    '''
    # Calculate the normal gradient
    gradients = []
    for i in range(self.num_parameters):
        gradients.append((pos_gradients[i] - neg_gradients[i]) / batch_size)

    # Adapt to the centered gradient
    if use_centered_gradient:
        gradients = self._calculate_centered_gradient(gradients, visible_offsets, hidden_offsets)

    # Adapt the parameters
    for i in range(self.num_parameters):
        self.parameter_updates[i] *= momentum[i]
        self.parameter_updates[i] += epsilon[i] * gradients[i]

    # Add sparseness penalty
    if regSparseness != 0:
        if desired_sparseness is not None:
            self.parameter_updates[2] += (epsilon[2] * regSparseness
                                          * (desired_sparseness - mean_hidden_activity))
            #st = numx.clip(mean_hidden_activity,0.001,0.999)
            #st = -desired_sparseness/st+(1.0-desired_sparseness)/(1.0-st)
            #self.parameter_updates[2] -= epsilon[2] * regSparseness * st

    # Add weight decay
    if regL1Norm != 0:
        self.parameter_updates[0] -= epsilon[0] * regL1Norm * numx.sign(self.model.w)
    if regL2Norm != 0:
        self.parameter_updates[0] -= epsilon[0] * regL2Norm * self.model.w

    # Restrict the gradient
    if numx.isscalar(restrict_gradient):
        if restrict_gradient > 0:
            if restriction_norm == 'Cols':
                self.parameter_updates[0] = npExt.restrict_norms(self.parameter_updates[0], restrict_gradient, 0)
            if restriction_norm == 'Rows':
                self.parameter_updates[0] = npExt.restrict_norms(self.parameter_updates[0], restrict_gradient, 1)
            if restriction_norm == 'Mat':
                self.parameter_updates[0] = npExt.restrict_norms(self.parameter_updates[0], restrict_gradient, None)
def _adapt_gradient(self, pos_gradients, neg_gradients, batch_size, epsilon, momentum, reg_l1norm, reg_l2norm,
                    reg_sparseness, desired_sparseness, mean_hidden_activity, visible_offsets, hidden_offsets,
                    use_centered_gradient, restrict_gradient, restriction_norm):
    """ This function updates the parameter gradients.

    :param pos_gradients: Positive gradients.
    :type pos_gradients: numpy array [parameter index, parameter shape]

    :param neg_gradients: Negative gradients.
    :type neg_gradients: numpy array [parameter index, parameter shape]

    :param batch_size: The batch size of the data.
    :type batch_size: float

    :param epsilon: The learning rate.
    :type epsilon: numpy array [num parameters]

    :param momentum: The momentum term.
    :type momentum: numpy array [num parameters]

    :param reg_l1norm: The parameter for the L1 regularization.
    :type reg_l1norm: float

    :param reg_l2norm: The parameter for the L2 regularization, also known as weight decay.
    :type reg_l2norm: float

    :param reg_sparseness: The parameter for the sparseness regularization.
    :type reg_sparseness: None or float

    :param desired_sparseness: Desired average hidden activation or None for no regularization.
    :type desired_sparseness: None or float

    :param mean_hidden_activity: Average hidden activation <P(h_i=1|x)>_h_i
    :type mean_hidden_activity: numpy array [num samples]

    :param visible_offsets: If not zero the gradient is centered around this value.
    :type visible_offsets: float

    :param hidden_offsets: If not zero the gradient is centered around this value.
    :type hidden_offsets: float

    :param use_centered_gradient: Uses the centered gradient instead of centering.
    :type use_centered_gradient: bool

    :param restrict_gradient: If a scalar is given the norm of the weight gradient (along the input dim) is
                              restricted to stay below this value.
    :type restrict_gradient: None, float

    :param restriction_norm: Restricts the column norm, row norm or matrix norm.
    :type restriction_norm: string, 'Cols', 'Rows', 'Mat'
    """
    # Calculate the normal gradient
    gradients = []
    for i in range(self.num_parameters):
        gradients.append((pos_gradients[i] - neg_gradients[i]) / batch_size)

    # Adapt to the centered gradient
    if use_centered_gradient:
        gradients = self._calculate_centered_gradient(gradients, visible_offsets, hidden_offsets)

    # Adapt the parameters
    for i in range(self.num_parameters):
        self.parameter_updates[i] *= momentum[i]
        self.parameter_updates[i] += epsilon[i] * gradients[i]

    # Add sparseness penalty
    if reg_sparseness != 0:
        if desired_sparseness is not None:
            self.parameter_updates[2] += (epsilon[2] * reg_sparseness
                                          * (desired_sparseness - mean_hidden_activity))
            # st = numx.clip(mean_hidden_activity,0.001,0.999)
            # st = -desired_sparseness/st+(1.0-desired_sparseness)/(1.0-st)
            # self.parameter_updates[2] -= epsilon[2] * reg_sparseness * st

    # Add weight decay
    if reg_l1norm != 0:
        self.parameter_updates[0] -= epsilon[0] * reg_l1norm * numx.sign(self.model.w)
    if reg_l2norm != 0:
        self.parameter_updates[0] -= epsilon[0] * reg_l2norm * self.model.w

    # Restrict the gradient
    if numx.isscalar(restrict_gradient):
        if restrict_gradient > 0:
            if restriction_norm == 'Cols':
                self.parameter_updates[0] = numxext.restrict_norms(self.parameter_updates[0], restrict_gradient, 0)
            if restriction_norm == 'Rows':
                self.parameter_updates[0] = numxext.restrict_norms(self.parameter_updates[0], restrict_gradient, 1)
            if restriction_norm == 'Mat':
                self.parameter_updates[0] = numxext.restrict_norms(self.parameter_updates[0], restrict_gradient, None)
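# --- Illustration (not part of the original trainer) -------------------------
# A minimal numpy sketch of the weight update rule assembled in
# _adapt_gradient() above: the averaged (positive - negative) gradient is added
# to a momentum-scaled previous update and L2 weight decay is subtracted.
# All names below are hypothetical and for illustration only.
def _example_weight_update(prev_update, pos_grad, neg_grad, weights,
                           batch_size, epsilon, momentum, reg_l2norm):
    grad = (pos_grad - neg_grad) / batch_size                   # averaged gradient estimate
    update = momentum * prev_update + epsilon * grad            # momentum term plus learning rate
    update -= epsilon * reg_l2norm * weights                    # L2 weight decay
    return update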
def _train(self, data, epsilon, momentum, update_visible_offsets, update_hidden_offsets, corruptor, reg_L1Norm,
           reg_L2Norm, reg_sparseness, desired_sparseness, reg_contractive, reg_slowness, data_next,
           restrict_gradient, restriction_norm):
    ''' The training for one batch is performed using gradient descent.

    :Parameters:
        data:                    The training data.
                                -type: numpy array [num samples, input dim]

        epsilon:                 The learning rate.
                                -type: numpy array [num parameters]

        momentum:                The momentum term.
                                -type: numpy array [num parameters]

        update_visible_offsets:  The update step size for the model's visible offsets.
                                 Good value if the functionality is used: 0.001
                                -type: float

        update_hidden_offsets:   The update step size for the model's hidden offsets.
                                 Good value if the functionality is used: 0.001
                                -type: float

        corruptor:               Defines if and how the data gets corrupted
                                 (e.g. Gauss noise, dropout, max out).
                                -type: corruptor

        reg_L1Norm:              The parameter for the L1 regularization.
                                -type: float

        reg_L2Norm:              The parameter for the L2 regularization, also known as weight decay.
                                -type: float

        reg_sparseness:          The parameter (epsilon) for the sparseness regularization.
                                -type: float

        desired_sparseness:      Desired average hidden activation.
                                -type: float

        reg_contractive:         The parameter (epsilon) for the contractive regularization.
                                -type: float

        reg_slowness:            The parameter (epsilon) for the slowness regularization.
                                -type: float

        data_next:               The next training data in the sequence.
                                -type: numpy array [num samples, input dim]

        restrict_gradient:       If a scalar is given the norm of the weight gradient is restricted to stay
                                 below this value.
                                -type: None, float

        restriction_norm:        Restricts the column norm, row norm or matrix norm.
                                -type: string: 'Cols', 'Rows', 'Mat'

    '''
    x_next = None
    h_next = None
    a_h_next = None
    #orginal_h = None
    # Forward propagation, if a corruptor is given the data gets corrupted
    if corruptor is None:
        x = data
        x_next = data_next
        a_h, h = self.model._encode(x)
        #orginal_h = h
        a_y, y = self.model._decode(h)
        if reg_slowness > 0.0 and data_next is not None:
            a_h_next, h_next = self.model._encode(x_next)
    else:
        #_,orginal_h = self.model._encode(data)
        if isinstance(corruptor, list):
            x = corruptor[0].corrupt(data)
            a_h, h = self.model._encode(x)
            h = corruptor[1].corrupt(h)
            a_y, y = self.model._decode(h)
            y = corruptor[2].corrupt(y)
            if reg_slowness > 0.0 and data_next is not None:
                x_next = corruptor[0].corrupt(data_next)
                a_h_next, h_next = self.model._encode(x_next)
        else:
            x = corruptor.corrupt(data)
            a_h, h = self.model._encode(x)
            h = corruptor.corrupt(h)
            a_y, y = self.model._decode(h)
            y = corruptor.corrupt(y)
            if reg_slowness > 0.0 and data_next is not None:
                x_next = corruptor.corrupt(data_next)
                a_h_next, h_next = self.model._encode(x_next)

    # Update offsets
    mean_h = 0.0
    mean_x = 0.0
    if update_visible_offsets > 0.0:
        mean_x = numx.mean(x, axis=0).reshape(1, self.model.input_dim)
    if update_hidden_offsets > 0.0:
        mean_h = numx.mean(h, axis=0).reshape(1, self.model.output_dim)
    self.model.update_offsets(mean_x, mean_h, update_visible_offsets, update_hidden_offsets)

    # Get the gradients for the model
    gradients = self.model._get_gradients(data, a_h, h, a_y, y, reg_contractive, reg_sparseness,
                                          desired_sparseness, reg_slowness, x_next, a_h_next, h_next)

    # Adapt the parameters
    for i in range(self.num_parameters):
        self.parameter_updates[i] *= momentum[i]
        self.parameter_updates[i] -= epsilon[i] * gradients[i]

    # Add weight decay L1 norm
    if reg_L1Norm != 0:
        self.parameter_updates[0] -= epsilon[0] * reg_L1Norm * numx.sign(self.model.w)

    # Add weight decay L2 norm
    if reg_L2Norm != 0:
        self.parameter_updates[0] -= epsilon[0] * reg_L2Norm * self.model.w

    # Restrict the gradient
    if numx.isscalar(restrict_gradient):
        if restrict_gradient > 0:
            if restriction_norm == 'Cols':
                typ = 0
            if restriction_norm == 'Rows':
                typ = 1
            if restriction_norm == 'Mat':
                typ = None
            self.parameter_updates[0] = npExt.restrict_norms(self.parameter_updates[0], restrict_gradient, typ)

    # Update the parameters with the calculated gradient
    self.model.update_parameters(self.parameter_updates)
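# --- Illustration (not part of the original trainer) -------------------------
# _train() only requires that a corruptor exposes a corrupt() method; it is
# applied above to the input, the hidden activation and the reconstruction
# (or, if a list is passed, one corruptor per stage). A minimal Gaussian-noise
# corruptor satisfying that interface could look like this; the class name and
# the std parameter are assumptions for illustration only.
class _ExampleGaussianCorruptor(object):
    def __init__(self, std=0.1):
        self.std = std

    def corrupt(self, data):
        import numpy as np
        return data + self.std * np.random.randn(*data.shape)   # additive Gaussian noise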
def train(self, data, labels, costs, reg_costs, epsilon, update_offsets, corruptor, reg_L1Norm, reg_L2Norm,
          reg_sparseness, desired_sparseness, costs_sparseness, restrict_gradient, restriction_norm):
    ''' Train function which performs one step of gradient descent.
        Use check_setup() to check whether your training setup is valid.

    :Parameters:
        data:                Training data as numpy array.
                            -type: numpy array [batchsize, input dim]

        labels:              List of numpy arrays or None if a layer has no cost. The last layer has to have
                             a cost and thus the last item in labels has to be an array.
                            -type: list of numpy arrays and/or Nones

        costs:               List of cost functions. The last layer has to have a cost.
                            -type: pydeep.base.costfunction

        reg_costs:           List of scalars controlling the strength of the cost functions.
                             The last entry is usually 1.0.
                            -type: list of scalars

        epsilon:             List of learning rates.
                            -type: list of scalars

        update_offsets:      List of shifting factors for centering.
                            -type: list of scalars

        corruptor:           List of corruptor objects, e.g. Dropout.
                            -type: list of pydeep.base.corruptors

        reg_L1Norm:          List of L1 norm regularization terms.
                            -type: list of scalars

        reg_L2Norm:          List of L2 norm regularization terms.
                            -type: list of scalars

        reg_sparseness:      List of scalars controlling the strength of the sparseness regularization.
                            -type: list of scalars

        desired_sparseness:  List of scalars / target sparseness.
                            -type: list of scalars

        costs_sparseness:    List of sparseness costs and/or None values.
                            -type: list of pydeep.base.costfunction and/or None

        restrict_gradient:   Maximal norm for the gradient or None.
                            -type: list of scalars

        restriction_norm:    Defines how the weights will be restricted: 'Cols', 'Rows' or 'Mat'.
                            -type: string: 'Cols', 'Rows' or 'Mat'

    '''
    # Forward propagate through the entire network, possibly using corrupted states
    output = self.model.forward_propagate(data=data, corruptor=corruptor)

    # Reparameterize the network to the new mean - update all offsets and biases
    for l in range(len(self.model.layers)):
        self.model.layers[l].update_offsets(shift=update_offsets[l], new_mean=None)

    deltas = None
    # Go from the top layer to the bottom layer
    for l in range(self.model.num_layers - 1, -1, -1):
        # Calculate the delta values
        deltas = self.model.layers[l]._get_deltas(deltas=deltas,
                                                  labels=labels[l],
                                                  cost=costs[l],
                                                  reg_cost=reg_costs[l],
                                                  desired_sparseness=desired_sparseness[l],
                                                  cost_sparseness=costs_sparseness[l],
                                                  reg_sparseness=reg_sparseness[l])
        # Backpropagate the error if it is not the first/bottom-most layer
        if l > 0:
            deltas = self.model.layers[l]._backward_propagate()

        # Now we are ready to calculate the gradient
        grad = self.model.layers[l]._calculate_gradient()

        # Possibly add weight decay terms
        if reg_L1Norm[l] > 0.0:
            grad[0] += reg_L1Norm[l] * numx.sign(self.model.layers[l].weights)
        if reg_L2Norm[l] > 0.0:
            grad[0] += reg_L2Norm[l] * self.model.layers[l].weights

        # Apply the learning rate by the AdaGrad rule
        self._old_grad[l][0] += grad[0] ** 2
        self._old_grad[l][1] += grad[1] ** 2
        grad[0] /= (self._numerical_stabilty + numx.sqrt(self._old_grad[l][0]))
        grad[1] /= (self._numerical_stabilty + numx.sqrt(self._old_grad[l][1]))
        grad[0] *= epsilon[l]
        grad[1] *= epsilon[l]

        # Restrict the gradient if desired
        if numx.isscalar(restrict_gradient):
            if restrict_gradient > 0:
                if restriction_norm == 'Cols':
                    grad[0] = numxExt.restrict_norms(grad[0], restrict_gradient, 0)
                if restriction_norm == 'Rows':
                    grad[0] = numxExt.restrict_norms(grad[0], restrict_gradient, 1)
                if restriction_norm == 'Mat':
                    grad[0] = numxExt.restrict_norms(grad[0], restrict_gradient, None)

        # Update the model parameters
        self.model.layers[l].update_parameters([grad[0], grad[1]])
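# --- Illustration (not part of the original trainer) -------------------------
# A minimal numpy sketch of the AdaGrad scaling used in train() above: squared
# gradients are accumulated per parameter and the current gradient is divided
# by the square root of that running sum before the learning rate is applied.
# Names are hypothetical and for illustration only.
def _example_adagrad_step(grad, grad_history, epsilon, stability=1e-8):
    import numpy as np
    grad_history = grad_history + grad ** 2                     # accumulate squared gradients
    scaled = epsilon * grad / (stability + np.sqrt(grad_history))
    return scaled, grad_history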