def train(self, sequences):
    """Implements optimize(..) from the assignment. This trains hid_to_class.

    Notes:
        * This uses mini-batches of size 100, momentum of 0.9, no weight decay,
          and no early stopping.
        * The per-batch gradient (d_phi_by_d_input_to_class) is the gradient, or the
          approximate gradient in the case of CD-1, of the function that we're
          maximizing. Note the contrast with the loss function that we saw in PA3,
          which we were minimizing.

    Args:
        sequences (dict):
            - 'inputs' is a matrix of size <number of input units> by <number of data cases>
            - 'targets' is a matrix of size <number of classes> by <number of data cases>

    Returns:
        (numpy.array) : matrix of weights of the trained model (hid_to_class)
    """
    self.reset_classifier()
    # Calculate the hidden layer representation of the labeled data; rbm_w is input_to_hid.
    hidden_representation = logistic(np.dot(self.rbm_w, sequences['inputs']))
    momentum_speed = np.zeros(self.model_shape)
    for i, (mini_batch_x, mini_batch_y) in enumerate(
            zip(batches(hidden_representation, self.mini_batch_size),
                batches(sequences['targets'], self.mini_batch_size))):
        if i >= self.n_iterations:
            break
        self.fit(mini_batch_x, mini_batch_y)
        momentum_speed = (self.train_momentum * momentum_speed
                          + self.d_phi_by_d_input_to_class)
        self.model += momentum_speed * self.lr_net
    return self.model
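
# A minimal sketch (assumption, not part of the assignment code) of the momentum
# update used in train() above: the speed accumulates the gradient, and the weights
# move along the speed scaled by the learning rate. All names here are hypothetical.
def _momentum_step_sketch(weights, gradient, momentum_speed, momentum=0.9, lr=0.1):
    """Return (updated_weights, updated_speed) for one gradient-ascent step."""
    momentum_speed = momentum * momentum_speed + gradient
    weights = weights + lr * momentum_speed
    return weights, momentum_speed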
def predict_proba(self, inputs):
    """Predict the class scores for the given data inputs.

    Returns:
        (numpy.array) : unnormalized class scores (the inputs to the softmax),
            size <number of classes> by <number of data cases>
    """
    hid_input = np.dot(self.model['inputToHid'], inputs)
    # size: <number of hidden units> by <number of data cases>
    hid_output = logistic(hid_input)
    return np.dot(self.model['hidToClass'], hid_output)
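
# A minimal sketch (assumption): predict_proba() above returns the raw class scores,
# i.e. the inputs to the softmax. The column-wise softmax below would turn those
# scores into normalized class probabilities; `class_scores` is a hypothetical name
# for the array returned by predict_proba().
def _softmax_sketch(class_scores):
    """Column-wise softmax, size <number of classes> by <number of data cases>."""
    shifted = class_scores - np.max(class_scores, axis=0, keepdims=True)  # numerical stability
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)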
def hidden_state_to_visible_probabilities(rbm_w, hidden_state):
    """Takes in the (binary) states of the hidden units and returns the activation
    probabilities of the visible units, conditional on those states.

    Args:
        rbm_w (numpy.array) : a matrix of size <number of hidden units>
            by <number of visible units>
        hidden_state (numpy.array) : a binary matrix of size <number of hidden units>
            by <number of configurations that we're handling in parallel>

    Returns:
        (numpy.array) : Activation probabilities of the visible units, size
            <number of visible units> by <number of configurations that we're
            handling in parallel>.
    """
    return logistic(np.dot(rbm_w.T, hidden_state))
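
# A minimal sketch (assumption) of the complementary direction used in CD-1: given
# (binary) visible states, compute the activation probabilities of the hidden units,
# then sample binary hidden states from them. `visible_state` is a hypothetical
# argument name; `logistic` is assumed to be the module's sigmoid.
def _visible_state_to_hidden_probabilities_sketch(rbm_w, visible_state):
    """size <number of hidden units> by <number of configurations handled in parallel>"""
    return logistic(np.dot(rbm_w, visible_state))


# Example usage of the sketch above (hypothetical variable names):
# hidden_probs = _visible_state_to_hidden_probabilities_sketch(rbm_w, visible_state)
# hidden_state = (np.random.rand(*hidden_probs.shape) < hidden_probs).astype(float)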
def _d_loss_by_d_model(self, inputs, targets):
    """Compute the derivative of the loss with respect to the model parameters.

    Args:
        inputs (numpy.array): matrix of size <number of input units, i.e. NUM_INPUT_UNITS>
            by <number of data cases>
        targets (numpy.array): matrix of size <number of classes, i.e. NUM_CLASSES>
            by <number of data cases>

    The gradients have exactly the same structure as <model>, i.e. matrices
    ret_model['inputToHid'] and ret_model['hidToClass'], but their contents are the
    gradients (d loss by d model parameter) instead of the model parameters. The
    result is flattened with model_to_theta and stored in self.gradient.
    """
    ret_model = dict()
    # First, feed forward the values, capturing the weighted inputs (class_input and
    # hid_input) and activations (class_prob and hid_output) at every layer.
    hid_input = np.dot(self.model['inputToHid'], inputs)
    hid_output = logistic(hid_input)
    class_input = np.dot(self.model['hidToClass'], hid_output)
    class_prob = np.exp(self.predict_log_proba(class_input))

    # Now, back-propagate. Compute the delta error (error_deriv) for the output layer
    # (the third layer).
    error_deriv = class_prob - targets

    # Compute the gradient for the output layer across all training examples, then
    # divide by the training set size for each weight gradient.
    hid_to_output_weights_gradient = np.dot(hid_output, error_deriv.T) / float(
        np.size(hid_output, axis=1))
    ret_model['hidToClass'] = hid_to_output_weights_gradient.T

    # Compute the delta error (backpropagate_error_deriv) for the hidden layer.
    backpropagate_error_deriv = np.dot(self.model['hidToClass'].T, error_deriv)

    # Compute the gradient for the hidden layer across all training examples, then
    # divide by the training set size for each weight gradient.
    input_to_hidden_weights_gradient = np.dot(
        inputs,
        ((1.0 - hid_output) * hid_output * backpropagate_error_deriv).T) / float(
            np.size(hid_output, axis=1))
    ret_model['inputToHid'] = input_to_hidden_weights_gradient.T

    # Add in the weight decay.
    ret_model['inputToHid'] += self.model['inputToHid'] * self.wd_coeff
    ret_model['hidToClass'] += self.model['hidToClass'] * self.wd_coeff

    self.gradient = self.model_to_theta(ret_model)
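
# A minimal sketch (assumption): the hidden-layer delta in _d_loss_by_d_model() uses
# the logistic derivative identity sigma'(x) = sigma(x) * (1 - sigma(x)), which is
# where the (1.0 - hid_output) * hid_output factor comes from. Below is a quick
# finite-difference check of that identity; `logistic` is assumed to be the module's
# sigmoid and `x` is a hypothetical scalar input.
def _check_logistic_derivative_sketch(x=0.3, eps=1e-6):
    numeric = (logistic(x + eps) - logistic(x - eps)) / (2.0 * eps)
    analytic = logistic(x) * (1.0 - logistic(x))
    return abs(numeric - analytic)  # should be close to zero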