Example #1
    def train(self, sequences):
        """Implements optimize(..) from assignment. This trains a hid_to_class.

        Notes:
        * This uses mini-batches of size 100, momentum of 0.9, no weight decay, and no early stopping.

        Args:
            model_shape (tuple) : is the shape of the array of weights.
            gradient_function   : a function that takes parameters <model> and <data> and returns the gradient
                (or approximate gradient in the case of CD-1) of the function that we're maximizing.
                Note the contrast with the loss function that we saw in PA3, which we were minimizing.
                The returned gradient is an array of the same shape as the provided <model> parameter.

        Returns:
            (numpy.array) : matrix of weights of the trained model (hid_to_class)
        """
        self.reset_classifier()
        # calculate the hidden layer representation of the labeled data, rbm_w is input_to_hid
        hidden_representation = logistic(
            np.dot(self.rbm_w, sequences['inputs']))
        momentum_speed = np.zeros(self.model_shape)
        for i, (mini_batch_x, mini_batch_y) in enumerate(
                zip(batches(hidden_representation, self.mini_batch_size),
                    batches(sequences['targets'], self.mini_batch_size))):
            if i >= self.n_iterations:
                break
            self.fit(mini_batch_x, mini_batch_y)
            momentum_speed = self.train_momentum * momentum_speed + self.d_phi_by_d_input_to_class
            self.model += momentum_speed * self.lr_net
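The train(..) method above relies on two helpers that are not shown in this example, logistic(..) and batches(..). The sketch below is only a guess at minimal implementations, assuming the column-per-case layout used throughout; the assignment's real batches(..) may differ (for example, it could wrap around the data when n_iterations exceeds the number of available mini-batches):

    import numpy as np

    def logistic(x):
        # element-wise sigmoid: 1 / (1 + exp(-x))
        return 1.0 / (1.0 + np.exp(-x))

    def batches(data, mini_batch_size):
        # yield successive mini-batches of columns from a <units> by <cases> matrix
        n_cases = data.shape[1]
        for start in range(0, n_cases, mini_batch_size):
            yield data[:, start:start + mini_batch_size]
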
    def _d_loss_by_d_model(self, inputs, targets):
        """Compute derivative of loss.
        Args:
            data (dict):
                    - 'inputs' is a matrix of size <number of inputs i.e. NUM_INPUT_UNITS> by <number of data cases>
                    - 'targets' is a matrix of size <number of classes i.e. NUM_CLASSES> by <number of data cases>

        Returns:
            dict:   The returned object is supposed to be exactly like parameter <model>,
                    i.e. it has fields ret['inputToHid'] and ret['hidToClass'].
                    However, the contents of those matrices are gradients (d loss by d model parameter),
                    instead of model parameters.
        """
        ret_model = dict()

        hid_input = np.dot(self.model['inputToHid'], inputs)
        hid_output = logistic(hid_input)
        class_input = np.dot(self.model['hidToClass'], hid_output)
        class_prob = np.exp(self.predict_log_proba(class_input))

        # derivative of the cross-entropy loss with respect to the class input (softmax output minus targets)
        error_deriv = class_prob - targets
        hid_to_output_weights_gradient = np.dot(hid_output, error_deriv.T) / float(np.size(hid_output, axis=1))
        ret_model['hidToClass'] = hid_to_output_weights_gradient.T

        backpropagate_error_deriv = np.dot(self.model['hidToClass'].T, error_deriv)
        input_to_hidden_weights_gradient = np.dot(
            inputs,
            ((1.0 - hid_output) * hid_output * backpropagate_error_deriv).T
        ) / float(np.size(hid_output, axis=1))
        ret_model['inputToHid'] = input_to_hidden_weights_gradient.T

        # add the weight decay gradient: E = 1/2 * wd_coefficient * theta^2, so dE/dtheta = wd_coefficient * theta
        ret_model['inputToHid'] += self.model['inputToHid'] * self.wd_coeff
        ret_model['hidToClass'] += self.model['hidToClass'] * self.wd_coeff
        self.gradient = self.model_to_theta(ret_model)
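Since _d_loss_by_d_model only stores a flattened gradient in self.gradient, a finite-difference check is a convenient way to validate it. The sketch below is illustrative only: check_gradient, the loss_fn callable, and the tolerances are assumptions, not part of this example.

    import numpy as np

    def check_gradient(loss_fn, theta, analytic_grad, n_checks=20, eps=1e-4, seed=0):
        # loss_fn       : hypothetical callable mapping a flattened parameter vector to a scalar loss
        # theta         : flattened parameter vector (e.g. the output of model_to_theta)
        # analytic_grad : flattened gradient of the same shape as theta (e.g. self.gradient)
        rng = np.random.default_rng(seed)
        for idx in rng.choice(theta.size, size=min(n_checks, theta.size), replace=False):
            shift = np.zeros_like(theta)
            shift[idx] = eps
            # central difference approximation of d loss / d theta[idx]
            numeric = (loss_fn(theta + shift) - loss_fn(theta - shift)) / (2.0 * eps)
            if not np.isclose(numeric, analytic_grad[idx], rtol=1e-3, atol=1e-6):
                raise ValueError("gradient mismatch at index %d: numeric %g vs analytic %g"
                                 % (idx, numeric, analytic_grad[idx]))
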
    def predict_proba(self, inputs):
        """Predict the probability of each class given data inputs.

        Returns:
            (numpy.array) : probability of classes
        """
        hid_input = np.dot(self.model['inputToHid'], inputs)
        hid_output = logistic(hid_input)  # size: <number of hidden units> by <number of data cases>
        return np.dot(self.model['hidToClass'], hid_output)
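predict_proba(..) returns the class input, i.e. the scores before softmax normalization; _d_loss_by_d_model obtains probabilities via np.exp(self.predict_log_proba(class_input)). The helper below is a guess at what that normalization might look like (a numerically stable log-softmax over the class dimension); it is not the example's actual predict_log_proba:

    import numpy as np

    def log_softmax_over_rows(class_input):
        # class_input is <number of classes> by <number of data cases>;
        # subtracting the per-column maximum keeps exp(..) from overflowing
        shifted = class_input - np.max(class_input, axis=0, keepdims=True)
        return shifted - np.log(np.sum(np.exp(shifted), axis=0, keepdims=True))

    # class probabilities would then be np.exp(log_softmax_over_rows(class_input)),
    # mirroring the class_prob = np.exp(self.predict_log_proba(class_input)) line above
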
Example #6
    def hidden_state_to_visible_probabilities(rbm_w, hidden_state):
        """This takes in the (binary) states of the hidden units, and returns the activation probabilities
         of the visible units, conditional on those states.

        Args:
            rbm_w (numpy.array)         : a matrix of size <number of hidden units> by <number of visible units>
            hidden_state (numpy.array)  : is a binary matrix of size <number of hidden units> by <number of
                                          configurations that we're handling in parallel>.

        Returns:
            (numpy.array)   : Activation probabilities of visible units. size <number of visible units> by
                              <number of configurations that we're handling in parallel>.
        """
        return logistic(np.dot(rbm_w.T, hidden_state))
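hidden_state_to_visible_probabilities(..) is one half of an alternating Gibbs step in CD-1. The usage sketch below shows that step end to end; visible_state_to_hidden_probabilities(..), sample_bernoulli(..), and the toy shapes are assumptions made for illustration, not part of this example:

    import numpy as np

    def logistic(x):
        return 1.0 / (1.0 + np.exp(-x))

    def hidden_state_to_visible_probabilities(rbm_w, hidden_state):
        # same computation as the method above, as a standalone function for this demo
        return logistic(np.dot(rbm_w.T, hidden_state))

    def visible_state_to_hidden_probabilities(rbm_w, visible_state):
        # hypothetical mirror function: activation probabilities of the hidden units given visible states
        return logistic(np.dot(rbm_w, visible_state))

    def sample_bernoulli(probabilities, rng):
        # turn activation probabilities into binary states
        return (rng.random(probabilities.shape) < probabilities).astype(float)

    # one alternating Gibbs step, as in the reconstruction phase of CD-1
    rng = np.random.default_rng(0)
    rbm_w = 0.1 * rng.standard_normal((50, 256))                 # <hidden units> by <visible units>
    visible_data = (rng.random((256, 100)) < 0.5).astype(float)  # 100 configurations in parallel

    hidden_state = sample_bernoulli(visible_state_to_hidden_probabilities(rbm_w, visible_data), rng)
    visible_probs = hidden_state_to_visible_probabilities(rbm_w, hidden_state)
    print(visible_probs.shape)  # (256, 100): <visible units> by <configurations>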