Example #1
 def test_partition_function_factorize_v(self):
     sys.stdout.write(
         'RBM Estimator -> Performing partition_function_factorize_v test ...'
     )
     sys.stdout.flush()
     LogZ = Estimator.partition_function_factorize_v(
         self.bbrbm, beta=None, batchsize_exponent='AUTO', status=False)
     assert numx.all(numx.abs(LogZ - self.bbrbmTruelogZ) < self.epsilon)
     LogZ = Estimator.partition_function_factorize_v(self.bbrbm,
                                                     beta=None,
                                                     batchsize_exponent=0,
                                                     status=False)
     assert numx.all(numx.abs(LogZ - self.bbrbmTruelogZ) < self.epsilon)
     LogZ = Estimator.partition_function_factorize_v(self.bbrbm,
                                                     beta=None,
                                                     batchsize_exponent=3,
                                                     status=False)
     assert numx.all(numx.abs(LogZ - self.bbrbmTruelogZ) < self.epsilon)
     LogZ = Estimator.partition_function_factorize_v(self.bbrbm,
                                                     beta=None,
                                                     batchsize_exponent=555,
                                                     status=False)
     assert numx.all(numx.abs(LogZ - self.bbrbmTruelogZ) < self.epsilon)
     print(' successfully passed!')
     sys.stdout.flush()
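
For a very small RBM the factorized estimate can be cross-checked against a brute-force enumeration of all states. The sketch below is a hypothetical verification, not part of the library: it assumes the standard (uncentered) binary-binary energy and an RBM-like object that exposes the weight matrix w and the bias vectors bv and bh as numpy arrays.

import itertools

import numpy as numx


def brute_force_log_partition(w, bv, bh):
    """Return ln Z of a small binary-binary RBM by summing over every (v, h)
    state pair: ln Z = ln sum_{v,h} exp(v^T W h + b_v^T v + b_h^T h)."""
    num_v, num_h = w.shape
    log_terms = []
    for v in itertools.product([0, 1], repeat=num_v):
        v = numx.array(v, dtype=numx.float64)
        for h in itertools.product([0, 1], repeat=num_h):
            h = numx.array(h, dtype=numx.float64)
            log_terms.append(numx.dot(v, numx.dot(w, h))
                             + numx.dot(bv.ravel(), v)
                             + numx.dot(bh.ravel(), h))
    log_terms = numx.array(log_terms)
    maximum = numx.max(log_terms)  # log-sum-exp for numerical stability
    return maximum + numx.log(numx.sum(numx.exp(log_terms - maximum)))


# Hypothetical usage on a tiny model (small_rbm is an assumed name):
# log_z_factorized = Estimator.partition_function_factorize_v(small_rbm)
# log_z_exact = brute_force_log_partition(small_rbm.w, small_rbm.bv, small_rbm.bh)
# assert abs(log_z_factorized - log_z_exact) < 1e-8
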
    def _train(self,
               data,
               epsilon,
               k,
               momentum,
               reg_l1norm,
               reg_l2norm,
               reg_sparseness,
               desired_sparseness,
               update_visible_offsets,
               update_hidden_offsets,
               offset_typ,
               use_centered_gradient,
               restrict_gradient,
               restriction_norm,
               use_hidden_states):
        """ The training for one batch is performed using True Gradient (GD) for k Gibbs-sampling steps.

        :param data: The data used for training.
        :type data: numpy array [batch_size, input dimension]

        :param epsilon: The learning rate.
        :type epsilon: scalar or numpy array[num parameters] or numpy array[num parameters, parameter shape]

        :param k: Number of sampling steps.
        :type k: int

        :param momentum: The momentum term.
        :type momentum: scalar or numpy array[num parameters] or numpy array[num parameters, parameter shape]

        :param reg_l1norm: The parameter for the L1 regularization.
        :type reg_l1norm: float

        :param reg_l2norm: The parameter for the L2 regularization, also known as weight decay.
        :type reg_l2norm: float

        :param reg_sparseness: The parameter for the desired_sparseness regularization.
        :type reg_sparseness: None or float

        :param desired_sparseness: Desired average hidden activation or None for no regularization.
        :type desired_sparseness: None or float

        :param update_visible_offsets: The update step size for the model's visible offsets.
        :type update_visible_offsets: float

        :param update_hidden_offsets: The update step size for the model's hidden offsets.
        :type update_hidden_offsets: float

        :param offset_typ: | Different offsets can be used to center the gradient.
                           | Example: 'DM' uses the positive phase visible mean and the negative phase hidden mean.
                           | 'A0' uses the average of positive and negative phase mean for visible, zero for the
                           | hidden units. Possible values are out of {A,D,M,0}x{A,D,M,0}.
        :type offset_typ: string

        :param use_centered_gradient: Uses the centered gradient instead of centering the model.
        :type use_centered_gradient: bool

        :param restrict_gradient: If a scalar is given the norm of the weight gradient (along the input dim) is \
                                  restricted to stay below this value.
        :type restrict_gradient: None, float

        :param restriction_norm: Restricts the column norm, row norm or Matrix norm.
        :type restriction_norm: string, 'Cols','Rows', 'Mat'

        :param use_hidden_states: If True, the hidden states are used for the gradient calculations, the hidden \
                                     probabilities otherwise.
        :type use_hidden_states: bool
        """
        # Positive phase: compute the hidden probabilities given the data
        hid_probs_pos = self.model.probability_h_given_v(data)

        if update_visible_offsets != 0.0:
            xmean_pos = numx.mean(data, axis=0).reshape(1, self.model.input_dim)
        hmean_pos = 0.0
        if update_hidden_offsets != 0.0 or reg_sparseness != 0.0:
            if use_hidden_states:
                hid_states_pos = self.model.sample_h(hid_probs_pos)
                hmean_pos = numx.mean(hid_states_pos, axis=0).reshape(1, self.model.output_dim)
            else:
                hmean_pos = numx.mean(hid_probs_pos, axis=0).reshape(1, self.model.output_dim)

        # Calculate the partition function
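        # The branch below picks the cheaper direction: factorize_v when the
        # visible layer is smaller, factorize_h otherwise, so the exact sum
        # enumerates at most 2^min(input_dim, output_dim) states.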
        if self.model.input_dim < self.model.output_dim:
            batch_size = numx.min([self.model.input_dim, 12])
            ln_z = estimator.partition_function_factorize_v(self.model, beta=1.0, batchsize_exponent=batch_size,
                                                            status=False)
        else:
            batch_size = numx.min([self.model.output_dim, 12])
            ln_z = estimator.partition_function_factorize_h(self.model, beta=1.0, batchsize_exponent=batch_size,
                                                            status=False)

        # Empty accumulators for the negative phase gradient parts (w, bv, bh)
        neg_gradients = [numx.zeros(self.model.w.shape),
                         numx.zeros(self.model.bv.shape),
                         numx.zeros(self.model.bh.shape)]

        # Calculate gradient stepwise in batches
        bit_length = self.model.input_dim

        batchsize = numx.power(2, batch_size)
        num_combinations = numx.power(2, bit_length)
        num_batches = num_combinations // batchsize

        for batch in range(0, num_batches):
            # Generate current batch
            bit_combinations = numxext.generate_binary_code(bit_length, batch_size, batch)
            # P(x)
            prob_x = numx.exp(
                self.model.log_probability_v(ln_z, bit_combinations))
            # P(h|x)
            prob_h_x = self.model.probability_h_given_v(bit_combinations)
            # Calculate gradient
            neg_gradients[1] += numx.sum(numx.tile(prob_x, (1, self.model.input_dim)) * (bit_combinations -
                                                                                         self.model.ov), axis=0)
            prob_x = (numx.tile(prob_x, (1, self.model.output_dim)) * (prob_h_x - self.model.oh))
            neg_gradients[0] += numx.dot((bit_combinations - self.model.ov).T, prob_x)
            neg_gradients[2] += numx.sum(prob_x, axis=0)

        if update_visible_offsets != 0.0 and offset_typ[0] in ('A', 'M'):
            bit_combinations = numxext.generate_binary_code(self.model.input_dim, None, 0)
            prob_x = numx.exp(self.model.log_probability_v(ln_z, bit_combinations))
            xmean_neg = numx.sum(prob_x * bit_combinations, axis=0).reshape(1, self.model.input_dim)

        if update_hidden_offsets != 0.0 and offset_typ[1] in ('A', 'M'):
            bit_combinations = numxext.generate_binary_code(self.model.output_dim, None, 0)
            prob_h = numx.exp(self.model.log_probability_h(ln_z, bit_combinations))
            hmean_neg = numx.sum(prob_h * bit_combinations, axis=0).reshape(1, self.model.output_dim)

        new_visible_offsets = 0.0
        if update_visible_offsets != 0.0:
            if offset_typ[0] == 'A':
                new_visible_offsets = (xmean_pos + xmean_neg) * 0.5
            if offset_typ[0] == 'D':
                new_visible_offsets = xmean_pos
            if offset_typ[0] == 'M':
                new_visible_offsets = xmean_neg
            if offset_typ[0] == '0':
                new_visible_offsets = 0.0 * xmean_pos
        new_hidden_offsets = 0.0
        if update_hidden_offsets != 0.0:
            if offset_typ[1] == 'A':
                new_hidden_offsets = (hmean_pos + hmean_neg) * 0.5
            if offset_typ[1] == 'D':
                new_hidden_offsets = hmean_pos
            if offset_typ[1] == 'M':
                new_hidden_offsets = hmean_neg
            if offset_typ[1] == '0':
                new_hidden_offsets = 0.0 * hmean_pos

        if not use_centered_gradient:
            # update the centers
            self.model.update_offsets(new_visible_offsets, new_hidden_offsets, update_visible_offsets,
                                      update_hidden_offsets)
            self.visible_offsets = 0.0
            self.hidden_offsets = 0.0
        else:
            self.hidden_offsets = ((1.0 - update_hidden_offsets) * self.hidden_offsets + update_hidden_offsets
                                   * new_hidden_offsets)
            self.visible_offsets = ((1.0 - update_visible_offsets) * self.visible_offsets + update_visible_offsets
                                    * new_visible_offsets)

        # Calculate positive phase gradient using states or probabilities
        if use_hidden_states:
            pos_gradients = self.model.calculate_gradients(data, hid_states_pos)
        else:
            pos_gradients = self.model.calculate_gradients(data, hid_probs_pos)

        # Multiply by the batch size since _adapt_gradient divides by the batch size
        neg_gradients[0] *= data.shape[0]
        neg_gradients[1] *= data.shape[0]
        neg_gradients[2] *= data.shape[0]

        # Adapt the gradients by weight decay, momentum and learning rate
        self._adapt_gradient(pos_gradients=pos_gradients,
                             neg_gradients=neg_gradients,
                             batch_size=data.shape[0],
                             epsilon=epsilon,
                             momentum=momentum,
                             reg_l1norm=reg_l1norm,
                             reg_l2norm=reg_l2norm,
                             reg_sparseness=reg_sparseness,
                             desired_sparseness=desired_sparseness,
                             mean_hidden_activity=hmean_pos,
                             visible_offsets=self.visible_offsets,
                             hidden_offsets=self.hidden_offsets,
                             use_centered_gradient=use_centered_gradient,
                             restrict_gradient=restrict_gradient,
                             restriction_norm=restriction_norm)

        # update the parameters with the calculated gradient
        self.model.update_parameters(self.parameter_updates)
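
The negative phase in _train is the exact model expectation rather than a sampled estimate. As a minimal, hypothetical illustration in plain numpy (names are not taken from the library): given the probabilities P(v) of all visible states, the visible-bias part of the negative gradient is simply the expectation of (v - offset), which is what the accumulation loop above builds up batch by batch.

import numpy as numx


def exact_negative_bv_gradient(prob_v, visible_states, offset_v):
    """Exact negative-phase gradient for the visible bias: sum_v P(v) * (v - offset_v).

    prob_v:         numpy array [num_states, 1], probabilities P(v)
    visible_states: numpy array [num_states, input_dim], all binary visible states
    offset_v:       numpy array [1, input_dim], visible offsets (ov in the code above)
    """
    # Broadcasting replaces the explicit numx.tile used in _train.
    return numx.sum(prob_v * (visible_states - offset_v), axis=0)
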
# Measuring time
measurer = MEASURE.Stopwatch()

# Train model
print('Training')
print('Epoch\tRecon. Error\tLog likelihood\tExpected End-Time')
for epoch in range(1, epochs + 1):
    train_data = numx.random.permutation(train_data)
    for b in range(0, train_data.shape[0], batch_size):
        batch = train_data[b:b + batch_size, :]
        trainer.train(data=batch, epsilon=0.01, k=10, reg_l2norm=0.001)

    # Calculate reconstruction error, log-likelihood and expected end time every 1000 epochs
    if epoch % 1000 == 0:
        Z = ESTIMATOR.partition_function_factorize_v(rbm)
        LL = numx.mean(ESTIMATOR.log_likelihood_v(rbm, Z, train_data))
        RE = numx.mean(ESTIMATOR.reconstruction_error(rbm, train_data))
        print('%d\t\t%8.6f\t\t%8.4f\t\t' % (epoch, RE, LL), end='')
        print(measurer.get_expected_end_time(epoch, epochs))

measurer.end()

# Print end time
print()
print('End-time: \t', measurer.get_end_time())
print('Training time:\t', measurer.get_interval())

# Calculate and approximate partition function
Z = ESTIMATOR.partition_function_factorize_v(rbm, batchsize_exponent=h1, status=False)
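
# Hypothetical continuation: evaluate the trained model with the exact
# partition function computed above (uses only calls already shown in this script).
LL_train = numx.mean(ESTIMATOR.log_likelihood_v(rbm, Z, train_data))
print('Exact log-likelihood on the training data:\t', LL_train)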