Example #1
    def init_train_updates(self):
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = self.alpha
        beta = self.beta
        leak_size = self.leak_size

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = list(iter_parameters(self))
        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        leak_average_update = (
            (1 - leak_size) * leak_average + leak_size * full_gradient
        )
        new_step = step + alpha * step * (
            beta * leak_average_update.norm(L=2) - step
        )

        updates.extend([
            (leak_average, leak_average_update),
            (step, new_step),
        ])

        return updates
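
The update rule above (NeuPy's LeakStepAdaptation) keeps a leaky average of the concatenated gradient vector and grows or shrinks the step depending on the norm of that average. A minimal NumPy sketch of the same arithmetic outside Theano (the function name and toy inputs are illustrative, not part of NeuPy):

    import numpy as np

    def leak_step_update(step, leak_average, full_gradient,
                         alpha=0.001, beta=20.0, leak_size=0.01):
        # Leaky (exponential) average of the full gradient vector.
        leak_average = (1 - leak_size) * leak_average + leak_size * full_gradient
        # Increase the step while the scaled norm of the average exceeds it,
        # decrease it otherwise.
        step = step + alpha * step * (beta * np.linalg.norm(leak_average) - step)
        return step, leak_average

    step, avg = 0.1, np.zeros(3)
    step, avg = leak_step_update(step, avg, np.array([0.5, -0.2, 0.1]))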
Example #2
    def init_param_updates(self, layer, parameter):
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_weighted_inf_norm = parameter.prev_weighted_inf_norm

        step = self.variables.step
        beta1 = self.beta1
        beta2 = self.beta2

        n_parameters = count_parameters(self)
        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse')
        parameters = list(iter_parameters(self))
        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters
        )

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        first_moment = beta1 * prev_first_moment + (1 - beta1) * gradient
        weighted_inf_norm = T.maximum(beta2 * prev_weighted_inf_norm,
                                      T.abs_(gradient))

        parameter_delta = (
            (1 / (1 - beta1 ** epoch)) *
            (first_moment / (weighted_inf_norm + self.epsilon))
        )

        return [
            (prev_first_moment, first_moment),
            (prev_weighted_inf_norm, weighted_inf_norm),
            (parameter, parameter - step * parameter_delta),
            (self.variables.hessian, hessian_matrix),
        ]
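
The moment updates in this example follow the Adamax rule: Adam's second moment is replaced by an exponentially weighted infinity norm, and only the first moment gets a bias correction. A hedged NumPy sketch of one parameter update (function name and default constants are illustrative):

    import numpy as np

    def adamax_update(param, grad, first_moment, weighted_inf_norm, epoch,
                      step=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # Exponential average of the gradient.
        first_moment = beta1 * first_moment + (1 - beta1) * grad
        # Exponentially weighted infinity norm of the gradient.
        weighted_inf_norm = np.maximum(beta2 * weighted_inf_norm, np.abs(grad))
        # Bias-correct the first moment and scale by the infinity norm.
        delta = (first_moment / (1 - beta1 ** epoch)) / (weighted_inf_norm + epsilon)
        return param - step * delta, first_moment, weighted_inf_norm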
Example #3
    def init_train_updates(self):
        updates = super(LeakStepAdaptation, self).init_train_updates()

        alpha = self.alpha
        beta = self.beta
        leak_size = self.leak_size

        step = self.variables.step
        leak_average = self.variables.leak_average

        parameters = list(iter_parameters(self))
        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        leak_average_update = ((1 - leak_size) * leak_average +
                               leak_size * full_gradient)
        new_step = step + alpha * step * (
            beta * leak_average_update.norm(L=2) - step)

        updates.extend([
            (leak_average, leak_average_update),
            (step, new_step),
        ])

        return updates
Example #4
    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean(
            (network_output - prediction_func) ** 2,
            axis=1
        )

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)
        ).dot(J.T).dot(mse_for_each_sample)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
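
The parameter update in this example is the Levenberg-Marquardt step: the Gauss-Newton matrix J^T J is damped by mu before the system is solved against the per-sample errors. The same formula in NumPy, using a linear solve instead of an explicit inverse (names and inputs are illustrative):

    import numpy as np

    def levenberg_marquardt_step(param_vector, jacobian, errors, mu):
        n_params = jacobian.shape[1]
        # Damped Gauss-Newton system: (J^T J + mu * I) * delta = J^T * errors
        damped = jacobian.T.dot(jacobian) + mu * np.eye(n_params)
        delta = np.linalg.solve(damped, jacobian.T.dot(errors))
        return param_vector - delta

np.linalg.solve is mathematically equivalent to multiplying by the matrix inverse, as the Theano code does, but avoids forming the inverse explicitly.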
Example #5
    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean((network_output - prediction_func)**2,
                                     axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)).dot(
                J.T).dot(mse_for_each_sample)

        updates = [(mu, new_mu)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
Example #6
    def init_train_updates(self):
        step = self.variables.step
        min_eigval = self.min_eigval
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        second_derivatives = []
        for parameter, gradient in zip(parameters, gradients):
            second_derivative = T.grad(gradient.sum(), wrt=parameter)
            second_derivatives.append(second_derivative.flatten())

        hessian_diag = T.concatenate(second_derivatives)
        hessian_diag = T.switch(
            T.abs_(hessian_diag) < min_eigval,
            T.switch(
                hessian_diag < 0,
                -min_eigval,
                min_eigval,
            ), hessian_diag)

        # Dividing the gradient elementwise by the Hessian diagonal is the
        # same as inverting a diagonal Hessian (taking the reciprocal of each
        # diagonal element) and multiplying it by the gradient. This form is
        # less obvious, but it is faster to compute.
        updated_parameters = (param_vector -
                              step * full_gradient / hessian_diag)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
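
This example approximates Newton's method with only the Hessian diagonal; entries whose magnitude falls below min_eigval are clamped (keeping their sign) so the elementwise division stays stable. A NumPy sketch of the clamping and the update (names are illustrative):

    import numpy as np

    def hessian_diag_step(param_vector, full_gradient, hessian_diag,
                          step=0.1, min_eigval=1e-2):
        # Push near-zero diagonal entries away from zero, preserving sign.
        clamped = np.where(
            np.abs(hessian_diag) < min_eigval,
            np.where(hessian_diag < 0, -min_eigval, min_eigval),
            hessian_diag,
        )
        # Dividing by the diagonal equals multiplying by the inverse of a
        # diagonal Hessian approximation.
        return param_vector - step * full_gradient / clamped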
Example #7
    def init_train_updates(self):
        step = self.variables.step
        previous_delta = self.variables.prev_delta
        previous_gradient = self.variables.prev_gradient

        n_parameters = count_parameters(self)
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        beta = self.update_function(previous_gradient, full_gradient,
                                    previous_delta)
        parameter_delta = ifelse(
            T.eq(T.mod(self.variables.epoch, n_parameters), 1), -full_gradient,
            -full_gradient + beta * previous_delta)
        updated_parameters = param_vector + step * parameter_delta

        updates = [
            (previous_gradient, full_gradient),
            (previous_delta, parameter_delta),
        ]
        parameter_updates = setup_parameter_updates(parameters,
                                                    updated_parameters)
        updates.extend(parameter_updates)

        return updates
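
The conjugate-gradient direction above mixes the new negative gradient with the previous search direction through beta and restarts with plain steepest descent once every n_parameters epochs. A NumPy sketch using the Fletcher-Reeves formula as one possible update_function (helper names are illustrative):

    import numpy as np

    def fletcher_reeves(prev_gradient, gradient, prev_delta):
        # One common choice of beta for the conjugate gradient method.
        return gradient.dot(gradient) / prev_gradient.dot(prev_gradient)

    def conjugate_gradient_delta(epoch, n_parameters, gradient,
                                 prev_gradient, prev_delta):
        if epoch % n_parameters == 1:
            # Periodic restart: fall back to steepest descent.
            return -gradient
        beta = fletcher_reeves(prev_gradient, gradient, prev_delta)
        return -gradient + beta * prev_delta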
Example #8
    def init_param_updates(self, layer, parameter):
        epoch = self.variables.epoch
        prev_first_moment = parameter.prev_first_moment
        prev_second_moment = parameter.prev_second_moment
        # step = asfloat(self.variables.step)
        step = 0.001  # hardcoded step size instead of the shared step variable
        beta1 = asfloat(self.beta1)
        beta2 = asfloat(self.beta2)
        epsilon = asfloat(self.epsilon)

        gradient = T.grad(self.variables.error_func, wrt=parameter)

        n_parameters = count_parameters(self)
        self.variables.hessian = theano.shared(value=asfloat(
            np.zeros((n_parameters, n_parameters))),
                                               name='hessian_inverse')
        parameters = list(iter_parameters(self))
        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        first_moment = (beta1 * prev_first_moment +
                        asfloat(1. - beta1) * gradient)
        second_moment = (beta2 * prev_second_moment +
                         asfloat(1. - beta2) * gradient**2)

        first_moment_bias_corrected = first_moment / (1. - beta1**epoch)
        second_moment_bias_corrected = second_moment / (1. - beta2**epoch)

        parameter_delta = first_moment_bias_corrected / (
            T.sqrt(second_moment_bias_corrected) + epsilon)

        return [(prev_first_moment, first_moment),
                (prev_second_moment, second_moment),
                (parameter, parameter - step * parameter_delta),
                (self.variables.hessian, hessian_matrix)]
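
The moment updates in this example follow Adam: exponential averages of the gradient and its square, bias-corrected by the current epoch. A minimal NumPy sketch of one parameter update (function name and defaults are illustrative):

    import numpy as np

    def adam_update(param, grad, first_moment, second_moment, epoch,
                    step=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        first_moment = beta1 * first_moment + (1 - beta1) * grad
        second_moment = beta2 * second_moment + (1 - beta2) * grad ** 2
        # Bias correction compensates for the zero initialization of the moments.
        first_corrected = first_moment / (1 - beta1 ** epoch)
        second_corrected = second_moment / (1 - beta2 ** epoch)
        delta = first_corrected / (np.sqrt(second_corrected) + epsilon)
        return param - step * delta, first_moment, second_moment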
Example #9
    def init_train_updates(self):
        step = self.variables.step
        previous_delta = self.variables.prev_delta
        previous_gradient = self.variables.prev_gradient

        n_parameters = count_parameters(self)
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        beta = self.update_function(previous_gradient, full_gradient,
                                    previous_delta)
        parameter_delta = ifelse(
            T.eq(T.mod(self.variables.epoch, n_parameters), 1),
            -full_gradient,
            -full_gradient + beta * previous_delta
        )
        updated_parameters = param_vector + step * parameter_delta

        updates = [
            (previous_gradient, full_gradient),
            (previous_delta, parameter_delta),
        ]
        parameter_updates = setup_parameter_updates(parameters,
                                                    updated_parameters)
        updates.extend(parameter_updates)

        return updates
Example #10
    def init_train_updates(self):
        network_input = self.variables.network_input
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient)
        )
        param_delta = -new_inv_hessian.dot(full_gradient)

        def prediction(step):
            # TODO: I need to update this ugly solution later
            updated_params = param_vector + step * param_delta

            layer_input = network_input
            start_pos = 0
            for layer in self.layers:
                for param in layer.parameters:
                    end_pos = start_pos + param.size
                    parameter_name, parameter_id = param.name.split('_')
                    setattr(layer, parameter_name, T.reshape(
                        updated_params[start_pos:end_pos],
                        param.shape
                    ))
                    start_pos = end_pos
                layer_input = layer.output(layer_input)
            return layer_input

        def phi(step):
            return self.error(network_output, prediction(step))

        def derphi(step):
            error_func = self.error(network_output, prediction(step))
            return T.grad(error_func, wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
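
The quasi-Newton step in this example keeps a running estimate of the inverse Hessian, refreshes it from the parameter and gradient differences between epochs, and picks the step length with a line search along the resulting direction. A NumPy sketch of the BFGS inverse-Hessian update, one formula the update_function above could correspond to (helper name is illustrative):

    import numpy as np

    def bfgs_inverse_hessian_update(inv_hessian, param_delta, gradient_delta):
        # Standard BFGS update of the inverse-Hessian estimate, where
        # param_delta = x_new - x_old and gradient_delta = g_new - g_old.
        n = len(param_delta)
        rho = 1.0 / gradient_delta.dot(param_delta)
        identity = np.eye(n)
        left = identity - rho * np.outer(param_delta, gradient_delta)
        right = identity - rho * np.outer(gradient_delta, param_delta)
        return left.dot(inv_hessian).dot(right) + rho * np.outer(param_delta, param_delta)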
Example #11
    def init_train_updates(self):
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean((network_output - prediction_func)**2,
                                     axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)
        #######################################################################################
        n_parameters = count_parameters(self)
        self.variables.hessian = theano.shared(value=asfloat(
            np.zeros((n_parameters, n_parameters))),
                                               name='hessian_inverse')
        parameters = list(iter_parameters(self))
        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        #######################################################################################
        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)).dot(
                J.T).dot(mse_for_each_sample)

        updates = [(mu, new_mu), (self.variables.hessian, hessian_matrix)]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
Example #12
    def init_train_updates(self):
        n_parameters = count_parameters(self)
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)
        penalty_const = asfloat(self.penalty_const)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)
        hessian_inverse = T.nlinalg.matrix_inverse(hessian_matrix +
                                                   penalty_const *
                                                   T.eye(n_parameters))

        updated_parameters = param_vector - hessian_inverse.dot(full_gradient)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
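
This example is a full Newton step with a penalty term added to the Hessian diagonal before inversion, which keeps the matrix invertible when the raw Hessian is singular or badly conditioned. The same step in NumPy, using a linear solve instead of an explicit inverse (names are illustrative):

    import numpy as np

    def penalized_newton_step(param_vector, hessian, full_gradient,
                              penalty_const=0.1):
        n_parameters = len(param_vector)
        damped = hessian + penalty_const * np.eye(n_parameters)
        # Equivalent to inv(damped).dot(full_gradient), but more stable.
        return param_vector - np.linalg.solve(damped, full_gradient)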
Example #13
    def init_variables(self):
        super(QuasiNewton, self).init_variables()
        n_params = sum(p.get_value().size for p in iter_parameters(self))
        self.variables.update(
            inv_hessian=theano.shared(
                name='inv_hessian',
                value=asfloat(self.h0_scale * np.eye(int(n_params))),
            ),
            prev_params=theano.shared(
                name='prev_params',
                value=asfloat(np.zeros(n_params)),
            ),
            prev_full_gradient=theano.shared(
                name='prev_full_gradient',
                value=asfloat(np.zeros(n_params)),
            ),
        )
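
The shared variables above only hold the optimizer state between epochs: the inverse-Hessian estimate starts as a scaled identity matrix and the previous parameter and gradient vectors start at zero. The equivalent plain-NumPy state looks like this (the h0_scale and n_params values are illustrative):

    import numpy as np

    h0_scale, n_params = 1.0, 10
    inv_hessian = h0_scale * np.eye(n_params)   # initial inverse-Hessian estimate
    prev_params = np.zeros(n_params)            # parameters from the previous epoch
    prev_full_gradient = np.zeros(n_params)     # gradient from the previous epoch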
Example #14
    def init_train_updates(self):
        n_parameters = count_parameters(self)
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)
        penalty_const = asfloat(self.penalty_const)

        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters
        )
        hessian_inverse = T.nlinalg.matrix_inverse(
            hessian_matrix + penalty_const * T.eye(n_parameters)
        )

        updated_parameters = param_vector - hessian_inverse.dot(full_gradient)
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates
Example #15
    def init_param_updates(self, layer, parameter):

        n_parameters = count_parameters(self)
        self.variables.hessian = theano.shared(value=asfloat(
            np.zeros((n_parameters, n_parameters))),
                                               name='hessian_inverse')

        parameters = list(iter_parameters(self))
        hessian_matrix, full_gradient = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        prev_mean_squred_grad = parameter.prev_mean_squred_grad
        step = self.variables.step
        gradient = T.grad(self.variables.error_func, wrt=parameter)

        mean_squred_grad = (self.decay * prev_mean_squred_grad +
                            (1 - self.decay) * gradient**2)
        parameter_delta = gradient / T.sqrt(mean_squred_grad + self.epsilon)

        return [
            (prev_mean_squred_grad, mean_squred_grad),
            (parameter, parameter - step * parameter_delta),
        ]
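
Apart from the unrelated Hessian bookkeeping at the top, the per-parameter update in this example is RMSProp: an exponential average of squared gradients rescales the step for each parameter. A minimal NumPy sketch (name and defaults are illustrative):

    import numpy as np

    def rmsprop_update(param, grad, mean_squared_grad,
                       step=0.001, decay=0.95, epsilon=1e-7):
        # Exponential average of the squared gradient.
        mean_squared_grad = decay * mean_squared_grad + (1 - decay) * grad ** 2
        delta = grad / np.sqrt(mean_squared_grad + epsilon)
        return param - step * delta, mean_squared_grad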
Example #16
    def init_train_updates(self):
        step = self.variables.step
        min_eigval = self.min_eigval
        parameters = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=parameters)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        second_derivatives = []
        for parameter, gradient in zip(parameters, gradients):
            second_derivative = T.grad(gradient.sum(), wrt=parameter)
            second_derivatives.append(second_derivative.flatten())

        hessian_diag = T.concatenate(second_derivatives)
        hessian_diag = T.switch(
            T.abs_(hessian_diag) < min_eigval,
            T.switch(
                hessian_diag < 0,
                -min_eigval,
                min_eigval,
            ),
            hessian_diag
        )

        # Dividing the gradient elementwise by the Hessian diagonal is the
        # same as inverting a diagonal Hessian (taking the reciprocal of each
        # diagonal element) and multiplying it by the gradient. This form is
        # less obvious, but it is faster to compute.
        updated_parameters = (
            param_vector -
            step * full_gradient / hessian_diag
        )
        updates = setup_parameter_updates(parameters, updated_parameters)

        return updates