Example #1
0
    def init_train_updates(self):
        """Build the parameter updates for a diagonal second-order step.

        The gradient of ``self.variables.error_func`` is divided
        elementwise by a per-element curvature term (treated by this code
        as the Hessian diagonal) and scaled by ``self.variables.step``.
        Curvature entries whose magnitude is below ``self.min_eigval``
        are replaced: negative ones by ``-min_eigval``, all others by
        ``+min_eigval``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameter list
        and the flat vector of new values (presumably a list of update
        pairs; the helper is defined elsewhere).
        """
        learning_rate = self.variables.step
        eigval_floor = self.min_eigval
        weights = list(iter_parameters(self))
        flat_weights = parameters2vector(self)

        first_order = T.grad(self.variables.error_func, wrt=weights)
        flat_gradient = T.concatenate([g.flatten() for g in first_order])

        # For every parameter tensor differentiate the summed gradient
        # with respect to that same tensor and flatten the result.
        curvature = T.concatenate([
            T.grad(grad.sum(), wrt=weight).flatten()
            for weight, grad in zip(weights, first_order)
        ])

        # Keep the divisor away from zero while preserving its sign.
        is_too_small = T.abs_(curvature) < eigval_floor
        signed_floor = T.switch(curvature < 0, -eigval_floor, eigval_floor)
        curvature = T.switch(is_too_small, signed_floor, curvature)

        # Dividing the gradient elementwise by the diagonal is the same as
        # multiplying it by the inverse of a diagonal matrix (whose entries
        # are the reciprocals), but it is cheaper to compute.
        new_flat_weights = (
            flat_weights - learning_rate * flat_gradient / curvature
        )
        return setup_parameter_updates(weights, new_flat_weights)
Example #2
0
    def init_train_updates(self):
        """Build the updates for a conjugate-gradient style step.

        The search direction is the negative gradient plus
        ``beta * previous_delta``, where ``beta`` comes from
        ``self.update_function``. When ``epoch mod n_parameters`` equals 1
        the direction is reset to the plain negative gradient.

        Returns
        -------
        list
            Updates for the stored previous gradient and previous delta,
            followed by the parameter updates produced by
            ``setup_parameter_updates``.
        """
        learning_rate = self.variables.step
        last_delta = self.variables.prev_delta
        last_gradient = self.variables.prev_gradient

        restart_period = count_parameters(self)
        weights = list(iter_parameters(self))
        flat_weights = parameters2vector(self)

        flat_gradient = T.concatenate([
            grad.flatten()
            for grad in T.grad(self.variables.error_func, wrt=weights)
        ])

        beta = self.update_function(last_gradient, flat_gradient, last_delta)

        # Periodically drop the accumulated direction and restart from
        # the steepest-descent direction.
        is_restart_epoch = T.eq(
            T.mod(self.variables.epoch, restart_period), 1)
        direction = ifelse(
            is_restart_epoch,
            -flat_gradient,
            -flat_gradient + beta * last_delta
        )
        new_flat_weights = flat_weights + learning_rate * direction

        updates = [
            (last_gradient, flat_gradient),
            (last_delta, direction),
        ]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #3
0
    def init_train_updates(self):
        """Build the updates for a conjugate-gradient style step.

        Parameters are taken from ``self.connection``. The new direction
        is ``-gradient + beta * previous_delta`` with ``beta`` supplied by
        ``self.update_function``; whenever ``epoch mod n_parameters`` is 1
        only the negative gradient is used.

        Returns
        -------
        list
            Updates for the previous gradient and previous delta plus the
            parameter updates from ``setup_parameter_updates``.
        """
        learning_rate = self.variables.step
        old_delta = self.variables.prev_delta
        old_gradient = self.variables.prev_gradient

        weight_count = count_parameters(self.connection)
        weights = parameter_values(self.connection)
        flat_weights = T.concatenate([w.flatten() for w in weights])

        per_weight_grads = T.grad(self.variables.error_func, wrt=weights)
        flat_gradient = T.concatenate(
            [g.flatten() for g in per_weight_grads])

        beta = self.update_function(old_gradient, flat_gradient, old_delta)

        # Restart from the plain negative gradient once per cycle of
        # ``weight_count`` epochs.
        restart = T.eq(T.mod(self.variables.epoch, weight_count), 1)
        new_delta = ifelse(
            restart,
            -flat_gradient,
            -flat_gradient + beta * old_delta,
        )
        new_flat_weights = flat_weights + learning_rate * new_delta

        updates = [
            (old_gradient, flat_gradient),
            (old_delta, new_delta),
        ]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #4
0
    def init_train_updates(self):
        """Build the updates for a Levenberg-Marquardt style step.

        The damping factor ``mu`` is multiplied by
        ``self.mu_update_factor`` when the last error is lower than the
        current error expression and divided by it otherwise. The
        parameter correction solves
        ``(J^T J + mu * I) x = J^T e`` where ``e`` is the raveled squared
        difference between ``network_output`` and the training
        prediction, and ``J`` is the Jacobian returned by
        ``compute_jacobian``.

        Returns
        -------
        list
            The ``(mu, new_mu)`` update followed by the parameter updates
            from ``setup_parameter_updates``.
        """
        output_var = self.variables.network_output
        prediction = self.variables.train_prediction_func
        previous_error = self.variables.last_error
        current_error = self.variables.error_func
        mu = self.variables.mu

        error_grew = T.lt(previous_error, current_error)
        new_mu = ifelse(
            error_grew,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_errors = ((output_var - prediction) ** 2).ravel()

        weights = parameter_values(self.connection)
        flat_weights = parameters2vector(self)

        jacobian = compute_jacobian(squared_errors, weights)
        n_weights = jacobian.shape[1]

        # Solve the damped normal equations instead of inverting the matrix.
        damped_normal_matrix = (
            jacobian.T.dot(jacobian) + new_mu * T.eye(n_weights)
        )
        right_hand_side = jacobian.T.dot(squared_errors)
        new_flat_weights = flat_weights - slinalg.solve(
            damped_normal_matrix, right_hand_side)

        updates = [(mu, new_mu)]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #5
0
    def init_train_updates(self):
        """Build the updates for a Levenberg-Marquardt style step.

        ``mu`` grows by ``self.mu_update_factor`` when the last error is
        lower than the current error expression and shrinks by the same
        factor otherwise. The parameter correction is
        ``inverse(J^T J + mu * I) . J^T . e`` where ``e`` is the squared
        difference averaged over axis 1 and ``J`` comes from
        ``compute_jaccobian``.

        Returns
        -------
        list
            The ``(mu, new_mu)`` update followed by the parameter updates
            from ``setup_parameter_updates``.
        """
        output_var = self.variables.network_output
        prediction = self.variables.train_prediction_func
        previous_error = self.variables.last_error
        current_error = self.variables.error_func
        mu = self.variables.mu

        error_grew = T.lt(previous_error, current_error)
        new_mu = ifelse(
            error_grew,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_diff = (output_var - prediction) ** 2
        sample_errors = T.mean(squared_diff, axis=1)

        weights = list(iter_parameters(self))
        flat_weights = parameters2vector(self)

        jacobian = compute_jaccobian(sample_errors, weights)
        n_weights = jacobian.shape[1]

        damped_inverse = T.nlinalg.matrix_inverse(
            jacobian.T.dot(jacobian) + new_mu * T.eye(n_weights))
        correction = damped_inverse.dot(jacobian.T).dot(sample_errors)
        new_flat_weights = flat_weights - correction

        updates = [(mu, new_mu)]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #6
0
    def init_train_updates(self):
        """Build the updates for a Levenberg-Marquardt style step (TF).

        ``mu`` is multiplied by ``self.mu_update_factor`` when the last
        error is lower than the current error tensor and divided by it
        otherwise. The correction is the solution of
        ``(J^T J + mu * I) x = J^T e`` obtained with ``tf.matrix_solve``,
        where ``e`` is the flattened squared difference between
        ``network_output`` and the training prediction.

        Returns
        -------
        list
            The ``(mu, new_mu)`` update followed by the parameter updates
            from ``setup_parameter_updates``.
        """
        output_var = self.variables.network_output
        prediction = self.variables.train_prediction_func
        previous_error = self.variables.last_error
        current_error = self.variables.error_func
        mu = self.variables.mu

        error_grew = tf.less(previous_error, current_error)
        new_mu = tf.where(
            error_grew,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        sample_errors = flatten((output_var - prediction) ** 2)

        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        jacobian = compute_jacobian(sample_errors, weights)
        jacobian_t = tf.transpose(jacobian)
        n_weights = jacobian.shape[1]

        damped_normal_matrix = (
            tf.matmul(jacobian_t, jacobian) +
            new_mu * tf.eye(n_weights.value)
        )
        # ``tf.matrix_solve`` needs a 2-D right-hand side, hence the
        # extra trailing dimension.
        right_hand_side = tf.matmul(
            jacobian_t, tf.expand_dims(sample_errors, 1))
        correction = tf.matrix_solve(damped_normal_matrix, right_hand_side)
        new_flat_weights = flat_weights - flatten(correction)

        updates = [(mu, new_mu)]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #7
0
    def init_train_updates(self):
        """Build the parameter updates for a diagonal second-order step (TF).

        The gradient is multiplied elementwise by the reciprocal of a
        per-element curvature term (treated by this code as the Hessian
        diagonal). The reciprocal is clipped to
        ``[-1 / min_eigval, 1 / min_eigval]`` and the product is scaled
        by ``self.variables.step``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        learning_rate = self.variables.step
        inverse_bound = 1 / self.min_eigval
        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        first_order = tf.gradients(self.variables.error_func, weights)
        flat_gradient = make_single_vector(first_order)

        curvature_parts = []
        for weight, grad in zip(weights, first_order):
            # ``tf.gradients`` returns a list; exactly one entry is
            # expected for a single target.
            (curvature,) = tf.gradients(grad, weight)
            curvature_parts.append(flatten(curvature))

        hessian_diag = tf.concat(curvature_parts, axis=0)

        # Clipping the inverse is simpler than clipping the diagonal
        # itself; a diagonal matrix is inverted by taking elementwise
        # reciprocals.
        inverse_diag = tf.clip_by_value(
            1 / hessian_diag, -inverse_bound, inverse_bound)

        new_flat_weights = (
            flat_weights - learning_rate * flat_gradient * inverse_diag
        )
        return setup_parameter_updates(weights, new_flat_weights)
    def init_train_updates(self):
        """Build the updates for a Levenberg-Marquardt style step.

        ``mu`` is multiplied by ``self.mu_update_factor`` when the last
        error is lower than the current error expression, otherwise it is
        divided by that factor. The parameter correction solves
        ``(J^T J + mu * I) x = J^T e`` with ``slinalg.solve``, where ``e``
        is the raveled squared difference between ``network_output`` and
        the training prediction.

        Returns
        -------
        list
            The ``(mu, new_mu)`` update followed by the parameter updates
            from ``setup_parameter_updates``.
        """
        output_var = self.variables.network_output
        prediction = self.variables.train_prediction_func
        previous_error = self.variables.last_error
        current_error = self.variables.error_func
        mu = self.variables.mu

        error_grew = T.lt(previous_error, current_error)
        new_mu = ifelse(
            error_grew,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        squared_errors = ((output_var - prediction) ** 2).ravel()

        weights = parameter_values(self.connection)
        flat_weights = T.concatenate([w.flatten() for w in weights])

        jacobian = compute_jacobian(squared_errors, weights)
        n_weights = jacobian.shape[1]

        damped_normal_matrix = (
            jacobian.T.dot(jacobian) + new_mu * T.eye(n_weights)
        )
        right_hand_side = jacobian.T.dot(squared_errors)
        new_flat_weights = flat_weights - slinalg.solve(
            damped_normal_matrix, right_hand_side)

        updates = [(mu, new_mu)]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #9
0
    def init_train_updates(self):
        """Build the updates for a Levenberg-Marquardt style step.

        The damping factor ``mu`` is scaled up by
        ``self.mu_update_factor`` when the last error is lower than the
        current error expression and scaled down otherwise. The
        correction equals ``inverse(J^T J + mu * I) . J^T . e`` where
        ``e`` is the squared difference averaged over axis 1.

        Returns
        -------
        list
            The ``(mu, new_mu)`` update followed by the parameter updates
            from ``setup_parameter_updates``.
        """
        output_var = self.variables.network_output
        prediction = self.variables.train_prediction_func
        previous_error = self.variables.last_error
        current_error = self.variables.error_func
        mu = self.variables.mu

        error_grew = T.lt(previous_error, current_error)
        new_mu = ifelse(
            error_grew,
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        sample_errors = T.mean((output_var - prediction) ** 2, axis=1)

        weights = list(iter_parameters(self))
        flat_weights = parameters2vector(self)

        jacobian = compute_jaccobian(sample_errors, weights)
        n_weights = jacobian.shape[1]

        damped_inverse = T.nlinalg.matrix_inverse(
            jacobian.T.dot(jacobian) + new_mu * T.eye(n_weights)
        )
        correction = damped_inverse.dot(jacobian.T).dot(sample_errors)
        new_flat_weights = flat_weights - correction

        updates = [(mu, new_mu)]
        updates.extend(setup_parameter_updates(weights, new_flat_weights))
        return updates
Example #10
0
    def init_train_updates(self):
        """Build the updates for a quasi-Newton style step with line search.

        The stored inverse Hessian approximation is kept as is on epoch 1
        and otherwise refreshed by ``self.update_function`` from the
        change in parameters and the change in gradient. The search
        direction is ``-new_inv_hessian . gradient`` and the step length
        is chosen by ``line_search`` using the error along that
        direction and its derivative with respect to the step.

        Returns
        -------
        list
            Parameter updates from ``setup_parameter_updates`` followed by
            updates for the inverse Hessian, the previous parameters and
            the previous gradient.
        """
        network_inputs = self.variables.network_inputs
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = T.concatenate([param.flatten() for param in params])

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1), inv_hessian,
            self.update_function(inv_hessian, param_vector - prev_params,
                                 full_gradient - prev_full_gradient))
        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            """Network output with parameters moved ``step`` along the
            search direction."""
            updated_params = param_vector + step * param_delta

            # Temporarily replace the shared variables on the layers with
            # symbolic expressions so the network output is expressed in
            # terms of ``step``.
            start_pos = 0
            try:
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + param.size
                    updated_param_value = T.reshape(
                        updated_params[start_pos:end_pos], param.shape)
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                return self.connection.output(*network_inputs)
            finally:
                # Always put the shared variables back, even when building
                # the output fails, so the layers are never left holding
                # the temporary expressions.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

        def phi(step):
            """Error as a function of the step length."""
            return self.error(network_output, prediction(step))

        def derphi(step):
            """Derivative of the error with respect to the step length."""
            return T.grad(phi(step), wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
Example #11
0
    def init_train_updates(self):
        """Build the updates for a quasi-Newton style step with line search.

        The stored inverse Hessian approximation is kept as is on epoch 1
        and otherwise refreshed by ``self.update_function`` from the
        change in parameters and the change in gradient. The search
        direction is ``-new_inv_hessian . gradient`` and the step length
        is chosen by ``line_search``.

        Returns
        -------
        list
            Parameter updates from ``setup_parameter_updates`` followed by
            updates for the inverse Hessian, the previous parameters and
            the previous gradient.
        """
        network_input = self.variables.network_input
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient)
        )
        param_delta = -new_inv_hessian.dot(full_gradient)

        def prediction(step):
            """Network output with parameters moved ``step`` along the
            search direction."""
            # TODO: I need to update this ugly solution later
            updated_params = param_vector + step * param_delta

            # Remember every attribute that gets replaced so that it can
            # be restored once the output expression has been built.
            replaced = []
            layer_input = network_input
            start_pos = 0
            try:
                for layer in self.layers:
                    for param in layer.parameters:
                        end_pos = start_pos + param.size
                        # Parameter names are expected to look like
                        # ``<attribute>_<id>``.
                        parameter_name, _ = param.name.split('_')
                        # Fall back to the parameter itself when the
                        # layer has no such attribute yet.
                        replaced.append((
                            layer,
                            parameter_name,
                            getattr(layer, parameter_name, param),
                        ))
                        setattr(layer, parameter_name, T.reshape(
                            updated_params[start_pos:end_pos],
                            param.shape
                        ))
                        start_pos = end_pos
                    layer_input = layer.output(layer_input)
            finally:
                # Without this the layers would keep expressions that
                # depend on ``step`` after the line search is built.
                for layer, parameter_name, original in reversed(replaced):
                    setattr(layer, parameter_name, original)
            return layer_input

        def phi(step):
            """Error as a function of the step length."""
            return self.error(network_output, prediction(step))

        def derphi(step):
            """Derivative of the error with respect to the step length."""
            return T.grad(phi(step), wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates
Example #12
0
    def init_train_updates(self):
        """Build the parameter updates for a damped Newton-style step.

        The correction is the solution of
        ``(H + penalty_const * I) x = gradient`` computed with
        ``slinalg.solve``, where ``H`` and the gradient come from
        ``find_hessian_and_gradient``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        weight_count = count_parameters(self.connection)
        weights = parameter_values(self.connection)
        flat_weights = T.concatenate([w.flatten() for w in weights])
        damping = asfloat(self.penalty_const)

        hessian, flat_gradient = find_hessian_and_gradient(
            self.variables.error_func, weights)

        # The penalty on the diagonal regularises the Hessian before the
        # linear system is solved.
        damped_hessian = hessian + damping * T.eye(weight_count)
        correction = slinalg.solve(damped_hessian, flat_gradient)

        return setup_parameter_updates(weights, flat_weights - correction)
Example #13
0
    def init_train_updates(self):
        """Build the parameter updates for a damped Newton-style step (TF).

        The correction is the solution of
        ``(H + penalty_const * I) x = gradient`` computed with
        ``tf.matrix_solve``; the gradient is reshaped into a column
        because that function expects a 2-D right-hand side.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        damping = asfloat(self.penalty_const)

        weight_count = count_parameters(self.connection)
        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        hessian, flat_gradient = find_hessian_and_gradient(
            self.variables.error_func, weights)

        damped_hessian = hessian + damping * tf.eye(weight_count)
        gradient_column = tf.reshape(flat_gradient, [-1, 1])
        correction = tf.matrix_solve(damped_hessian, gradient_column)

        new_flat_weights = flat_weights - flatten(correction)
        return setup_parameter_updates(weights, new_flat_weights)
Example #14
0
    def init_train_updates(self):
        """Build an update that stores the Hessian in a shared variable.

        A zero-initialised ``(n_parameters, n_parameters)`` shared
        variable is created as ``self.variables.hessian`` and the update
        assigns it the Hessian returned by ``find_hessian_and_gradient``.
        The network parameters themselves are not updated here.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the single shared
        Hessian variable.
        """
        n_parameters = count_parameters(self.connection)
        parameters = parameter_values(self.connection)

        # NOTE: the variable holds the Hessian itself although its Theano
        # name says 'hessian_inverse'; the name is left untouched in case
        # other code refers to it.
        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse',
        )

        # Only the Hessian is needed; the gradient is discarded.
        hessian_matrix, _ = find_hessian_and_gradient(
            self.variables.error_func, parameters)

        # NOTE(review): a matrix is passed where a flat vector is usually
        # given -- confirm ``setup_parameter_updates`` handles this.
        return setup_parameter_updates(
            [self.variables.hessian], hessian_matrix)
Example #15
0
    def init_train_updates(self):
        """Build the parameter updates for a damped Newton-style step.

        The correction is ``inverse(H + penalty_const * I) . gradient``
        where ``H`` and the gradient come from
        ``find_hessian_and_gradient``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        weight_count = count_parameters(self)
        weights = list(iter_parameters(self))
        flat_weights = parameters2vector(self)
        damping = asfloat(self.penalty_const)

        hessian, flat_gradient = find_hessian_and_gradient(
            self.variables.error_func, weights)

        # The penalty is added to the diagonal before inverting.
        damped_hessian = hessian + damping * T.eye(weight_count)
        inverse = T.nlinalg.matrix_inverse(damped_hessian)

        new_flat_weights = flat_weights - inverse.dot(flat_gradient)
        return setup_parameter_updates(weights, new_flat_weights)
Example #16
0
    def init_train_updates(self):
        """Build the parameter updates for a damped Newton-style step.

        New parameters are the current ones minus
        ``inverse(H + penalty_const * I) . gradient``; ``H`` and the
        gradient are provided by ``find_hessian_and_gradient``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        total_weights = count_parameters(self)
        weight_list = list(iter_parameters(self))
        weight_vector = parameters2vector(self)
        penalty = asfloat(self.penalty_const)

        hessian, gradient_vector = find_hessian_and_gradient(
            self.variables.error_func, weight_list)

        regularised = hessian + penalty * T.eye(total_weights)
        inverse = T.nlinalg.matrix_inverse(regularised)
        newton_step = inverse.dot(gradient_vector)

        return setup_parameter_updates(
            weight_list, weight_vector - newton_step)
Example #17
0
    def init_train_updates(self):
        """Build the parameter updates for a damped Newton-style step.

        The correction solves ``(H + penalty_const * I) x = gradient``
        with ``slinalg.solve``. Parameters are read from
        ``self.connection`` while the flat parameter vector comes from
        ``parameters2vector(self)``.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameters and
        the flat vector of new values.
        """
        weight_count = count_parameters(self.connection)
        weights = parameter_values(self.connection)
        flat_weights = parameters2vector(self)
        damping = asfloat(self.penalty_const)

        hessian, flat_gradient = find_hessian_and_gradient(
            self.variables.error_func, weights)

        damped_hessian = hessian + damping * T.eye(weight_count)
        correction = slinalg.solve(damped_hessian, flat_gradient)

        return setup_parameter_updates(weights, flat_weights - correction)
Example #18
0
    def init_train_updates(self):
        """Build Levenberg-Marquardt style updates and record the Hessian.

        ``mu`` is multiplied by ``self.mu_update_factor`` when the last
        error is lower than the current error expression and divided by
        it otherwise. The parameter correction is
        ``inverse(J^T J + mu * I) . J^T . e`` where ``e`` is the squared
        difference averaged over axis 1. In addition, the Hessian from
        ``find_hessian_and_gradient`` is written into a newly created
        shared variable ``self.variables.hessian`` on every update.

        Returns
        -------
        list
            ``(variable, new_value)`` pairs for ``mu`` and the stored
            Hessian, followed by the parameter updates from
            ``setup_parameter_updates``.
        """
        network_output = self.variables.network_output
        prediction_func = self.variables.train_prediction_func
        last_error = self.variables.last_error
        error_func = self.variables.error_func
        mu = self.variables.mu

        new_mu = ifelse(
            T.lt(last_error, error_func),
            mu * self.mu_update_factor,
            mu / self.mu_update_factor,
        )

        mse_for_each_sample = T.mean((network_output - prediction_func)**2,
                                     axis=1)

        params = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        # Extra bookkeeping: keep the full Hessian in a shared variable.
        # NOTE: the Theano name says 'hessian_inverse' although the
        # Hessian itself is stored; the name is left untouched in case
        # other code refers to it.
        n_parameters = count_parameters(self)
        self.variables.hessian = theano.shared(
            value=asfloat(np.zeros((n_parameters, n_parameters))),
            name='hessian_inverse',
        )
        # Only the Hessian is needed here; the gradient is discarded.
        hessian_matrix, _ = find_hessian_and_gradient(
            self.variables.error_func, params)

        J = compute_jaccobian(mse_for_each_sample, params)
        n_params = J.shape[1]

        updated_params = param_vector - T.nlinalg.matrix_inverse(
            J.T.dot(J) + new_mu * T.eye(n_params)).dot(
                J.T).dot(mse_for_each_sample)

        # Every entry must be a (shared variable, expression) pair; the
        # Hessian entry used to be wrapped in an extra list.
        updates = [
            (mu, new_mu),
            (self.variables.hessian, hessian_matrix),
        ]
        parameter_updates = setup_parameter_updates(params, updated_params)
        updates.extend(parameter_updates)

        return updates
Example #19
0
    def init_train_updates(self):
        """Build the updates for a quasi-Newton style step (TF).

        The stored inverse Hessian approximation is kept on epoch 1 and
        otherwise replaced by the result of ``self.update_function``.
        The search direction is ``-dot(new_inv_hessian, gradient)`` and
        the step length comes from ``self.find_optimal_step``.

        Returns
        -------
        list
            Parameter updates from ``setup_parameter_updates`` followed by
            assign operations for the inverse Hessian, the previous
            parameters and the previous gradient.
        """
        stored_inv_hessian = self.variables.inv_hessian
        stored_params = self.variables.prev_params
        stored_gradient = self.variables.prev_full_gradient

        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        flat_gradient = make_single_vector(
            tf.gradients(self.variables.error_func, weights))

        is_first_epoch = tf.equal(self.variables.epoch, 1)
        refreshed_inv_hessian = tf.where(
            is_first_epoch,
            stored_inv_hessian,
            self.update_function(
                inv_H=stored_inv_hessian,
                delta_w=flat_weights - stored_params,
                delta_grad=flat_gradient - stored_gradient,
                epsilon=self.epsilon,
            ),
        )
        direction = -dot(refreshed_inv_hessian, flat_gradient)
        step = self.find_optimal_step(flat_weights, direction)
        updates = setup_parameter_updates(
            weights, flat_weights + step * direction)

        # These tensors have to be evaluated before the assignments run.
        # TensorFlow may execute operations in parallel, so without the
        # dependency the stored "previous" values could, for example,
        # already equal the current gradient when they are read.
        must_run_first = [refreshed_inv_hessian, flat_weights, flat_gradient]
        with tf.control_dependencies(must_run_first):
            updates.extend([
                stored_inv_hessian.assign(refreshed_inv_hessian),
                stored_params.assign(flat_weights),
                stored_gradient.assign(flat_gradient),
            ])

        return updates
Example #20
0
    def init_train_updates(self):
        """Build the parameter updates for a diagonal second-order step.

        Each element of the gradient is divided by a matching curvature
        term (treated by this code as the Hessian diagonal) and scaled by
        ``self.variables.step``. Curvature values with magnitude below
        ``self.min_eigval`` are replaced by ``-min_eigval`` when negative
        and by ``+min_eigval`` otherwise.

        Returns
        -------
        The result of ``setup_parameter_updates`` for the parameter list
        and the flat vector of new values.
        """
        step_size = self.variables.step
        smallest_allowed = self.min_eigval
        param_list = list(iter_parameters(self))
        param_vector = parameters2vector(self)

        grads = T.grad(self.variables.error_func, wrt=param_list)
        grad_vector = T.concatenate([grad.flatten() for grad in grads])

        # One flattened second-derivative piece per parameter tensor:
        # the summed gradient differentiated by the same tensor.
        pieces = [
            T.grad(grad.sum(), wrt=param).flatten()
            for param, grad in zip(param_list, grads)
        ]
        diag = T.concatenate(pieces)

        # Push near-zero entries out to the allowed minimum magnitude,
        # keeping the sign of negative entries.
        near_zero = T.abs_(diag) < smallest_allowed
        replacement = T.switch(diag < 0, -smallest_allowed, smallest_allowed)
        diag = T.switch(near_zero, replacement, diag)

        # Elementwise division by the diagonal equals multiplication by
        # the inverse of the diagonal matrix (reciprocal of each entry)
        # and is cheaper to evaluate.
        scaled_gradient = step_size * grad_vector / diag
        return setup_parameter_updates(
            param_list, param_vector - scaled_gradient)
Example #21
0
    def init_train_updates(self):
        """Build the updates for a conjugate-gradient style step (TF).

        The direction is ``-gradient + beta * previous_delta`` with
        ``beta`` from ``self.update_function``; when
        ``epoch mod n_parameters`` equals 1 only the negative gradient is
        used. The step length is chosen by ``self.find_optimal_step``.

        Returns
        -------
        list
            Parameter updates from ``setup_parameter_updates`` followed by
            assign operations for the previous gradient and delta.
        """
        # Initial read of the step variable; it is replaced further down
        # by the value chosen by ``find_optimal_step``.
        step = self.variables.step
        current_epoch = self.variables.epoch
        stored_delta = self.variables.prev_delta
        stored_gradient = self.variables.prev_gradient

        weight_count = count_parameters(self.connection)
        weights = parameter_values(self.connection)
        flat_weights = make_single_vector(weights)

        flat_gradient = make_single_vector(
            tf.gradients(self.variables.error_func, weights))

        beta = self.update_function(
            stored_gradient, flat_gradient, stored_delta, self.epsilon)

        # Restart from the plain negative gradient once per cycle.
        is_restart_epoch = tf.equal(tf.mod(current_epoch, weight_count), 1)
        direction = tf.where(
            is_restart_epoch,
            -flat_gradient,
            -flat_gradient + beta * stored_delta,
        )

        step = self.find_optimal_step(flat_weights, direction)
        updates = setup_parameter_updates(
            weights, flat_weights + step * direction)

        # The gradient and direction must be evaluated before they are
        # stored. TensorFlow may run operations in parallel, so without
        # the dependency the stored "previous" values could already
        # match the current ones when they are read.
        with tf.control_dependencies([flat_gradient, direction]):
            updates.extend([
                stored_gradient.assign(flat_gradient),
                stored_delta.assign(direction),
            ])

        return updates
Example #22
0
    def init_train_updates(self):
        """Build the updates for a quasi-Newton style step with line search.

        The stored inverse Hessian approximation is kept as is on epoch 1
        and otherwise refreshed by ``self.update_function`` from the
        change in parameters and the change in gradient. The search
        direction is ``-new_inv_hessian . gradient`` and the step length
        is chosen by ``line_search`` using the error along that
        direction and its derivative with respect to the step.

        Returns
        -------
        list
            Parameter updates from ``setup_parameter_updates`` followed by
            updates for the inverse Hessian, the previous parameters and
            the previous gradient.
        """
        network_input = self.variables.network_input
        network_output = self.variables.network_output
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        params = parameter_values(self.connection)
        param_vector = parameters2vector(self)

        gradients = T.grad(self.variables.error_func, wrt=params)
        full_gradient = T.concatenate([grad.flatten() for grad in gradients])

        new_inv_hessian = ifelse(
            T.eq(self.variables.epoch, 1),
            inv_hessian,
            self.update_function(inv_hessian,
                                 param_vector - prev_params,
                                 full_gradient - prev_full_gradient)
        )
        param_delta = -new_inv_hessian.dot(full_gradient)
        layers_and_parameters = list(iter_parameters(self.layers))

        def prediction(step):
            """Network output with parameters moved ``step`` along the
            search direction."""
            updated_params = param_vector + step * param_delta

            # Temporarily replace the shared variables on the layers with
            # symbolic expressions so the network output is expressed in
            # terms of ``step``.
            start_pos = 0
            try:
                for layer, attrname, param in layers_and_parameters:
                    end_pos = start_pos + param.size
                    updated_param_value = T.reshape(
                        updated_params[start_pos:end_pos],
                        param.shape
                    )
                    setattr(layer, attrname, updated_param_value)
                    start_pos = end_pos

                return self.connection.output(network_input)
            finally:
                # Always put the shared variables back, even when building
                # the output fails, so the layers are never left holding
                # the temporary expressions.
                for layer, attrname, param in layers_and_parameters:
                    setattr(layer, attrname, param)

        def phi(step):
            """Error as a function of the step length."""
            return self.error(network_output, prediction(step))

        def derphi(step):
            """Derivative of the error with respect to the step length."""
            return T.grad(phi(step), wrt=step)

        step = asfloat(line_search(phi, derphi))
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        updates.extend([
            (inv_hessian, new_inv_hessian),
            (prev_params, param_vector),
            (prev_full_gradient, full_gradient),
        ])

        return updates