Example 1
def hentenes_stiefel(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Hestenes-Stiefel formula for the conjugate gradient update coefficient.
    """
    gradient_delta = new_g - old_g
    return safe_division(
        dot(gradient_delta, new_g),
        dot(delta_w, gradient_delta),
        epsilon,
    )
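These examples call a few small helpers (dot, outer, safe_division, safe_reciprocal) that are defined elsewhere in the library. The sketch below is only a plausible, minimal reading of what they do, so the snippets can be followed on their own; it is an assumption, not the library's actual implementation.

import tensorflow as tf


def dot(a, b):
    # Inner product of two vectors, or matrix-vector product
    # when the first argument is a matrix. Assumed behaviour.
    return tf.tensordot(a, b, axes=1)


def outer(a, b):
    # Outer product of two vectors.
    return tf.einsum('i,j->ij', a, b)


def safe_division(numerator, denominator, epsilon):
    # Replace a near-zero denominator with epsilon before dividing,
    # so the ratio never blows up.
    safe_denominator = tf.where(
        tf.abs(denominator) < epsilon,
        epsilon * tf.ones_like(denominator),
        denominator,
    )
    return numerator / safe_denominator


def safe_reciprocal(value, epsilon):
    # Reciprocal with the same protection against division by zero.
    return safe_division(tf.ones_like(value), value, epsilon)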
Example 2
def dfp(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    DFP is closely related to BFGS; it is a rank-2 update formula for the
    inverse Hessian. It can suffer from round-off error and inaccurate
    line searches.
    """
    inv_H_dot_grad = dot(inv_H, delta_grad)

    x = safe_division(outer(delta_w, delta_w), dot(delta_grad, delta_w),
                      epsilon)
    y = safe_division(tf.matmul(outer(inv_H_dot_grad, delta_grad), inv_H),
                      dot(delta_grad, inv_H_dot_grad), epsilon)

    return inv_H - y + x
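As a quick sanity check on the rank-2 formula above, the plain-NumPy snippet below (with invented values) mirrors the x/y structure of dfp() and verifies the secant condition: the updated inverse Hessian maps delta_grad back onto delta_w.

import numpy as np

inv_H = np.diag([1.0, 2.0, 3.0, 4.0])       # symmetric positive-definite start
delta_w = np.array([1.0, 2.0, 3.0, 4.0])
delta_grad = np.array([0.5, -1.0, 2.0, 1.5])

inv_H_dot_grad = inv_H @ delta_grad
x = np.outer(delta_w, delta_w) / (delta_grad @ delta_w)
y = np.outer(inv_H_dot_grad, delta_grad) @ inv_H / (delta_grad @ inv_H_dot_grad)
new_inv_H = inv_H - y + x

# Secant condition: the updated inverse Hessian maps delta_grad to delta_w.
assert np.allclose(new_inv_H @ delta_grad, delta_w)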
Example 3
def sr1(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    Symmetric rank-1 (SR1). Generates an update for the inverse Hessian
    matrix by adding a symmetric rank-1 matrix. When no rank-1 update
    exists for the current iteration, the update is skipped and the
    original inverse Hessian is returned.
    """
    param = delta_w - dot(inv_H, delta_grad)
    denominator = dot(param, delta_grad)

    return tf.where(
        # This check protects against the cases where the update
        # doesn't exist: on some iterations there is simply no
        # rank-1 update for the matrix.
        tf.less(tf.abs(denominator),
                epsilon * tf.norm(param) * tf.norm(delta_grad)),
        inv_H,
        inv_H + outer(param, param) / denominator)
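A tiny NumPy illustration (invented values) of that guarded case: when delta_w already equals inv_H @ delta_grad, param is the zero vector, the denominator vanishes, and tf.where keeps the original inverse Hessian.

import numpy as np

inv_H = np.eye(3)
delta_grad = np.array([1.0, 2.0, 3.0])
delta_w = inv_H @ delta_grad            # secant condition already holds

param = delta_w - inv_H @ delta_grad    # zero vector
denominator = param @ delta_grad        # 0.0, so the rank-1 update is skipped
print(param, denominator)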
Example 4
def bfgs(inv_H, delta_w, delta_grad, epsilon=1e-7):
    """
    Broyden-Fletcher-Goldfarb-Shanno (BFGS) rank-2 update for the inverse
    Hessian. It can suffer from round-off error and inaccurate line searches.
    """
    n_parameters = int(inv_H.shape[0])

    I = tf.eye(n_parameters)
    rho = safe_reciprocal(dot(delta_grad, delta_w), epsilon)

    X = I - outer(delta_w, delta_grad) * rho
    X_T = tf.transpose(X)
    Z = rho * outer(delta_w, delta_w)

    return tf.matmul(X, tf.matmul(inv_H, X_T)) + Z
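To show how this recursion drives a full quasi-Newton iteration, here is a compact NumPy sketch that minimizes a small quadratic f(w) = 0.5 * w @ A @ w - b @ w. The matrix A, the vector b, and the exact line search (only valid for quadratics) are all assumptions made to keep the example short.

import numpy as np

A = np.array([[3.0, 0.5],
              [0.5, 2.0]])
b = np.array([1.0, -1.0])


def gradient(w):
    return A @ w - b


w = np.zeros(2)
inv_H = np.eye(2)
I = np.eye(2)
g = gradient(w)

for _ in range(10):
    if np.linalg.norm(g) < 1e-10:
        break

    direction = -inv_H @ g
    # Exact minimizer of the quadratic along `direction`.
    step = -(g @ direction) / (direction @ A @ direction)
    delta_w = step * direction
    new_g = gradient(w + delta_w)
    delta_grad = new_g - g

    # Same BFGS recursion as above, written with NumPy.
    rho = 1.0 / (delta_grad @ delta_w)
    X = I - rho * np.outer(delta_w, delta_grad)
    inv_H = X @ inv_H @ X.T + rho * np.outer(delta_w, delta_w)

    w, g = w + delta_w, new_g

assert np.allclose(A @ w, b)   # the quadratic's minimum satisfies A @ w == b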
Example 5
    def init_train_updates(self):
        self.init_variables()

        iteration = self.variables.iteration
        inv_hessian = self.variables.inv_hessian
        prev_params = self.variables.prev_params
        prev_full_gradient = self.variables.prev_full_gradient

        variables = self.network.variables
        params = [var for var in variables.values() if var.trainable]
        param_vector = make_single_vector(params)

        gradients = tf.gradients(self.variables.loss, params)
        full_gradient = make_single_vector(gradients)

        new_inv_hessian = tf.where(
            tf.equal(iteration, 0), inv_hessian,
            self.update_function(inv_H=inv_hessian,
                                 delta_w=param_vector - prev_params,
                                 delta_grad=full_gradient - prev_full_gradient,
                                 epsilon=self.epsilon))
        param_delta = -dot(new_inv_hessian, full_gradient)
        step = self.find_optimal_step(param_vector, param_delta)
        updated_params = param_vector + step * param_delta
        updates = setup_parameter_updates(params, updated_params)

        # These values have to be computed before the assignments below.
        # Otherwise TensorFlow's parallel execution can reorder the updates
        # and, for example, the stored previous gradient could end up equal
        # to the current gradient.
        required_variables = [new_inv_hessian, param_vector, full_gradient]
        with tf.control_dependencies(required_variables):
            updates.extend([
                inv_hessian.assign(new_inv_hessian),
                prev_params.assign(param_vector),
                prev_full_gradient.assign(full_gradient),
                iteration.assign(iteration + 1),
            ])

        return updates
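Example 5 leans on a couple of NeuPy helpers, in particular make_single_vector, which packs all trainable parameters (or gradients) into one flat vector so the quasi-Newton formulas can treat the whole network as a single variable. The version below is only a plausible minimal reading of that helper, assumed here rather than taken from the library.

import tensorflow as tf


def make_single_vector(tensors):
    # Flatten every tensor and concatenate the pieces into one 1-D vector.
    # Assumed implementation; NeuPy's actual helper may differ.
    return tf.concat([tf.reshape(tensor, [-1]) for tensor in tensors], axis=0)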
Example 6
        def free_energy(visible_sample):
            with tf.name_scope('free-energy'):
                wx = tf.matmul(visible_sample, self.weight)
                wx_b = wx + self.hidden_bias

                visible_bias_term = dot(visible_sample, self.visible_bias)

                # The exponent overflows to infinity when wx_b is a relatively
                # large number (around 100 or more), because float32 cannot
                # represent exp of such values. But when wx_b is that large,
                # the +1 inside the logarithm is negligible, so we can use
                # wx_b directly: log(1 + exp(wx_b)) is effectively wx_b.
                hidden_terms = tf.where(
                    # exp(30) is such a big number that +1 won't
                    # make any difference in the outcome.
                    tf.greater(wx_b, 30),
                    wx_b,
                    tf.log1p(tf.exp(wx_b)),
                )

                hidden_term = tf.reduce_sum(hidden_terms, axis=1)
                return -(visible_bias_term + hidden_term)
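The overflow described in the comment is easy to reproduce. The short NumPy check below shows that exp already overflows in float32 for moderately large inputs, which is why the code falls back to wx_b itself instead of log1p(exp(wx_b)).

import numpy as np

x = np.float32(100.0)
print(np.exp(x))            # inf: exp(100) does not fit into float32
print(np.log1p(np.exp(x)))  # inf: the naive softplus formula breaks
print(x)                    # 100.0: the value the tf.where branch falls back to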
Example 7
def dai_yuan(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Dai-Yuan formula for the conjugate gradient update coefficient.
    """
    return safe_division(
        dot(new_g, new_g),
        dot(new_g - old_g, delta_w),
        epsilon,
    )
Example 8
def liu_storey(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Liu-Storey formula for the conjugate gradient update coefficient.
    """
    return -safe_division(
        dot(new_g, new_g - old_g),
        dot(delta_w, old_g),
        epsilon,
    )
Example 9
def polak_ribiere(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Polak-Ribiere formula for the conjugate gradient update coefficient.
    """
    return safe_division(
        dot(new_g, new_g - old_g),
        dot(old_g, old_g),
        epsilon,
    )
Example 10
def fletcher_reeves(old_g, new_g, delta_w, epsilon=1e-7):
    """
    Fletcher-Reeves formula for the conjugate gradient update coefficient.
    """
    return safe_division(
        dot(new_g, new_g),
        dot(old_g, old_g),
        epsilon,
    )
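All of these functions compute the beta coefficient of nonlinear conjugate gradient. The sketch below (NumPy, with invented gradient values) shows where that coefficient goes: the new search direction is the negative gradient plus the previous direction scaled by beta, here using the Fletcher-Reeves formula.

import numpy as np

old_g = np.array([1.0, -2.0])           # gradient at the previous point
new_g = np.array([0.5, 0.4])            # gradient at the current point
prev_direction = np.array([-1.0, 2.0])  # previous search direction

beta = (new_g @ new_g) / (old_g @ old_g)        # fletcher_reeves
new_direction = -new_g + beta * prev_direction

# A line search along new_direction then gives the next delta_w.
print(beta, new_direction)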