  def testL2Regularization(self):
    # Define Loss(x) := ||x - a||_2**2, where a is a constant.
    # Set l1_regularizer = 0 and l2_regularizer = 1.
    # Then the regularized loss is
    #
    #     ||x - a||_2**2 + ||x||_2**2
    #
    # And the true optimum is x = 0.5 * a.
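    # (Derivation: the gradient of the regularized loss is
    # 2 * (x - a) + 2 * x = 4 * x - 2 * a, which vanishes at x = 0.5 * a.)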
    n = 100
    np.random.seed(42)
    a_ = np.random.random(size=(n,))
    a = self._adjust_dtype_and_shape_hints(a_)

    def _grad_and_hessian_unregularized_loss_fn(x):
      grad = 2 * (x - a)
      hessian_outer = tf.eye(n, dtype=a.dtype)
      hessian_middle = 2. * tf.ones_like(a)
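      # The Hessian of Loss is 2 * I, supplied here in the factored
      # (outer, middle) form consumed by `minimize_sparse`; assuming the
      # convention Hessian = outer^T @ diag(middle) @ outer, this gives
      # I @ diag(2 * ones) @ I = 2 * I.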
      return grad, hessian_outer, hessian_middle

    w, is_converged, num_iter = minimize_sparse(
        _grad_and_hessian_unregularized_loss_fn,
        x_start=tf.zeros_like(a_, dtype=self.dtype),
        l1_regularizer=0.,
        l2_regularizer=1.,
        maximum_iterations=4,
        maximum_full_sweeps_per_iteration=4,
        tolerance=1e-5,
        learning_rate=1.)
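    # minimize_sparse returns the estimated minimizer, a boolean convergence
    # flag, and the number of outer-loop iterations performed (see testNumIter
    # below).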

    w_, is_converged_, _ = self.evaluate([w, is_converged, num_iter])

    expected_w = 0.5 * a
    self.assertAllEqual(is_converged_, True)
    self.assertAllClose(w_, expected_w, atol=0., rtol=0.03)
  def testFindingSparseSolution(self):
    # Test that Proximal Hessian descent prefers sparse solutions when
    # l1_regularization_weight is large enough.
    #
    # Define
    #
    #     Loss(x) := (x[0] - a[0])**2 + epsilon * sum(
    #                    (x[i] - a[i])**2 for i in range(1, n))
    #
    # where `a` is a constant and epsilon is small.  Set
    # l2_regularization_weight = 0 and set l1_regularization_weight such that
    #
    #     epsilon << l1_regularization_weight << 1.
    #
    # L1 regularization should cause the computed optimum to have zeros in all
    # but the 0th coordinate: optimal_x ~= [a[0], 0, ..., 0].
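    # (Informally: at x = 0 the gradient in coordinate i >= 1 has magnitude
    # 2 * epsilon * a[i] < 2e-6, far below the L1 weight 1e-2, so the
    # soft-threshold proximal update keeps those coordinates exactly at zero,
    # while coordinate 0, with gradient magnitude 2 * a0 = 12, is updated.)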
    n = 100
    epsilon = 1e-6
    # Set a[0] explicitly to make sure it's not very close to zero
    a0 = 6.
    np.random.seed(10)
    a_ = np.concatenate([[a0], np.random.random(size=(n - 1,))], axis=0)
    a = self.adjust_dtype_and_shape_hints(a_)

    def _grad_and_hessian_unregularized_loss_fn(x):
      diff = x - a
      grad = 2. * tf.concat([[diff[0]], epsilon * diff[1:]], axis=0)
      hessian_outer = tf.SparseTensor(
          indices=[(i, i) for i in range(n)],
          values=tf.ones_like(a),
          dense_shape=[n, n])
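      # The Hessian of Loss is diag([2, 2 * epsilon, ..., 2 * epsilon]);
      # hessian_outer is the n x n identity (as a SparseTensor) and
      # hessian_middle carries these per-coordinate curvatures.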
      hessian_middle = 2. * tf.concat(
          [[1.], epsilon * tf.ones([n - 1], dtype=self.dtype)], axis=0)
      return grad, hessian_outer, hessian_middle

    w, is_converged, num_iter = minimize_sparse(
        _grad_and_hessian_unregularized_loss_fn,
        x_start=tf.zeros([n], dtype=self.dtype),
        l1_regularization_weight=1e-2,
        l2_regularization_weight=None,
        maximum_iterations=10,
        maximum_full_sweeps_per_iteration=10,
        tolerance=1e-5,
        learning_rate=1.)

    init_op = tf.global_variables_initializer()
    self.evaluate(init_op)
    w_, is_converged_, _ = self.evaluate([w, is_converged, num_iter])

    expected_w = tf.concat([[a[0]], tf.zeros([n - 1], self.dtype)], axis=0)
    # Using atol=0 ensures that w must be exactly zero in all coordinates
    # where expected_w is exactly zero.
    self.assertAllEqual(is_converged_, True)
    self.assertAllClose(w_, expected_w, atol=0., rtol=1e-3)
  def testNumIter(self):
    # Same as testL2Regularization, except we set
    # maximum_full_sweeps_per_iteration = 1 and check that the number of sweeps
    # equals what we expect (usually we don't know the exact number,
    # but in this simple case we do -- explanation below).
    #
    # Since l1_regularizer = 0, the soft threshold operator is actually the
    # identity operator, hence the `minimize_sparse` algorithm becomes literally
    # coordinatewise Newton's method being used to find the zeros of grad
    # Loss(x), which in this case is a linear function of x.  Hence Newton's
    # method should find the exact correct answer in 1 sweep.  At the end of the
    # first sweep the algorithm does not yet know it has converged; it takes a
    # second sweep, when the algorithm notices that its answer hasn't changed at
    # all, to become aware that convergence has happened.  Hence we expect two
    # sweeps.  With maximum_full_sweeps_per_iteration = 1, this means we
    # expect 2 iterations of the outer loop.
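    # (Worked single-coordinate check: with l2_regularizer = 1 the regularized
    # gradient at x = 0 is -2 * a and the regularized curvature is 2 + 2 = 4,
    # so one Newton step gives x = 0 - (-2 * a) / 4 = 0.5 * a, the exact
    # optimum.)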
    n = 100
    np.random.seed(42)
    a_ = np.random.random(size=(n,))
    a = self._adjust_dtype_and_shape_hints(a_)

    def _grad_and_hessian_unregularized_loss_fn(x):
      grad = 2 * (x - a)
      hessian_outer = tf.linalg.diag(tf.ones_like(a))
      hessian_middle = 2. * tf.ones_like(a)
      return grad, hessian_outer, hessian_middle

    w, is_converged, num_iter = minimize_sparse(
        _grad_and_hessian_unregularized_loss_fn,
        x_start=tf.zeros_like(a_, dtype=self.dtype),
        l1_regularizer=0.,
        l2_regularizer=1.,
        maximum_iterations=4,
        maximum_full_sweeps_per_iteration=1,
        tolerance=1e-5,
        learning_rate=1.)

    w_, is_converged_, num_iter_ = self.evaluate([w, is_converged, num_iter])

    expected_w = 0.5 * a
    self.assertAllEqual(is_converged_, True)
    self.assertAllEqual(num_iter_, 2)
    self.assertAllClose(w_, expected_w, atol=0., rtol=0.03)
  def _test_finding_sparse_solution(self, batch_shape=None):
    # Test that Proximal Hessian descent prefers sparse solutions when
    # l1_regularizer is large enough.
    #
    # Define
    #
    #     Loss(x) := (x[0] - a[0])**2 + epsilon * sum(
    #                    (x[i] - a[i])**2 for i in range(1, n))
    #
    # where `a` is a constant and epsilon is small.  Set
    # l2_regularizer = 0 and set l1_regularizer such that
    #
    #     epsilon << l1_regularizer << 1.
    #
    # L1 regularization should cause the computed optimum to have zeros in all
    # but the 0th coordinate: optimal_x ~= [a[0], 0, ..., 0].
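    # When `batch_shape` is non-empty, each batch member is an independent
    # copy of this sparse-recovery problem (a[..., 0] = a0, remaining
    # coordinates drawn uniformly at random), and a single call to
    # `minimize_sparse` solves all of them at once.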
    n = 10
    epsilon = 1e-6
    if batch_shape is None:
      batch_shape = []
    # Set a[0] explicitly to make sure it's not very close to zero
    a0 = 6.
    a_ = np.concatenate(
        [np.full(batch_shape + [1], a0),
         np.random.random(size=batch_shape + [n - 1])],
        axis=-1)
    a = self._adjust_dtype_and_shape_hints(a_)

    def _grad_and_hessian_unregularized_loss_fn(x):
      diff = x - a
      grad = 2. * tf.concat([diff[..., :1], epsilon * diff[..., 1:]], axis=-1)

      hessian_outer = tf.SparseTensor(
          indices=[
              b + (i, i) for i in range(n) for b in np.ndindex(*batch_shape)
          ],
          values=tf.ones(shape=[np.prod(batch_shape) * n], dtype=self.dtype),
          dense_shape=batch_shape + [n, n])
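      # (hessian_outer above is a batch of n x n identity matrices encoded as
      # one SparseTensor: a single (i, i) diagonal entry, with value 1, per
      # coordinate per batch member.)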

      hessian_middle_per_batch = 2 * tf.concat(
          [[1.], epsilon * tf.ones([n - 1], dtype=self.dtype)], axis=0)
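      # Adding zeros of shape batch_shape + [n] broadcasts the same
      # per-coordinate curvature profile across every batch member.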
      hessian_middle = tf.zeros(
          batch_shape + [n], dtype=self.dtype) + hessian_middle_per_batch
      return grad, hessian_outer, hessian_middle

    w, is_converged, num_iter = minimize_sparse(
        _grad_and_hessian_unregularized_loss_fn,
        x_start=tf.zeros(batch_shape + [n], dtype=self.dtype),
        l1_regularizer=1e-2,
        l2_regularizer=None,
        maximum_iterations=10,
        maximum_full_sweeps_per_iteration=10,
        tolerance=1e-5,
        learning_rate=1.)

    w_, is_converged_, _ = self.evaluate([w, is_converged, num_iter])

    expected_w = tf.concat(
        [a[..., :1], tf.zeros(batch_shape + [n - 1], self.dtype)], axis=-1)

    # Using atol=0 ensures that w must be exactly zero in all coordinates
    # where expected_w is exactly zero.
    self.assertAllEqual(is_converged_, True)
    self.assertAllClose(w_, expected_w, atol=0., rtol=1e-3)