Example no. 1
  def testMat2dToFullyConnectedLayerParamsTensor(self):
    with ops.Graph().as_default(), self.test_session() as sess:
      random_seed.set_random_seed(200)
      vector_template = self._fully_connected_layer_params()[0]
      mat2d = array_ops.constant([[5., 4.], [3., 2.]])

      output = sess.run(utils.mat2d_to_layer_params(vector_template, mat2d))

      self.assertAllClose(output, np.array([[5., 4.], [3., 2.]]))
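
For context, the behavior this test exercises can be sketched in plain NumPy: when the parameter template is a single Tensor, mat2d_to_layer_params amounts to reshaping the 2-D matrix back to the template's shape. The helper below is a hypothetical stand-in for illustration only, not the actual kfac.utils implementation.

import numpy as np

def mat2d_to_layer_params_sketch(vector_template, mat2d):
  # Single (non-tuple) template: just reshape the 2-D matrix to its shape.
  return np.reshape(mat2d, np.shape(vector_template))

template = np.zeros([2, 2])                 # stands in for the layer weights
mat2d = np.array([[5., 4.], [3., 2.]])
out = mat2d_to_layer_params_sketch(template, mat2d)
assert out.shape == (2, 2) and np.allclose(out, mat2d)
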
Example no. 2
 def multiply_matpower(self, vector, exp):
   reshaped_vector = utils.layer_params_to_mat2d(vector)
   reshaped_out = self._output_factor.right_multiply_matpower(
       reshaped_vector, exp, self._output_damping_func)
   reshaped_out = self._input_factor.left_multiply_matpower(
       reshaped_out, exp, self._input_damping_func)
   if self._renorm_coeff != 1.0:
     renorm_coeff = math_ops.cast(self._renorm_coeff, dtype=reshaped_out.dtype)
     reshaped_out *= math_ops.cast(renorm_coeff**exp, dtype=reshaped_out.dtype)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
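
The block above is a Kronecker-factored approximation: an input factor A and an output factor G, so applying the block raised to a power exp to the parameter matrix V reduces to multiplying V by A**exp on the left and G**exp on the right. A minimal NumPy sketch of that identity with hypothetical dense factors, ignoring the damping functions and the renorm coefficient:

import numpy as np

def sym_matpower(mat, exp):
  # Matrix power of a symmetric positive-definite matrix via eigendecomposition.
  evals, evecs = np.linalg.eigh(mat)
  return (evecs * evals**exp) @ evecs.T

rng = np.random.default_rng(0)
A = rng.normal(size=(3, 3)); A = A @ A.T + np.eye(3)   # input factor
G = rng.normal(size=(2, 2)); G = G @ G.T + np.eye(2)   # output factor
V = rng.normal(size=(3, 2))                             # layer params as a 2-D matrix

# (A kron G)^exp applied to the row-major flattening of V equals A^exp @ V @ G^exp.
exp = -1
lhs = sym_matpower(np.kron(A, G), exp) @ V.reshape(-1)
rhs = (sym_matpower(A, exp) @ V @ sym_matpower(G, exp)).reshape(-1)
assert np.allclose(lhs, rhs)
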
Example no. 3
 def multiply_inverse(self, vector):
     reshaped_vector = utils.layer_params_to_mat2d(vector)
     reshaped_out = self._output_factor.right_multiply_inverse(
         reshaped_vector, self._output_damping)
     reshaped_out = self._input_factor.left_multiply_inverse(
         reshaped_out, self._input_damping)
     if self._renorm_coeff != 1.0:
         reshaped_out /= math_ops.cast(self._renorm_coeff,
                                       dtype=reshaped_out.dtype)
     return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 4
 def multiply_matpower(self, vector, exp):
   reshaped_vector = utils.layer_params_to_mat2d(vector)
   reshaped_out = self._output_factor.right_multiply_matpower(
       reshaped_vector, exp, self._output_damping_func)
   reshaped_out = self._input_factor.left_multiply_matpower(
       reshaped_out, exp, self._input_damping_func)
   if self._renorm_coeff != 1.0:
     reshaped_out *= math_ops.cast(
         self._renorm_coeff**exp, dtype=reshaped_out.dtype)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 5
    def testMat2dToFullyConnectedLayerParamsTensor(self):
        with ops.Graph().as_default(), self.test_session() as sess:
            random_seed.set_random_seed(200)
            vector_template = self._fully_connected_layer_params()[0]
            mat2d = array_ops.constant([[5., 4.], [3., 2.]])

            output = sess.run(
                utils.mat2d_to_layer_params(vector_template, mat2d))

            self.assertAllClose(output, np.array([[5., 4.], [3., 2.]]))
Example no. 6
 def multiply_inverse(self, vector):
   left_factor_inv = self._input_factor.get_inverse(self._input_damping)
   right_factor_inv = self._output_factor.get_inverse(self._output_damping)
   reshaped_vector = utils.layer_params_to_mat2d(vector)
   reshaped_out = math_ops.matmul(left_factor_inv,
                                  math_ops.matmul(reshaped_vector,
                                                  right_factor_inv))
   if self._renorm_coeff != 1.0:
     reshaped_out /= math_ops.cast(
         self._renorm_coeff, dtype=reshaped_out.dtype)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 7
 def multiply(self, vector):
     left_factor = self._input_factor.get_cov()
     right_factor = self._output_factor.get_cov()
     reshaped_vector = utils.layer_params_to_mat2d(vector)
     reshaped_out = (math_ops.matmul(reshaped_vector, right_factor) +
                     self._output_damping * reshaped_vector)
     reshaped_out = (math_ops.matmul(left_factor, reshaped_out) +
                     self._input_damping * reshaped_out)
     if self._renorm_coeff != 1.0:
         reshaped_out *= math_ops.cast(self._renorm_coeff,
                                       dtype=reshaped_out.dtype)
     return utils.mat2d_to_layer_params(vector, reshaped_out)
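
The two examples above are two sides of the same computation: the forward multiply applies per-factor damping, making it equivalent to (A + d_in*I) @ V @ (G + d_out*I), and the inverse multiply undoes it, assuming get_inverse returns the damped factor inverse. A small NumPy check with hypothetical dense factors, ignoring the renorm coefficient:

import numpy as np

rng = np.random.default_rng(1)
A = rng.normal(size=(3, 3)); A = A @ A.T       # input covariance factor
G = rng.normal(size=(2, 2)); G = G @ G.T       # output covariance factor
V = rng.normal(size=(3, 2))                    # layer params as a 2-D matrix
d_in, d_out = 0.1, 0.2                         # per-factor damping

# Forward multiply, written as in the example above.
out = V @ G + d_out * V
out = A @ out + d_in * out
assert np.allclose(out, (A + d_in * np.eye(3)) @ V @ (G + d_out * np.eye(2)))

# Multiplying by the damped factor inverses recovers V.
A_inv = np.linalg.inv(A + d_in * np.eye(3))
G_inv = np.linalg.inv(G + d_out * np.eye(2))
assert np.allclose(A_inv @ out @ G_inv, V)
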
Example no. 8
 def multiply(self, vector):
   left_factor = self._input_factor.get_cov()
   right_factor = self._output_factor.get_cov()
   reshaped_vector = utils.layer_params_to_mat2d(vector)
   reshaped_out = (math_ops.matmul(reshaped_vector, right_factor) +
                   self._output_damping * reshaped_vector)
   reshaped_out = (math_ops.matmul(left_factor, reshaped_out) +
                   self._input_damping * reshaped_out)
   if self._renorm_coeff != 1.0:
     reshaped_out *= math_ops.cast(
         self._renorm_coeff, dtype=reshaped_out.dtype)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 9
  def testMat2dToFullyConnectedLayerParamsTuple(self):
    with ops.Graph().as_default(), self.test_session() as sess:
      random_seed.set_random_seed(200)
      vector_template = self._fully_connected_layer_params()
      mat2d = array_ops.constant([[5., 4.], [3., 2.], [1., 0.]])

      output = sess.run(utils.mat2d_to_layer_params(vector_template, mat2d))

      self.assertIsInstance(output, tuple)
      self.assertEqual(len(output), 2)
      a, b = output
      self.assertAllClose(a, np.array([[5., 4.], [3., 2.]]))
      self.assertAllClose(b, np.array([1., 0.]))
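
A rough NumPy sketch of the tuple case being tested: when the template is a (weights, bias) pair, the last row of the 2-D matrix is interpreted as the bias and the remaining rows as the weights. This mirrors what the assertions expect; the real kfac.utils code may differ in its details.

import numpy as np

def mat2d_to_layer_params_sketch(template, mat2d):
  if isinstance(template, (tuple, list)):
    weights_tmpl, bias_tmpl = template
    # All rows but the last map to the weights, the last row to the bias.
    return (np.reshape(mat2d[:-1], np.shape(weights_tmpl)),
            np.reshape(mat2d[-1], np.shape(bias_tmpl)))
  return np.reshape(mat2d, np.shape(template))

weights, bias = mat2d_to_layer_params_sketch(
    (np.zeros([2, 2]), np.zeros([2])),
    np.array([[5., 4.], [3., 2.], [1., 0.]]))
assert np.allclose(weights, [[5., 4.], [3., 2.]])
assert np.allclose(bias, [1., 0.])
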
Example no. 10
    def testMat2dToFullyConnectedLayerParamsTuple(self):
        with ops.Graph().as_default(), self.test_session() as sess:
            random_seed.set_random_seed(200)
            vector_template = self._fully_connected_layer_params()
            mat2d = array_ops.constant([[5., 4.], [3., 2.], [1., 0.]])

            output = sess.run(
                utils.mat2d_to_layer_params(vector_template, mat2d))

            self.assertIsInstance(output, tuple)
            self.assertEqual(len(output), 2)
            a, b = output
            self.assertAllClose(a, np.array([[5., 4.], [3., 2.]]))
            self.assertAllClose(b, np.array([1., 0.]))
Example no. 11
    def multiply(self, vector):
        """Approximate damped Fisher-vector product.

        Args:
          vector: Tensor or 2-tuple of Tensors. If self._has_bias, a 2-tuple of
            a Tensor of shape [input_size, output_size] corresponding to the
            layer's weights and a Tensor of shape [output_size] corresponding
            to the layer's bias. If not, just the weights Tensor.

        Returns:
          A Tensor or 2-tuple of Tensors with the same shapes as `vector`,
          corresponding to the Fisher-vector product.
        """
        reshaped_vect = utils.layer_params_to_mat2d(vector)
        reshaped_out = reshaped_vect * (self._factor.get_cov() + self._damping)
        return utils.mat2d_to_layer_params(vector, reshaped_out)
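
In this diagonal variant the factor's covariance presumably has the same shape as the reshaped parameter matrix, and the multiply is elementwise: each parameter is scaled by its own (damped) second moment rather than by a Kronecker-factored matrix. A short NumPy sketch with made-up values:

import numpy as np

cov = np.array([[0.5, 1.0], [2.0, 4.0]])   # per-parameter second moments
V = np.array([[1.0, 1.0], [1.0, 1.0]])     # layer params as a 2-D matrix
damping = 0.01

out = V * (cov + damping)                  # elementwise; no matmul involved
assert np.allclose(out, [[0.51, 1.01], [2.01, 4.01]])
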
Example no. 12
  def multiply(self, vector):
    """Approximate damped Fisher-vector product.

    Args:
      vector: Tensor or 2-tuple of Tensors. If self._has_bias, a 2-tuple of a
        Tensor of shape [input_size, output_size] corresponding to the layer's
        weights and a Tensor of shape [output_size] corresponding to the
        layer's bias. If not, just the weights Tensor.

    Returns:
      A Tensor or 2-tuple of Tensors with the same shapes as `vector`,
      corresponding to the Fisher-vector product.
    """
    reshaped_vect = utils.layer_params_to_mat2d(vector)
    reshaped_out = reshaped_vect * (self._factor.get_cov() + self._damping)
    return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 13
  def multiply_matpower(self, vector, exp):
    """Multiplies the vector by the (damped) matrix-power of the block.

    Args:
      vector: Tensor or 2-tuple of Tensors. If self._has_bias, a 2-tuple of a
        Tensor of shape [input_size, output_size] corresponding to the layer's
        weights and a Tensor of shape [output_size] corresponding to the
        layer's bias. If not, just the weights Tensor.
      exp: A scalar representing the power to which the block is raised before
        it is multiplied by the vector.

    Returns:
      The vector left-multiplied by the (damped) matrix-power of the block.
    """
    reshaped_vec = utils.layer_params_to_mat2d(vector)
    reshaped_out = self._factor.left_multiply_matpower(
        reshaped_vec, exp, self._damping_func)
    return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 14
  def multiply_matpower(self, vector, exp):
    """Multiplies the vector by the (damped) matrix-power of the block.

    Args:
      vector: Tensor or 2-tuple of Tensors. If self._has_bias, a 2-tuple of a
        Tensor of shape [input_size, output_size] corresponding to the layer's
        weights and a Tensor of shape [output_size] corresponding to the
        layer's bias. If not, just the weights Tensor.
      exp: A scalar representing the power to which the block is raised before
        it is multiplied by the vector.

    Returns:
      The vector left-multiplied by the (damped) matrix-power of the block.
    """
    reshaped_vec = utils.layer_params_to_mat2d(vector)
    reshaped_out = self._factor.left_multiply_matpower(
        reshaped_vec, exp, self._damping_func)
    return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 15
  def multiply_matpower(self, vector, exp):
    if exp != -1:
      raise NotImplementedError("FullyConnectedSeriesFB only supports inverse "
                                "multiplications.")

    # pylint: disable=invalid-name

    Z = utils.layer_params_to_mat2d(vector)

    # Derivations were done for "batch_dim==1" case so we need to convert to
    # that orientation:
    Z = array_ops.transpose(Z)

    if self._option == SeriesFBApproximation.option1:

      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
      L_A, psi_A = self._input_factor.get_option1quants(
          self._input_damping_func)
      L_G, psi_G = self._output_factor.get_option1quants(
          self._output_damping_func)

      def gamma(x):
        # We are assuming that each case has the same number of time-steps.
        # If this stops being the case one shouldn't simply replace this T
        # with its average value.  Instead, one needs to go back to the
        # definition of the gamma function from the paper.
        T = self._num_timesteps
        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))

      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
      # Even though Y is Z-independent, we are recomputing it from the psi's
      # each time, since Y depends on both A and G quantities and it is
      # relatively cheap to compute.
      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)

      # Z = L_G^T * Z * L_A
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = U_G^T * Z * U_A
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)

      # Z = Z .* Y
      Z *= Y

      # Z = L_G * Z * L_A^T
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = U_G * Z * U_A^T
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))

    elif self._option == SeriesFBApproximation.option2:

      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
      P_A, K_A, mu_A = self._input_factor.get_option2quants(
          self._input_damping_func)
      P_G, K_G, mu_G = self._output_factor.get_option2quants(
          self._output_damping_func)

      # Our approach differs superficially from the pseudo-code in the paper
      # in order to reduce the total number of matrix-matrix multiplies.
      # In particular, the first three computations in the pseudo code are
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = Z - hPsi_G^T * Z * hPsi_A
      # Z = E_G^T * Z * E_A
      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
      # the entire computation can be written as
      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
      # This final expression is computed by the following two lines:
      # Z = Z - P_G * Z * P_A^T
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
      # Z = K_G^T * Z * K_A
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)

      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
      # Be careful with the outer product.  We don't want to accidentally
      # make it an inner-product instead.
      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
      Z /= tmp

      # We now perform the transpose/reverse version of the operations
      # derived above, whose derivation from the original pseudo-code is
      # analogous.
      # Z = K_G * Z * K_A^T
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))

      # Z = Z - P_G^T * Z * P_A
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)

      # Normalize: Z = (1/E[T]) * Z
      # Note that this normalization is done because we compute the statistics
      # by averaging, not summing, over time. (And the gradient is presumably
      # summed over time, not averaged, and thus their scales are different.)
      Z /= math_ops.cast(self._num_timesteps, Z.dtype)

    # Convert back to the "batch_dim==0" orientation.
    Z = array_ops.transpose(Z)

    return utils.mat2d_to_layer_params(vector, Z)
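
As a sanity check on the closed form used by gamma above: for 0 < x < 1, the expression (1 - x)**2 / (T*(1 - x**2) - 2*x*(1 - x**T)) is the reciprocal of the double sum of x**|s - t| over s, t = 1..T. The short check below illustrates that algebra only; it is not part of the library:

import numpy as np

def gamma(x, T):
  return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))

T, x = 5, 0.3
double_sum = sum(x**abs(s - t) for s in range(T) for t in range(T))
assert np.isclose(gamma(x, T), 1.0 / double_sum)
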
Example no. 16
 def multiply(self, vector):
     reshaped_vect = utils.layer_params_to_mat2d(vector)
     reshaped_out = reshaped_vect * (self._factor.get_cov() + self._damping)
     return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 17
 def multiply(self, vector):
   reshaped_vect = utils.layer_params_to_mat2d(vector)
   reshaped_out = reshaped_vect * (self._factor.get_cov() + self._damping)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 18
 def multiply(self, vector):
   reshaped_vect = utils.layer_params_to_mat2d(vector)
   reshaped_out = self._factor.left_multiply(
       reshaped_vect, self._damping)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 19
 def multiply_matpower(self, vector, exp):
   reshaped_vect = utils.layer_params_to_mat2d(vector)
   reshaped_out = self._factor.left_multiply_matpower(
       reshaped_vect, exp, self._damping_func)
   return utils.mat2d_to_layer_params(vector, reshaped_out)
Example no. 20
  def multiply_inverse(self, vector):
    # pylint: disable=invalid-name

    Z = utils.layer_params_to_mat2d(vector)

    # Derivations were done for "batch_dim==1" case so we need to convert to
    # that orientation:
    Z = array_ops.transpose(Z)

    if self._option == SeriesFBApproximation.option1:

      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)

      def gamma(x):
        # We are assuming that each case has the same number of time-steps.
        # If this stops being the case one shouldn't simply replace this T
        # with its average value.  Instead, one needs to go back to the
        # definition of the gamma function from the paper.
        T = self._num_timesteps
        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))

      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
      # Even though Y is Z-independent, we are recomputing it from the psi's
      # each time, since Y depends on both A and G quantities and it is
      # relatively cheap to compute.
      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)

      # Z = L_G^T * Z * L_A
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = U_G^T * Z * U_A
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)

      # Z = Z .* Y
      Z *= Y

      # Z = L_G * Z * L_A^T
      # This is equivalent to the following computation from the original
      # pseudo-code:
      # Z = U_G * Z * U_A^T
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))

    elif self._option == SeriesFBApproximation.option2:

      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
      P_G, K_G, mu_G = self._output_factor.get_option2quants(
          self._damping_output)

      # Our approach differs superficially from the pseudo-code in the paper
      # in order to reduce the total number of matrix-matrix multiplies.
      # In particular, the first three computations in the pseudo code are
      # Z = G0^(-1/2) * Z * A0^(-1/2)
      # Z = Z - hPsi_G^T * Z * hPsi_A
      # Z = E_G^T * Z * E_A
      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
      # the entire computation can be written as
      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
      # This final expression is computed by the following two lines:
      # Z = Z - P_G * Z * P_A^T
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
      # Z = K_G^T * Z * K_A
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)

      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
      # Be careful with the outer product.  We don't want to accidentally
      # make it an inner-product instead.
      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
      Z /= tmp

      # We now perform the transpose/reverse version of the operations
      # derived above, whose derivation from the original pseudo-code is
      # analogous.
      # Z = K_G * Z * K_A^T
      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))

      # Z = Z - P_G^T * Z * P_A
      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)

      # Normalize: Z = (1/E[T]) * Z
      # Note that this normalization is done because we compute the statistics
      # by averaging, not summing, over time. (And the gradient is presumably
      # summed over time, not averaged, and thus their scales are different.)
      Z /= math_ops.cast(self._num_timesteps, Z.dtype)

    # Convert back to the "batch_dim==0" orientation.
    Z = array_ops.transpose(Z)

    return utils.mat2d_to_layer_params(vector, Z)
Example no. 21
 def multiply_matpower(self, vector, exp):
   reshaped_vect = utils.layer_params_to_mat2d(vector)
   reshaped_out = self._factor.left_multiply_matpower(
       reshaped_vect, exp, self._damping_func)
   return utils.mat2d_to_layer_params(vector, reshaped_out)