def multiply_inverse(self, vector):
    """Multiply `vector` by the damped inverses of both factors.

    The parameters are flattened to a 2-D matrix `V`, and the result is
    `input_inv @ (V @ output_inv)`, where each inverse comes from the
    corresponding factor's `get_damped_inverse` called with this block's
    damping value. When `self._renorm_coeff` differs from 1.0 the product
    is additionally divided by it.

    Args:
      vector: Layer parameters in whatever structure
        `utils.layer_params_to_mat2d` accepts.

    Returns:
      The product, converted back by `utils.mat2d_to_layer_params` using
      `vector` as the structural template.
    """
    input_inv = self._input_factor.get_damped_inverse(self._input_damping)
    output_inv = self._output_factor.get_damped_inverse(
        self._output_damping)

    mat = utils.layer_params_to_mat2d(vector)

    # Apply the output-side inverse on the right first, then the
    # input-side inverse on the left.
    right_applied = math_ops.matmul(mat, output_inv)
    product = math_ops.matmul(input_inv, right_applied)

    if self._renorm_coeff != 1.0:
        # Cast so the coefficient matches the dtype of the product.
        coeff = math_ops.cast(self._renorm_coeff, dtype=product.dtype)
        product /= coeff

    return utils.mat2d_to_layer_params(vector, product)
def multiply(self, vector):
    """Multiply `vector` by the damped covariance factors.

    With `V` the 2-D form of `vector`, `A` / `G` the input / output factor
    covariances, the computation is:

      W   = V @ G + output_damping * V
      out = A @ W + input_damping * W

    When `self._renorm_coeff` differs from 1.0 the result is additionally
    multiplied by it.

    Args:
      vector: Layer parameters in whatever structure
        `utils.layer_params_to_mat2d` accepts.

    Returns:
      The product, converted back by `utils.mat2d_to_layer_params` using
      `vector` as the structural template.
    """
    input_cov = self._input_factor.get_cov()
    output_cov = self._output_factor.get_cov()

    mat = utils.layer_params_to_mat2d(vector)

    # Right-hand (output) side: covariance product plus damping term.
    right_applied = math_ops.matmul(mat, output_cov)
    right_applied = right_applied + self._output_damping * mat

    # Left-hand (input) side, applied to the intermediate result.
    product = math_ops.matmul(input_cov, right_applied)
    product = product + self._input_damping * right_applied

    if self._renorm_coeff != 1.0:
        # Cast so the coefficient matches the dtype of the product.
        coeff = math_ops.cast(self._renorm_coeff, dtype=product.dtype)
        product *= coeff

    return utils.mat2d_to_layer_params(vector, product)
def multiply(self, vector):
    """Approximate damped Fisher-vector product.

    The parameters are flattened to a 2-D matrix and multiplied
    element-wise (`*`) by `self._factor.get_cov() + self._damping`.

    Args:
      vector: Tensor or 2-tuple of Tensors holding the layer's parameters:
        a weights Tensor of shape [input_size, output_size], optionally
        paired with a bias Tensor of shape [output_size].
        NOTE(review): presumably the 2-tuple form is the one used when the
        layer has a bias -- confirm against the callers.

    Returns:
      The product, in the same structure as `vector`.
    """
    mat = utils.layer_params_to_mat2d(vector)
    damped_cov = self._factor.get_cov() + self._damping
    scaled = mat * damped_cov
    return utils.mat2d_to_layer_params(vector, scaled)
def multiply_inverse(self, vector):
    """Multiply `vector` by this block's approximate inverse.

    The parameters are flattened to a 2-D matrix `Z`, transposed into the
    "batch_dim==1" orientation the derivations assume, transformed
    according to `self._option` (option1 or option2 of
    `SeriesFBApproximation`), transposed back, and returned in the
    structure of `vector`. The inline comments relate each step to the
    pseudo-code of the paper this implementation follows.

    If `self._option` matches neither option, `Z` is only transposed twice
    and is therefore returned unchanged.

    Args:
      vector: Layer parameters in whatever structure
        `utils.layer_params_to_mat2d` accepts.

    Returns:
      The result, converted back by `utils.mat2d_to_layer_params` using
      `vector` as the structural template.
    """
    # Single upper-case names (Z, Y, T, L_A, ...) mirror the paper's notation.
    # pylint: disable=invalid-name

    Z = utils.layer_params_to_mat2d(vector)

    # Derivations were done for "batch_dim==1" case so we need to convert to
    # that orientation:
    Z = array_ops.transpose(Z)

    if self._option == SeriesFBApproximation.option1:

        # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
        L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
        L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)

        def gamma(x):
            # We are assuming that each case has the same number of
            # time-steps. If this stops being the case one shouldn't simply
            # replace this T with its average value. Instead, one needs to go
            # back to the definition of the gamma function from the paper.
            T = self._num_timesteps
            return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))

        # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
        # Even though Y is Z-independent we are recomputing it from the psi's
        # each time since Y depends on both A and G quantities, and it is
        # relatively cheap to compute.
        # Reshaping psi_G to [n, -1] before the `*` makes the product
        # broadcast into an outer product (assumes psi_G and psi_A are 1-D
        # -- TODO confirm).
        Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)

        # Z = L_G^T * Z * L_A
        # This is equivalent to the following computation from the original
        # pseudo-code:
        #   Z = G0^(-1/2) * Z * A0^(-1/2)
        #   Z = U_G^T * Z * U_A
        Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)

        # Z = Z .* Y
        Z *= Y

        # Z = L_G * Z * L_A^T
        # This is equivalent to the following computation from the original
        # pseudo-code:
        #   Z = U_G * Z * U_A^T
        #   Z = G0^(-1/2) * Z * A0^(-1/2)
        Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))

    elif self._option == SeriesFBApproximation.option2:

        # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
        # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
        P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
        P_G, K_G, mu_G = self._output_factor.get_option2quants(
            self._damping_output)

        # Our approach differs superficially from the pseudo-code in the
        # paper in order to reduce the total number of matrix-matrix
        # multiplies. In particular, the first three computations in the
        # pseudo code are
        #   Z = G0^(-1/2) * Z * A0^(-1/2)
        #   Z = Z - hPsi_G^T * Z * hPsi_A
        #   Z = E_G^T * Z * E_A
        # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
        #   C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
        # the entire computation can be written as
        #   Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
        #       - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
        #     = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
        #       - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
        #     = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
        #       - E_G^T * G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
        #     = K_G^T * Z * K_A - K_G^T * P_G * Z * P_A^T * K_A
        # This final expression is computed by the following two lines:
        # Z = Z - P_G * Z * P_A^T
        Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))

        # Z = K_G^T * Z * K_A
        Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)

        # Z = Z ./ (1*1^T - mu_G*mu_A^T)
        # Be careful with the outer product. We don't want to accidentally
        # make it an inner-product instead.
        tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
        # Prevent some numerical issues by setting any 0.0 eigs to 1.0
        # (adds 1.0 exactly where tmp == 0.0, so the division below never
        # divides by zero).
        tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
        Z /= tmp

        # We now perform the transpose/reverse version of the operations
        # derived above, whose derivation from the original pseudo-code is
        # analogous.
        # Z = K_G * Z * K_A^T
        Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))

        # Z = Z - P_G^T * Z * P_A
        Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)

        # Z = normalize (1/E[T]) * Z
        # Note that this normalization is done because we compute the
        # statistics by averaging, not summing, over time. (And the gradient
        # is presumably summed over time, not averaged, and thus their scales
        # are different.)
        # NOTE(review): this scaling is applied in the option2 branch only;
        # option1's gamma() already contains the factor T -- confirm this
        # placement is intended.
        Z /= math_ops.cast(self._num_timesteps, Z.dtype)

    # Convert back to the "batch_dim==0" orientation.
    Z = array_ops.transpose(Z)

    return utils.mat2d_to_layer_params(vector, Z)
def multiply(self, vector):
    """Multiply `vector` element-wise by the damped factor covariance.

    The parameters are flattened to a 2-D matrix, scaled element-wise (`*`)
    by `self._factor.get_cov() + self._damping`, and converted back.

    Args:
      vector: Layer parameters in whatever structure
        `utils.layer_params_to_mat2d` accepts.

    Returns:
      The scaled parameters, converted back by
      `utils.mat2d_to_layer_params` using `vector` as the structural
      template.
    """
    flat_params = utils.layer_params_to_mat2d(vector)
    scale = self._factor.get_cov() + self._damping
    return utils.mat2d_to_layer_params(vector, flat_params * scale)