def testGradientFloat16(self):
  with self.test_session(use_gpu=True) as sess:
    # Randomly construct a 1D shape from [1, 40)
    shape = random_ops.random_uniform(
        [1], minval=1, maxval=40, dtype=dtypes.int32)

    # Construct the fp32 graph and its gradient.
    x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
    y1 = nn_ops.relu(x, name="relu_fp32")
    l1 = nn_ops.l2_loss(y1)
    dx_f32 = gradients_impl.gradients(l1, x)

    # Construct the fp16 graph and its gradient.
    # It starts with the same x, in fp32. But before it reaches Relu, it is
    # cast into fp16. So during backprop, the gradient computation is in fp16.
    x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
    y2 = nn_ops.relu(x2, name="relu_fp16")
    l2 = nn_ops.l2_loss(y2)
    dx_f16 = gradients_impl.gradients(l2, x)

    # Repeat the experiment 100 times. All tensor shapes and their values
    # are randomly generated for each run.
    for _ in xrange(100):
      dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
      self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
  with self.cached_session(use_gpu=use_gpu):
    if data_format == "NCHW":
      np_input = self._NHWCToNCHW(np_input)
    input_tensor = constant_op.constant(
        np_input, shape=np_input.shape, dtype=dtype)
    bias_tensor = constant_op.constant(bias, shape=bias.shape, dtype=dtype)
    output_tensor = nn_ops.bias_add(
        input_tensor, bias_tensor, data_format=data_format)
    tensor_jacob_t, tensor_jacob_n = gradient_checker.compute_gradient(
        input_tensor, np_input.shape, output_tensor, np_input.shape)
    bias_jacob_t, bias_jacob_n = gradient_checker.compute_gradient(
        bias_tensor, bias.shape, output_tensor, np_input.shape)

    # Test gradient of BiasAddGrad
    bias_add_grad = gradients_impl.gradients(
        nn_ops.l2_loss(output_tensor), bias_tensor)[0]
    grad_jacob_t, grad_jacob_n = gradient_checker.compute_gradient(
        output_tensor, np_input.shape, bias_add_grad, bias.shape)

    if dtype == np.float16:
      # Compare fp16 theoretical gradients to fp32 numerical gradients,
      # since fp16 numerical gradients are too imprecise unless great
      # care is taken with choosing the inputs and the delta. This is
      # a weaker check (in particular, it does not test the op itself,
      # only its gradient), but it's much better than nothing.
      input_tensor = constant_op.constant(
          np_input, shape=np_input.shape, dtype=np.float32)
      bias_tensor = constant_op.constant(
          bias, shape=bias.shape, dtype=np.float32)
      output_tensor = nn_ops.bias_add(
          input_tensor, bias_tensor, data_format=data_format)
      _, tensor_jacob_n = gradient_checker.compute_gradient(
          input_tensor, np_input.shape, output_tensor, np_input.shape)
      _, bias_jacob_n = gradient_checker.compute_gradient(
          bias_tensor, bias.shape, output_tensor, np_input.shape)
      bias_add_grad = gradients_impl.gradients(
          nn_ops.l2_loss(output_tensor), bias_tensor)[0]
      _, grad_jacob_n = gradient_checker.compute_gradient(
          output_tensor, np_input.shape, bias_add_grad, bias.shape)

    threshold = 2e-3
    if dtype == dtypes.float64:
      threshold = 1e-10
    self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
    self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
    self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
  with self.test_session(use_gpu=use_gpu):
    if data_format == "NCHW":
      np_input = self._NHWCToNCHW(np_input)
    input_tensor = constant_op.constant(
        np_input, shape=np_input.shape, dtype=dtype)
    bias_tensor = constant_op.constant(bias, shape=bias.shape, dtype=dtype)
    output_tensor = nn_ops.bias_add(
        input_tensor, bias_tensor, data_format=data_format)
    tensor_jacob_t, tensor_jacob_n = gradient_checker.compute_gradient(
        input_tensor, np_input.shape, output_tensor, np_input.shape)
    bias_jacob_t, bias_jacob_n = gradient_checker.compute_gradient(
        bias_tensor, bias.shape, output_tensor, np_input.shape)

    # Test gradient of BiasAddGrad
    bias_add_grad = gradients_impl.gradients(
        nn_ops.l2_loss(output_tensor), bias_tensor)[0]
    grad_jacob_t, grad_jacob_n = gradient_checker.compute_gradient(
        output_tensor, np_input.shape, bias_add_grad, bias.shape)

    if dtype == np.float16:
      # Compare fp16 theoretical gradients to fp32 numerical gradients,
      # since fp16 numerical gradients are too imprecise unless great
      # care is taken with choosing the inputs and the delta. This is
      # a weaker check (in particular, it does not test the op itself,
      # only its gradient), but it's much better than nothing.
      input_tensor = constant_op.constant(
          np_input, shape=np_input.shape, dtype=np.float32)
      bias_tensor = constant_op.constant(
          bias, shape=bias.shape, dtype=np.float32)
      output_tensor = nn_ops.bias_add(
          input_tensor, bias_tensor, data_format=data_format)
      _, tensor_jacob_n = gradient_checker.compute_gradient(
          input_tensor, np_input.shape, output_tensor, np_input.shape)
      _, bias_jacob_n = gradient_checker.compute_gradient(
          bias_tensor, bias.shape, output_tensor, np_input.shape)
      bias_add_grad = gradients_impl.gradients(
          nn_ops.l2_loss(output_tensor), bias_tensor)[0]
      _, grad_jacob_n = gradient_checker.compute_gradient(
          output_tensor, np_input.shape, bias_add_grad, bias.shape)

    threshold = 2e-3
    if dtype == dtypes.float64:
      threshold = 1e-10
    self.assertAllClose(tensor_jacob_t, tensor_jacob_n, threshold, threshold)
    # TODO(annarev): Re-add assertion for float16, float32 dtypes and NCHW
    # once we figure out why this check started failing with cuda mavx.
    if dtype == dtypes.float64 or data_format != "NCHW":
      self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
      self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
def testL2LossOp(self, tf_quantization_mode):
  root = tracking.AutoTrackable()
  root.l2_loss_func = def_function.function(
      lambda x: nn_ops.l2_loss(x))  # pylint: disable=unnecessary-lambda
  input_data = tf.range(4, dtype=tf.float32)
  concrete_func = root.l2_loss_func.get_concrete_function(input_data)

  converter = lite.TFLiteConverterV2.from_concrete_functions(
      [concrete_func], root)
  converter._experimental_tf_quantization_mode = tf_quantization_mode
  tflite_model = converter.convert()
  self.assertTrue(tflite_model)
  self.assertIn('FlexL2Loss', tflite_test_util.get_ops_list(tflite_model))

  # Check that the model works.
  interpreter = Interpreter(model_content=tflite_model)
  interpreter.allocate_tensors()
  input_details = interpreter.get_input_details()
  test_input = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
  interpreter.set_tensor(input_details[0]['index'], test_input)
  interpreter.invoke()

  output_details = interpreter.get_output_details()
  # l2_loss([1, 2, 3, 4]) = (1 + 4 + 9 + 16) / 2 = 15.
  expected_output = np.array([15.0], dtype=np.float32)
  output_data = interpreter.get_tensor(output_details[0]['index'])
  self.assertTrue((expected_output == output_data).all())
def testL2Loss(self):
  for dtype in [dtypes.float32, dtypes.float64]:
    x = constant_op.constant(
        [1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x", dtype=dtype)
    l2loss = nn_ops.l2_loss(x)
    value = self.evaluate(l2loss)
    self.assertAllClose(7.0, value)
def testL2Loss(self):
  for dtype in [dtypes.float32, dtypes.float64]:
    with self.test_session():
      x = constant_op.constant(
          [1.0, 0.0, 3.0, 2.0], shape=[2, 2], name="x", dtype=dtype)
      l2loss = nn_ops.l2_loss(x)
      value = l2loss.eval()
      self.assertAllClose(7.0, value)
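# For reference, the 7.0 expected by the two testL2Loss variants above follows
# directly from the definition l2_loss(x) = sum(x**2) / 2. A minimal NumPy
# sketch (a standalone illustration, not part of the original test suite):
import numpy as np

def l2_loss_reference(x):
  """NumPy reference for tf.nn.l2_loss: half the sum of squares."""
  x = np.asarray(x, dtype=np.float64)
  return np.sum(np.square(x)) / 2.0

assert l2_loss_reference([1.0, 0.0, 3.0, 2.0]) == 7.0  # (1 + 0 + 9 + 4) / 2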
def _computeGradient(self, np_input, bias, dtype, data_format):
  input_shape = output_shape = np_input.shape
  bias_shape = bias.shape
  input_tensor = constant_op.constant(np_input, shape=input_shape, dtype=dtype)
  bias_tensor = constant_op.constant(bias, shape=bias_shape, dtype=dtype)

  if context.executing_eagerly():

    def bias_add(input_tensor, bias_tensor):
      return nn_ops.bias_add(input_tensor, bias_tensor,
                             data_format=data_format)

    # The following is a work-around for TF issue 33660. Instead of
    # calculating the analytical and numerical gradients for both
    # inputs in a single call to compute_gradient, compute_gradient
    # is called for each input separately.
    def bias_add_1(input_tensor):
      return bias_add(input_tensor, bias_tensor)

    def bias_add_2(bias_tensor):
      return bias_add(input_tensor, bias_tensor)

    input_jacob_a, input_jacob_n = gradient_checker_v2.compute_gradient(
        bias_add_1, [input_tensor])
    bias_jacob_a, bias_jacob_n = gradient_checker_v2.compute_gradient(
        bias_add_2, [bias_tensor])

    # Test gradient of BiasAddGrad
    def bias_add_grad_function(upstream_gradients):
      with backprop.GradientTape() as tape:
        tape.watch(bias_tensor)
        bias_add_output = bias_add(input_tensor, bias_tensor)
        gradient_injector_output = bias_add_output * upstream_gradients
        return tape.gradient(gradient_injector_output, bias_tensor)

    upstream_tensor = self._random_tensor(output_shape, dtype)
    grad_jacob_a, grad_jacob_n = gradient_checker_v2.compute_gradient(
        bias_add_grad_function, [upstream_tensor])
  else:
    output_tensor = nn_ops.bias_add(
        input_tensor, bias_tensor, data_format=data_format)
    jacobians = gradient_checker.compute_gradient(
        [input_tensor, bias_tensor], [input_shape, bias_shape],
        output_tensor, output_shape)
    (input_jacob_a, input_jacob_n), (bias_jacob_a, bias_jacob_n) = jacobians

    # Test gradient of BiasAddGrad
    bias_add_grad = gradients_impl.gradients(
        nn_ops.l2_loss(output_tensor), bias_tensor)[0]
    grad_jacob_a, grad_jacob_n = gradient_checker.compute_gradient(
        output_tensor, output_shape, bias_add_grad, bias_shape)

  return ((input_jacob_a, bias_jacob_a, grad_jacob_a),
          (input_jacob_n, bias_jacob_n, grad_jacob_n))
def testGradient(self):
  x_shape = [20, 7, 3]
  np.random.seed(1)  # Make it reproducible.
  x_val = np.random.random_sample(x_shape).astype(np.float64)
  with self.test_session():
    x = constant_op.constant(x_val, name="x")
    output = nn_ops.l2_loss(x)
    err = gradient_checker.compute_gradient_error(x, x_shape, output, [1])
  print("L2Loss gradient err = %g " % err)
  err_tolerance = 1e-11
  self.assertLess(err, err_tolerance)
def testFlexWithAutomaticPassThrough(self):
  # Create a graph that has one L2Loss op.
  with ops.Graph().as_default():
    with session.Session() as sess:
      in_tensor = array_ops.placeholder(
          shape=[4], dtype=dtypes.float32, name='input')
      out_tensor = nn_ops.l2_loss(in_tensor)

      converter = lite.TFLiteConverter.from_session(
          sess, [in_tensor], [out_tensor])
      converter.target_spec.supported_ops = set([lite.OpsSet.SELECT_TF_OPS])
      converter._experimental_allow_all_select_tf_ops = True
      tflite_model = converter.convert()
  self.assertTrue(tflite_model)
  self.assertIn('FlexL2Loss', tflite_test_util.get_ops_list(tflite_model))
def _center_loss(logit, labels, alpha, lam, num_classes, dtype=dtypes.float32):
  """Compute the center loss and update the centers.

  Follows "A Discriminative Feature Learning Approach for Deep Face
  Recognition", ECCV 2016.

  :param logit: output of the NN fully connected layer,
      a [batch_size, feature_dimension] tensor
  :param labels: true label of every sample, a [batch_size] tensor
      (not one-hot encoded)
  :param alpha: learning rate controlling the speed of center updates,
      a float in (0, 1)
  :param lam: weight of the center loss relative to the softmax (or other) loss
  :param num_classes: number of classes, int
  :param dtype: dtype of the centers variable
  :return:
      loss: the computed center loss
      centers: tensor of all centers, [num_classes, feature_dimension]
      centers_update_op: an op that should be run during training to
          update the centers
  """
  # get the feature dimension (must be statically known for get_variable)
  fea_dimension = logit.get_shape()[1]
  # initialize the centers
  centers = variable_scope.get_variable(
      'centers', [num_classes, fea_dimension],
      dtype=dtype,
      initializer=init_ops.constant_initializer(0),
      trainable=False)
  labels = array_ops.reshape(labels, [-1])
  # get the centers for the current batch
  centers_batch = array_ops.gather(centers, labels)
  # compute the l2 loss
  loss = nn_ops.l2_loss(logit - centers_batch) * lam
  # compute the difference between each sample and its corresponding center
  diff = centers_batch - logit
  # compute the delta for each corresponding center
  unique_label, unique_idx, unique_count = array_ops.unique_with_counts(labels)
  appear_times = array_ops.gather(unique_count, unique_idx)
  appear_times = array_ops.reshape(appear_times, [-1, 1])
  delta_centers = diff / math_ops.cast(1 + appear_times, dtype)
  delta_centers = delta_centers * alpha
  # update the centers
  centers_update_op = state_ops.scatter_sub(centers, labels, delta_centers)
  return loss, centers, centers_update_op
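# A minimal usage sketch for _center_loss above. The input pipeline, layer
# sizes, and hyperparameters here are illustrative assumptions, not part of
# the original; in practice the center loss is added to a softmax loss, as in
# the ECCV 2016 paper.
graph = ops.Graph()
with graph.as_default():
  inputs = array_ops.placeholder(dtypes.float32, [None, 784])
  labels = array_ops.placeholder(dtypes.int64, [None])
  # A stand-in trainable embedding layer producing the 'logit' features.
  weights = variable_scope.get_variable('embed_w', [784, 128])
  features = math_ops.matmul(inputs, weights)
  loss, centers, centers_update_op = _center_loss(
      features, labels, alpha=0.5, lam=0.003, num_classes=10)
  # The center-update op must run on every training step.
  with ops.control_dependencies([centers_update_op]):
    train_op = gradient_descent.GradientDescentOptimizer(0.01).minimize(loss)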
def BuildFullModel():
  """Build the full model with conv, rnn, opt."""
  seq = []
  for i in range(4):
    with variable_scope.variable_scope('inp_%d' % i):
      seq.append(array_ops.reshape(BuildSmallModel(), [2, 1, -1]))
  cell = rnn_cell.BasicRNNCell(16)
  out = rnn.dynamic_rnn(
      cell, array_ops.concat(seq, axis=1), dtype=dtypes.float32)[0]

  target = array_ops.ones_like(out)
  loss = nn_ops.l2_loss(math_ops.reduce_mean(target - out))
  sgd_op = gradient_descent.GradientDescentOptimizer(1e-2)
  return sgd_op.minimize(loss)
def global_norm(t_list, name=None):
  """Computes the global norm of multiple tensors.

  Given a tuple or list of tensors `t_list`, this operation returns the
  global norm of the elements in all tensors in `t_list`. The global norm is
  computed as:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  Any entries in `t_list` that are of type None are ignored.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    name: A name for the operation (optional).

  Returns:
    A 0-D (scalar) `Tensor` of type `float`.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  if (not isinstance(t_list, collections.Sequence)
      or isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  with ops.op_scope(t_list, name, "global_norm") as name:
    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i) if t is not None else t
        for i, t in enumerate(t_list)
    ]
    half_squared_norms = []
    for v in values:
      if v is not None:
        with ops.colocate_with(v):
          half_squared_norms.append(nn_ops.l2_loss(v))

    half_squared_norm = math_ops.reduce_sum(
        array_ops.pack(half_squared_norms))
    norm = math_ops.sqrt(
        half_squared_norm *
        constant_op.constant(2.0, dtype=half_squared_norm.dtype),
        name="global_norm")
  return norm
def global_norm(t_list, name=None):
  """Computes the global norm of multiple tensors.

  Given a tuple or list of tensors `t_list`, this operation returns the
  global norm of the elements in all tensors in `t_list`. The global norm is
  computed as:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  Any entries in `t_list` that are of type None are ignored.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    name: A name for the operation (optional).

  Returns:
    A 0-D (scalar) `Tensor` of type `float`.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  if (not isinstance(t_list, collections.Sequence)
      or isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  with ops.name_scope(name, "global_norm", t_list) as name:
    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i) if t is not None else t
        for i, t in enumerate(t_list)
    ]
    half_squared_norms = []
    for v in values:
      if v is not None:
        with ops.colocate_with(v):
          half_squared_norms.append(nn_ops.l2_loss(v))

    half_squared_norm = math_ops.reduce_sum(array_ops.pack(half_squared_norms))
    norm = math_ops.sqrt(
        half_squared_norm *
        constant_op.constant(2.0, dtype=half_squared_norm.dtype),
        name="global_norm")
  return norm
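# The two global_norm variants above differ only in the scoping API (the older
# ops.op_scope versus the newer ops.name_scope signature); the math is
# identical. As a sanity check on the docstring formula, a NumPy sketch
# (a standalone illustration, not from the original source):
import numpy as np

def global_norm_reference(arrays):
  """NumPy reference: sqrt of the summed squared L2 norms of all entries."""
  return np.sqrt(sum(np.sum(np.square(a)) for a in arrays if a is not None))

a = np.array([3.0, 4.0])       # L2 norm 5
b = np.array([[0.0], [12.0]])  # L2 norm 12
assert global_norm_reference([a, None, b]) == 13.0  # sqrt(25 + 144)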
def grad(x):
  with backprop.GradientTape() as tape:
    tape.watch(x)
    y = nn_ops.l2_loss(nn_ops.relu(x))
  return tape.gradient(y, x)
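# Analytically, d/dx l2_loss(relu(x)) = relu(x) * relu'(x) = relu(x): the 1/2
# in l2_loss cancels the 2 from differentiating the square, and the ReLU mask
# is absorbed because relu(x) is already zero wherever x <= 0. A hedged
# eager-mode check (assumes TF 2.x eager execution and the imports used by
# the surrounding test file):
x = constant_op.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
g = grad(x)
# relu(x) = [0, 0, 0, 0.5, 2]; the gradient matches it elementwise.
assert np.allclose(g.numpy(), [0.0, 0.0, 0.0, 0.5, 2.0])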
def __init__(self,
             sess,
             num_units,
             num_actions,
             batch_size,
             learning_rate,
             tau,
             gamma,
             activation=None,
             batch_norm=True,
             optimizer_name="adam",
             max_gradient_norm=5.0,
             critic_scope=None,
             target_scope=None):
  if activation and not callable(activation):
    raise TypeError("Expected `activation` to be callable")

  self._sess = sess
  self._num_units = num_units
  self._num_actions = num_actions
  self._learning_rate = learning_rate
  self._tau = tau
  self._gamma = gamma
  self._activation = activation
  self._batch_norm = batch_norm
  self._critic_scope = critic_scope
  self._target_scope = target_scope

  # Create the critic network
  (critic_inputs, critic_actions, critic_outputs, critic_params,
   critic_normalized_inputs) = self._build_network(critic_scope)

  # Target network
  (target_inputs, target_actions, target_outputs, target_params,
   target_normalized_inputs) = self._build_network(target_scope)

  # Op for periodically updating the target network
  # with the online network weights
  target_update_op = update_target_params(
      params=critic_params, target_params=target_params, tau=tau)

  # Network target (y_t):
  # y_t = reward_t + gamma * Q'(s_{t+1}, u'(s_{t+1}))
  TD_target = array_ops.placeholder(
      dtype=dtypes.float32, shape=[None, 1], name="TD_target")

  # Define the loss and optimization op:
  # loss = sum((y_t - Q(s_t, a_t))^2) / 2, averaged over the batch
  # (nn_ops.l2_loss already includes the factor of 1/2).
  critic_loss = nn_ops.l2_loss(TD_target - critic_outputs)
  critic_loss = math_ops.div(critic_loss, batch_size)

  # Gradients
  critic_gradients = gradients_impl.gradients(critic_loss, critic_params)
  # Clip gradients
  if max_gradient_norm:
    critic_gradients, _, _ = tf_utils.gradient_clip(
        critic_gradients, max_gradient_norm=max_gradient_norm)
  # Optimization
  optimizer = create_optimizer(optimizer_name, learning_rate)
  optimization_op = optimizer.apply_gradients(
      zip(critic_gradients, critic_params))

  # Get the gradient of the net w.r.t. the action.
  # For each action in the minibatch (i.e., for each x in xs),
  # this will sum up the gradients of each critic output in the minibatch
  # w.r.t. that action. Each output is independent of all
  # actions except for one.
  action_grads = gradients_impl.gradients(critic_outputs, critic_actions)

  self._critic_params = critic_params
  self._critic_inputs = critic_inputs
  self._critic_actions = critic_actions
  self._critic_outputs = critic_outputs
  self._critic_normalized_inputs = critic_normalized_inputs
  self._target_params = target_params
  self._target_inputs = target_inputs
  self._target_actions = target_actions
  self._target_outputs = target_outputs
  self._target_normalized_inputs = target_normalized_inputs
  self._TD_target = TD_target
  self._critic_loss = critic_loss
  self._target_update_op = target_update_op
  self._optimization_op = optimization_op
  self._action_grads = action_grads
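# A worked example of the TD target used in the critic above (hypothetical
# numbers, not from the original source): y_t = r_t + gamma * Q'(s_{t+1},
# u'(s_{t+1})), computed per sample over a minibatch.
import numpy as np

rewards = np.array([[1.0], [0.0]])
target_q = np.array([[2.0], [3.0]])  # Q' evaluated on the target actor's actions
gamma = 0.99
td_target = rewards + gamma * target_q
assert np.allclose(td_target, [[2.98], [2.97]])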
def _build_net(self):
  """Build the network for test."""
  # network definition
  x = constant_op.constant(1.0, shape=[1, 28, 28, 3], dtype=dtypes.float32)
  is_training = ops.convert_to_tensor(True)

  # conv1
  x = self._conv2d(x, 'conv1', self.conv1_kernel, self.conv1_bias)
  x = self._batchnorm(x, 'bn1', self.bn1_gamma, self.bn1_beta,
                      self.bn1_moving_mean, self.bn1_moving_variance,
                      is_training)
  x = nn_ops.relu(x)

  # conv2a
  residual_x = self._conv2d(x, 'conv2a_1', self.conv2a_1_kernel,
                            self.conv2a_1_bias)
  residual_x = self._batchnorm(residual_x, 'bn2a_1', self.bn2a_1_gamma,
                               self.bn2a_1_beta, self.bn2a_1_moving_mean,
                               self.bn2a_1_moving_variance, is_training)
  residual_x = nn_ops.relu(residual_x)
  card_in = gap_finetune.split(
      residual_x,
      num_or_size_splits=4,
      axis=-1,
      gap=self.gap,
      gap_vars=self.gap_vars)
  card_out = []
  for i in range(4):
    out = self._conv2d(card_in[i], 'conv2a_2_%d' % i,
                       self.conv2a_2_kernel[i], self.conv2a_2_bias[i])
    card_out.append(out)
  residual_x = array_ops.concat(card_out, axis=-1)
  residual_x = self._batchnorm(residual_x, 'bn2a_2', self.bn2a_2_gamma,
                               self.bn2a_2_beta, self.bn2a_2_moving_mean,
                               self.bn2a_2_moving_variance, is_training)
  residual_x = nn_ops.relu(residual_x)
  residual_x = self._conv2d(residual_x, 'conv2a_3', self.conv2a_3_kernel,
                            self.conv2a_3_bias)
  residual_x = self._batchnorm(residual_x, 'bn2a_3', self.bn2a_3_gamma,
                               self.bn2a_3_beta, self.bn2a_3_moving_mean,
                               self.bn2a_3_moving_variance, is_training)

  # conv2b
  shortcut_x = self._conv2d(x, 'conv2b', self.conv2b_kernel, self.conv2b_bias)
  shortcut_x = self._batchnorm(shortcut_x, 'bn2b', self.bn2b_gamma,
                               self.bn2b_beta, self.bn2b_moving_mean,
                               self.bn2b_moving_variance, is_training)
  x = nn_ops.relu(residual_x + shortcut_x)

  # conv3
  x = self._conv2d(x, 'conv3', self.conv3_kernel, self.conv3_bias)
  x = self._batchnorm(x, 'bn3', self.bn3_gamma, self.bn3_beta,
                      self.bn3_moving_mean, self.bn3_moving_variance,
                      is_training)
  conv_output = nn_ops.relu(x)

  # loss
  self.loss = nn_ops.l2_loss(conv_output)
def l2norm_squared(v):
  return constant_op.constant(2, dtype=v.dtype.base_dtype) * nn_ops.l2_loss(v)
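# Since l2_loss(v) = sum(v**2) / 2, l2norm_squared above recovers the plain
# sum of squares, i.e. the squared L2 norm. A small eager-mode check
# (a standalone sketch, assuming TF 2.x eager execution):
v = constant_op.constant([3.0, 4.0])
# 2 * l2_loss(v) = 2 * (9 + 16) / 2 = 25 = ||v||**2.
assert float(l2norm_squared(v)) == 25.0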