Example #1
    def test_interpolation_gradient(self):
        """Make sure that backprop can run. Correctness of gradients is assumed.

    Here, we create a small 'training' set and a more densely-sampled
    set of query points, for which we know the true value in advance. The goal
    is to choose x locations for the training data such that interpolating using
    this training data yields the best reconstruction for the function
    values at the query points. The training data locations are optimized
    iteratively using gradient descent.
    """
        tp = _QuadraticPlusSinProblemND()
        (query_points, query_values, train_points,
         train_values) = tp.get_problem(optimizable=True)

        regularization = 0.001
        for interpolation_order in (1, 2, 3, 4):
            interpolator = interpolate_spline.interpolate_spline(
                train_points, train_values, query_points, interpolation_order,
                regularization)

            loss = math_ops.reduce_mean(
                math_ops.square(query_values - interpolator))

            optimizer = momentum.MomentumOptimizer(0.001, 0.9)
            grad = gradients.gradients(loss, [train_points])
            grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
            opt_func = optimizer.apply_gradients(zip(grad, [train_points]))
            init_op = variables.global_variables_initializer()

            with self.test_session() as sess:
                sess.run(init_op)
                for _ in range(100):
                    sess.run([loss, opt_func])
Example #2
def train_step(model,
               optimizer,
               dataset,
               step_counter,
               ep,
               class_weights,
               params,
               log_interval=None):

    start = time.time()
    steps = 0
    total_loss = 0
    # params: hyperparameter dict; expects params['clip_norm'] and params['train_losses'].
    for step, ((days, prices, day_lens, news_lens),
               labels) in enumerate(dataset):
        steps += 1
        with tf.GradientTape() as tape:
            logits = model(days, prices, day_lens, news_lens, training=True)
            loss_value = loss(logits, labels, class_weights)
            total_loss += loss_value

        grads = tape.gradient(loss_value, model.trainable_weights)
        grads, _ = clip_ops.clip_by_global_norm(grads, params['clip_norm'])

        optimizer.apply_gradients(zip(grads, model.trainable_weights),
                                  global_step=step_counter)

    params['train_losses'].append(total_loss / steps)
Example #3
  def testThatBackpropRuns(self):
    """Run optimization to ensure that gradients can be computed."""

    batch_size = 1
    image_height = 9
    image_width = 12
    image = variables.Variable(
        np.float32(
            np.random.uniform(size=[batch_size, image_height, image_width, 3])))
    control_point_locations = [[3., 3.]]
    control_point_locations = constant_op.constant(
        np.float32(np.expand_dims(control_point_locations, 0)))
    control_point_displacements = [[0.25, -0.5]]
    control_point_displacements = constant_op.constant(
        np.float32(np.expand_dims(control_point_displacements, 0)))
    warped_image, _ = sparse_image_warp.sparse_image_warp(
        image,
        control_point_locations,
        control_point_locations + control_point_displacements,
        num_boundary_points=3)

    loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
    grad = gradients.gradients(loss, [image])
    grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
    opt_func = optimizer.apply_gradients(zip(grad, [image]))
    init_op = variables.global_variables_initializer()

    with self.test_session() as sess:
      sess.run(init_op)
      for _ in range(5):
        sess.run([loss, opt_func])
Example #4
    def _get_train_ops(self, features, targets):
        """See base class."""
        global_step = contrib_variables.get_global_step()
        assert global_step
        logits = self._logits(features, is_training=True)
        with ops.control_dependencies([
                self._centered_bias_step(targets,
                                         self._get_weight_tensor(features))
        ]):
            loss = self._loss(logits, targets,
                              self._get_weight_tensor(features))
        logging_ops.scalar_summary("loss", loss)

        linear_vars = self._get_linear_vars()
        dnn_vars = self._get_dnn_vars()
        grads = gradients.gradients(loss, dnn_vars + linear_vars)
        if self._gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads,
                                                    self._gradient_clip_norm)

        dnn_grads = grads[0:len(dnn_vars)]
        linear_grads = grads[len(dnn_vars):]

        train_ops = self._get_linear_training_ops(
            linear_grads, linear_vars) + self._get_dnn_training_ops(
                dnn_grads, dnn_vars)

        train_step = control_flow_ops.group(*train_ops,
                                            name="combined_training_op")
        with ops.control_dependencies([train_step]):
            with ops.get_default_graph().colocate_with(global_step):
                return state_ops.assign_add(global_step, 1).op, loss
Example #5
    def testThatBackpropRuns(self):
        """Run optimization to ensure that gradients can be computed."""

        batch_size = 1
        image_height = 9
        image_width = 12
        image = variables.Variable(
            np.float32(
                np.random.uniform(
                    size=[batch_size, image_height, image_width, 3])))
        control_point_locations = [[3., 3.]]
        control_point_locations = constant_op.constant(
            np.float32(np.expand_dims(control_point_locations, 0)))
        control_point_displacements = [[0.25, -0.5]]
        control_point_displacements = constant_op.constant(
            np.float32(np.expand_dims(control_point_displacements, 0)))
        warped_image, _ = sparse_image_warp.sparse_image_warp(
            image,
            control_point_locations,
            control_point_locations + control_point_displacements,
            num_boundary_points=3)

        loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
        optimizer = momentum.MomentumOptimizer(0.001, 0.9)
        grad = gradients.gradients(loss, [image])
        grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
        opt_func = optimizer.apply_gradients(zip(grad, [image]))
        init_op = variables.global_variables_initializer()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(5):
                sess.run([loss, opt_func])
Example #6
def gradients(opt, loss, vars, step, max_gradient_norm=None, dont_clip=()):
    '''
    Function for calculating and applying gradients on all trainable parameters
    '''
    gradients = opt.compute_gradients(loss, vars)
    if max_gradient_norm is not None:
        to_clip = [(g, v) for g, v in gradients if v.name not in dont_clip]
        not_clipped = [(g, v) for g, v in gradients if v.name in dont_clip]
        gradients, variables = zip(*to_clip)
        clipped_gradients, _ = clip_ops.clip_by_global_norm(
            gradients, max_gradient_norm)
        gradients = list(zip(clipped_gradients, variables)) + not_clipped

    # Add histograms for variables, gradients and gradient norms
    for gradient, variable in gradients:
        if isinstance(gradient, ops.IndexedSlices):
            grad_values = gradient.values
        else:
            grad_values = gradient
        if grad_values is None:
            print('warning: missing gradient: {}'.format(variable.name))
        if grad_values is not None:
            tf.summary.histogram(variable.name, variable)
            tf.summary.histogram(variable.name + '/gradients', grad_values)
            tf.summary.histogram(variable.name + '/gradient_norm',
                                 clip_ops.global_norm([grad_values]))

    return opt.apply_gradients(gradients, global_step=step)
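A minimal sketch of how this helper might be wired into a TF 1.x graph; the toy variables, loss, optimizer, and the decision to exempt bias-like variables from clipping are illustrative assumptions, not part of the original snippet.

# Hypothetical TF 1.x graph-mode usage of the gradients() helper above.
import tensorflow as tf

w = tf.get_variable('w', shape=[3], initializer=tf.zeros_initializer())
b = tf.get_variable('b_bias', shape=[1], initializer=tf.zeros_initializer())
total_loss = tf.reduce_sum(tf.square(w)) + tf.reduce_sum(tf.square(b))

opt = tf.train.AdamOptimizer(1e-3)
global_step = tf.train.get_or_create_global_step()
trainable = tf.trainable_variables()
# Clip all gradients by global norm, except those of variables listed in dont_clip.
train_op = gradients(opt, total_loss, trainable, global_step,
                     max_gradient_norm=5.0,
                     dont_clip=[v.name for v in trainable if 'bias' in v.name])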
Example #7
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
    """Clips gradients of a multitask loss by their global norm.

  Ignores all-zero tensors when computing the global norm.

  Args:
    gradients_variables: a list of pairs (gradient, variable).
    clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

  Returns:
    list: A list of pairs of the same type as gradients_variables.
    fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
  """
    gradients, variables = six.moves.zip(*gradients_variables)

    def _replace_nonexisting_grad(grad):
        if grad is None:
            return grad
        all_zeros = _is_all_zeros(grad)
        return control_flow_ops.cond(
            all_zeros,
            lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
            lambda: grad)

    nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
    fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
    gradients, _ = clip_ops.clip_by_global_norm(gradients,
                                                clip_norm,
                                                use_norm=fixed_global_norm)
    return list(six.moves.zip(gradients, variables)), fixed_global_norm
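A minimal usage sketch for clip_gradients_by_global_norm, assuming TF 1.x graph mode; the toy variable, loss, and optimizer below are illustrative assumptions.

import tensorflow as tf

w = tf.Variable([1.0, 2.0, 3.0])
loss = tf.reduce_sum(tf.square(w))
opt = tf.train.GradientDescentOptimizer(0.1)

grads_and_vars = opt.compute_gradients(loss, [w])
# Returns the clipped (gradient, variable) pairs plus the global norm used for
# clipping (all-zero gradient tensors are ignored when computing that norm).
clipped_gvs, global_norm = clip_gradients_by_global_norm(grads_and_vars, clip_norm=20.)
train_op = opt.apply_gradients(clipped_gvs)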
Example #8
  def test_interpolation_gradient(self):
    """Make sure that backprop can run. Correctness of gradients is assumed.

    Here, we create a small 'training' set and a more densely-sampled
    set of query points, for which we know the true value in advance. The goal
    is to choose x locations for the training data such that interpolating using
    this training data yields the best reconstruction for the function
    values at the query points. The training data locations are optimized
    iteratively using gradient descent.
    """
    tp = _QuadraticPlusSinProblemND()
    (query_points, query_values, train_points,
     train_values) = tp.get_problem(optimizable=True)

    regularization = 0.001
    for interpolation_order in (1, 2, 3, 4):
      interpolator = interpolate_spline.interpolate_spline(
          train_points, train_values, query_points, interpolation_order,
          regularization)

      loss = math_ops.reduce_mean(math_ops.square(query_values - interpolator))

      optimizer = momentum.MomentumOptimizer(0.001, 0.9)
      grad = gradients.gradients(loss, [train_points])
      grad, _ = clip_ops.clip_by_global_norm(grad, 1.0)
      opt_func = optimizer.apply_gradients(zip(grad, [train_points]))
      init_op = variables.global_variables_initializer()

      with self.cached_session() as sess:
        sess.run(init_op)
        for _ in range(100):
          sess.run([loss, opt_func])
Example #9
def train(model,
          optimizer,
          dataset,
          step_counter,
          ep,
          class_weights,
          log_interval=None):
    """Trains model on `dataset` using `optimizer`."""

    start = time.time()
    for step, ((days, day_lens, news_lens), labels) in enumerate(dataset):
        with tf.contrib.summary.record_summaries_every_n_global_steps(
                50, global_step=step_counter):
            # Record the operations used to compute the loss given the input,
            # so that the gradient of the loss with respect to the variables
            # can be computed.
            with tf.GradientTape() as tape:
                logits = model(days, day_lens, news_lens, training=True)
                loss_value = loss(logits, labels, class_weights)
                tf.contrib.summary.scalar('loss', loss_value)
                tf.contrib.summary.scalar('accuracy',
                                          compute_accuracy(logits, labels))
            grads = tape.gradient(loss_value, model.trainable_weights)
            grads, _ = clip_ops.clip_by_global_norm(grads,
                                                    model.flags.clip_norm)
            optimizer.apply_gradients(zip(grads, model.trainable_weights),
                                      global_step=step_counter)
            if log_interval and (step + 1) % log_interval == 0:
                rate = log_interval / (time.time() - start)
                print('Epoch #%d\tStep #%d\tLoss: %.6f (%.1f steps/sec)' %
                      (ep + 1, step, loss_value, rate))
                start = time.time()

            if ep == 0 and step == 0:
                print('#trainable_params', get_num_trainable_params(model))
Example #10
    def _get_train_ops(self, features, targets):
        """See base class."""
        global_step = contrib_variables.get_global_step()
        assert global_step
        logits = self._logits(features, is_training=True)
        if self._enable_centered_bias:
            centered_bias_step = [self._centered_bias_step(targets, features)]
        else:
            centered_bias_step = []
        with ops.control_dependencies(centered_bias_step):
            loss = self._loss(logits, targets, features)
        logging_ops.scalar_summary("loss", loss)

        linear_vars = self._get_linear_vars()
        dnn_vars = self._get_dnn_vars()
        grads = gradients.gradients(loss, dnn_vars + linear_vars)
        if self._gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, self._gradient_clip_norm)

        dnn_grads = grads[0 : len(dnn_vars)]
        linear_grads = grads[len(dnn_vars) :]

        train_ops = self._get_linear_training_ops(linear_grads, linear_vars) + self._get_dnn_training_ops(
            dnn_grads, dnn_vars
        )

        train_step = control_flow_ops.group(*train_ops, name="combined_training_op")
        with ops.control_dependencies([train_step]):
            with ops.get_default_graph().colocate_with(global_step):
                return state_ops.assign_add(global_step, 1).op, loss
Example #11
    def apply_update(self, optimizer, grads_and_vars):
        (grads, vars) = zip(*grads_and_vars)

        # Gradient clipping
        if CustomTrainer.GRADIENT_CLIP in self.train_hypers:
            grads, global_norm = clip_ops.clip_by_global_norm(
                grads, self.train_hypers[CustomTrainer.GRADIENT_CLIP])
        # Gradient noise
        if CustomTrainer.GRADIENT_NOISE in self.train_hypers:
            sigma_sqr = self.train_hypers[CustomTrainer.GRADIENT_NOISE]
            if CustomTrainer.GRADIENT_NOISE_DECAY in self.train_hypers:
                sigma_sqr /= tf.pow(
                    1.0 + tf.to_float(self.global_step),
                    self.train_hypers[CustomTrainer.GRADIENT_NOISE_DECAY])
            grads_tmp = []
            for g in grads:
                if g is not None:
                    noisy_grad = g + tf.sqrt(sigma_sqr) * tf.random_normal(
                        tf.shape(g))
                    grads_tmp.append(noisy_grad)
                else:
                    grads_tmp.append(g)
            grads = grads_tmp

        train_op = optimizer.apply_gradients(zip(grads, vars),
                                             global_step=self.global_step)
        return train_op
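The gradient-noise branch above anneals the noise variance over training; the following standalone sketch of that schedule uses made-up hyperparameter values purely for illustration.

import numpy as np

sigma_sqr0, decay = 0.01, 0.55  # illustrative GRADIENT_NOISE / GRADIENT_NOISE_DECAY values
for step in (0, 10, 100, 1000):
    sigma_sqr = sigma_sqr0 / (1.0 + step) ** decay
    # Each gradient element gets additive noise drawn from N(0, sigma_sqr),
    # i.e. a standard deviation of sqrt(sigma_sqr) that shrinks with the step count.
    print(step, np.sqrt(sigma_sqr))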
Example #12
def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
  """Clips gradients of a multitask loss by their global norm.
  Ignores all-zero tensors when computing the global norm.

  Args:
  gradients_variables: a list of pairs (gradient, variable).
  clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.

  Returns:
  list: A list of pairs of the same type as gradients_variables.
  fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
  """
  gradients, variables = six.moves.zip(*gradients_variables)
  def _replace_nonexisting_grad(grad):
    if grad is None:
      return grad
    all_zeros = _is_all_zeros(grad)
    return control_flow_ops.cond(all_zeros,
                                 lambda: array_ops.zeros(
                                     [], dtype=dtypes.as_dtype(grad.dtype)),
                                 lambda: grad)
  nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
  fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
  gradients, _ = clip_ops.clip_by_global_norm(gradients, clip_norm,
                                              use_norm=fixed_global_norm)
  return list(six.moves.zip(gradients, variables)), fixed_global_norm
Example #13
 def _train_op_fn(loss):
   global_step = training_util.get_global_step()
   my_vars = ops.get_collection(parent_scope)
   grads = gradients.gradients(loss, my_vars)
   if gradient_clip_norm:
     grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
   return (_get_optimizer(optimizer).apply_gradients(
       zip(grads, my_vars), global_step=global_step))
Example #14
 def _process_gradients(self, gradients_vars):
     """Process gradients (e.g. clipping) before applying them to weights."""
     with ops.name_scope('process_gradients'):
         gradients, variables = zip(*gradients_vars)
         if self._gradient_clipping_norm is not None:
             gradients, _ = clip_ops.clip_by_global_norm(
                 gradients, self._gradient_clipping_norm)
         return zip(gradients, variables)
Example #15
 def _train_op_fn(loss):
   global_step = contrib_variables.get_global_step()
   my_vars = ops.get_collection(parent_scope)
   grads = gradients.gradients(loss, my_vars)
   if gradient_clip_norm:
     grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
   return (_get_optimizer(optimizer).apply_gradients(
       zip(grads, my_vars), global_step=global_step))
Example #16
 def _process_gradients(self, gradients_vars):
   """Process gradients (e.g. clipping) before applying them to weights."""
   with ops.name_scope('process_gradients'):
     gradients, variables = zip(*gradients_vars)
     if self._gradient_clipping_norm is not None:
       gradients, _ = clip_ops.clip_by_global_norm(
           gradients, self._gradient_clipping_norm)
     return zip(gradients, variables)
Example #17
 def _train_op_fn(loss):
   global_step = contrib_variables.get_global_step()
   my_vars = ops.get_collection("linear")
   grads = gradients.gradients(loss, my_vars)
   if gradient_clip_norm:
     grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
   return (_get_optimizer(optimizer).apply_gradients(
       zip(grads, my_vars), global_step=global_step))
Example #18
 def testClipByGlobalNormPreservesDenseShape(self):
     dense_shape = (1, )
     slices = ops.IndexedSlices(constant_op.constant([1.0]),
                                constant_op.constant([0]),
                                dense_shape=dense_shape)
     ans, _ = clip_ops.clip_by_global_norm([slices], 1.0)
     modified_slices = ans[0]
     self.assertEqual(dense_shape, slices.dense_shape)
     self.assertEqual(dense_shape, modified_slices.dense_shape)
Example #19
 def testClipByGlobalNormPreservesDenseShape(self):
   dense_shape = (1,)
   slices = ops.IndexedSlices(
       constant_op.constant([1.0]),
       constant_op.constant([0]),
       dense_shape=dense_shape)
   ans, _ = clip_ops.clip_by_global_norm([slices], 1.0)
   modified_slices = ans[0]
   self.assertEqual(dense_shape, slices.dense_shape)
   self.assertEqual(dense_shape, modified_slices.dense_shape)
Example #20
    def __init__(self,
                 loss,
                 global_step,
                 optimizer,
                 learning_rate,
                 clip_gradients=5.0):
        """Build a trainer part of graph.

        Args:
          loss: Tensor that evaluates to model's loss.
          global_step: Tensor with global step of the model.
          optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
          learning_rate: If this is a constant float value, no decay function is used.
                         Alternatively, a customized decay function can be passed that
                         accepts global_step as a parameter and returns a Tensor.
                         e.g. exponential decay function:
                         def exp_decay(global_step):
                            return tf.train.exponential_decay(
                                learning_rate=0.1, global_step=global_step,
                                decay_steps=2, decay_rate=0.001)
        Raises:
            ValueError: if learning_rate is not a float or a callable.
        """
        self.loss = loss
        self.global_step = global_step
        # pylint: disable=redefined-variable-type
        if isinstance(learning_rate, float):
            self._learning_rate = vs.get_variable(
                "learning_rate", [],
                initializer=init_ops.constant_initializer(learning_rate))
        elif callable(learning_rate):
            self._learning_rate = learning_rate(self.global_step)
        else:
            raise ValueError(
                "learning_rate should be a float or a callable function.")
        params = variables.trainable_variables()
        self.gradients = gradients.gradients(loss, params)
        if clip_gradients > 0.0:
            self.gradients, self.gradients_norm = clip_ops.clip_by_global_norm(
                self.gradients, clip_gradients)
        grads_and_vars = zip(self.gradients, params)
        if isinstance(optimizer, str):
            self._optimizer = OPTIMIZER_CLS_NAMES[optimizer](
                self._learning_rate)
        else:
            self._optimizer = optimizer(self._learning_rate)
        self.trainer = self._optimizer.apply_gradients(grads_and_vars,
                                                       global_step=global_step,
                                                       name="train")
        # Update ops during training, e.g. batch_norm_ops
        self.trainer = control_flow_ops.group(
            self.trainer, *ops.get_collection('update_ops'))
        # Get all initializers for all trainable variables.
        self._initializers = variables.initialize_all_variables()
Example #21
    def gradient_clipnorm_fn(grads_and_vars):

        if isinstance(distribute_ctx.get_strategy(),
                      central_storage_strategy.CentralStorageStrategy):
            raise ValueError(
                "`global_clipnorm` is not supported with `CenteralStorageStrategy`"
            )

        grads, variables = zip(*grads_and_vars)
        clipped_grads, _ = clip_ops.clip_by_global_norm(grads, clipnorm)
        clipped_grads_and_vars = list(zip(clipped_grads, variables))
        return clipped_grads_and_vars
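The closure above applies the same transformation as the public tf.clip_by_global_norm API; here is a minimal TF 2.x sketch of that idea, using made-up tensors and a made-up clip value.

import tensorflow as tf

va, vb = tf.Variable([1.0, 1.0]), tf.Variable([1.0, 1.0])
grads_and_vars = [(tf.constant([3.0, 4.0]), va), (tf.constant([0.0, 12.0]), vb)]

grads, variables = zip(*grads_and_vars)
clipped, global_norm = tf.clip_by_global_norm(list(grads), clip_norm=5.0)
clipped_grads_and_vars = list(zip(clipped, variables))
# global_norm is sqrt(3^2 + 4^2 + 12^2) = 13, so each gradient is scaled by 5/13.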
Example #22
  def testClipByGlobalNormInf(self):
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
                                shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        self.evaluate(norm)
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        ans[0].eval()
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        ans[1].eval()
Example #23
  def testClipByGlobalNormInf(self):
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
                                shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        self.evaluate(norm)
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        ans[0].eval()
      with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
        ans[1].eval()
Example #24
    def __init__(self, loss, global_step, optimizer,
                 learning_rate, clip_gradients=5.0):
        """Build a trainer part of graph.

        Args:
          loss: Tensor that evaluates to model's loss.
          global_step: Tensor with global step of the model.
          optimizer: Name of the optimizer class (SGD, Adam, Adagrad) or class.
          learning_rate: If this is a constant float value, no decay function is used.
                         Alternatively, a customized decay function can be passed that
                         accepts global_step as a parameter and returns a Tensor.
                         e.g. exponential decay function:
                         def exp_decay(global_step):
                            return tf.train.exponential_decay(
                                learning_rate=0.1, global_step=global_step,
                                decay_steps=2, decay_rate=0.001)
        Raises:
            ValueError: if learning_rate is not a float or a callable.
        """
        self.loss = loss
        self.global_step = global_step
        # pylint: disable=redefined-variable-type
        if isinstance(learning_rate, float):
            self._learning_rate = vs.get_variable(
                "learning_rate",
                [],
                initializer=init_ops.constant_initializer(learning_rate))
        elif callable(learning_rate):
            self._learning_rate = learning_rate(self.global_step)
        else:
            raise ValueError("learning_rate should be a float or a callable function.")
        params = variables.trainable_variables()
        self.gradients = gradients.gradients(loss, params)
        if clip_gradients > 0.0:
            self.gradients, self.gradients_norm = clip_ops.clip_by_global_norm(
                self.gradients, clip_gradients)
        grads_and_vars = zip(self.gradients, params)
        if isinstance(optimizer, str):
            self._optimizer = OPTIMIZER_CLS_NAMES[
                optimizer](self._learning_rate)
        else:
            self._optimizer = optimizer(self._learning_rate)
        self.trainer = self._optimizer.apply_gradients(grads_and_vars,
                                                       global_step=global_step,
                                                       name="train")
        # Update ops during training, e.g. batch_norm_ops
        self.trainer = control_flow_ops.group(self.trainer, *ops.get_collection('update_ops'))
        # Get all initializers for all trainable variables.
        self._initializers = variables.initialize_all_variables()
Example #25
  def testClipByGlobalNormInf(self):
    # Expect all NaNs when global norm is inf.
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
                                shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = self.evaluate(norm)
      self.assertAllEqual(tf_norm, float('inf'))
      self.assertAllEqual(tf_ans_1, np.full([2, 3], float('nan')))
      self.assertAllEqual(tf_ans_2, np.full([2], float('nan')))
Example #26
  def testClipByGlobalNormInf(self):
    # Expect all NaNs when global norm is inf.
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
                                shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = self.evaluate(norm)
      self.assertAllEqual(tf_norm, float('inf'))
      self.assertAllEqual(tf_ans_1, np.full([2, 3], float('nan')))
      self.assertAllEqual(tf_ans_2, np.full([2], float('nan')))
Example #27
  def get_train_step(self, loss):
    """Returns the ops to run to perform a training step on this estimator.

    Args:
      loss: The loss to use when calculating gradients.

    Returns:
      The ops to run to perform a training step.
    """
    my_vars = self._get_vars()
    if not (self._get_feature_columns() or my_vars):
      return []

    grads = gradients.gradients(loss, my_vars)
    if self._gradient_clip_norm:
      grads, _ = clip_ops.clip_by_global_norm(grads, self._gradient_clip_norm)
    return [self._get_optimizer().apply_gradients(zip(grads, my_vars))]
Example #28
  def get_train_step(self, loss):
    """Returns the ops to run to perform a training step on this estimator.

    Args:
      loss: The loss to use when calculating gradients.

    Returns:
      The ops to run to perform a training step.
    """
    my_vars = self._get_vars()
    if not (self._get_feature_columns() or my_vars):
      return []

    grads = gradients.gradients(loss, my_vars)
    if self._gradient_clip_norm:
      grads, _ = clip_ops.clip_by_global_norm(grads, self._gradient_clip_norm)
    return [self._get_optimizer().apply_gradients(zip(grads, my_vars))]
Example #29
  def testClipByGlobalNormZero(self):
    # No norm clipping when norm = 0
    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([0.0, 0.0])
      # Norm = 0, no changes
      np_ans_0 = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
      np_ans_1 = [0.0, 0.0]
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = norm.eval()

    self.assertAllClose(tf_norm, 0.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #30
  def testClipByGlobalNormZero(self):
    # No norm clipping when norm = 0
    with self.session(use_gpu=True):
      x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([0.0, 0.0])
      # Norm = 0, no changes
      np_ans_0 = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
      np_ans_1 = [0.0, 0.0]
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = self.evaluate(norm)

    self.assertAllClose(tf_norm, 0.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #31
  def testClipByGlobalNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      np_ans_0 = [[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
      np_ans_1 = [1.0, -2.0]
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = self.evaluate(norm)

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #32
  def testClipByGlobalNormNotClipped(self):
    # No norm clipping when clip_norm >= 5
    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      np_ans_0 = [[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
      np_ans_1 = [1.0, -2.0]
      clip_norm = 6.0

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = norm.eval()

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #33
  def testClipByGlobalNormClippedTensor(self):
    # Norm clipping when clip_norm < 5
    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      clip_norm = constant_op.constant(4.0)

      # Answers are the original tensors scaled by 4.0/5.0
      np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
      np_ans_1 = [0.8, -1.6]

      ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = norm.eval()

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
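The expected values in these clipped-tensor tests follow from scaling every input by clip_norm / global_norm; a quick NumPy check of the 4.0/5.0 factor used above:

import numpy as np

x0 = np.array([[-2.0, 0.0, 0.0], [4.0, 0.0, 0.0]])
x1 = np.array([1.0, -2.0])
global_norm = np.sqrt((x0 ** 2).sum() + (x1 ** 2).sum())  # sqrt(4 + 16 + 1 + 4) = 5.0
scale = 4.0 / global_norm                                 # clip_norm / global_norm = 0.8
print(x0 * scale)  # [[-1.6  0.   0. ] [ 3.2  0.   0. ]]
print(x1 * scale)  # [ 0.8 -1.6]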
Example #34
  def testClipByGlobalNormClippedTensor(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      clip_norm = constant_op.constant(4.0)

      # Answers are the original tensors scaled by 4.0/5.0
      np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
      np_ans_1 = [0.8, -1.6]

      ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].eval()
      tf_norm = self.evaluate(norm)

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #35
  def testClipByGlobalNormWithIndexedSlicesClipped(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = ops.IndexedSlices(
          constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      clip_norm = 4.0

      # Answers are the original tensors scaled by 4.0/5.0
      np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
      np_ans_1 = [0.8, -1.6]

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].values.eval()
      tf_norm = self.evaluate(norm)

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #36
  def testClipByGlobalNormWithIndexedSlicesClipped(self):
    # Norm clipping when clip_norm < 5
    with self.test_session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = ops.IndexedSlices(
          constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      clip_norm = 4.0

      # Answers are the original tensors scaled by 4.0/5.0
      np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
      np_ans_1 = [0.8, -1.6]

      ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[1].values.eval()
      tf_norm = norm.eval()

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #37
  def testClipByGlobalNormSupportsNone(self):
    # Norm clipping when clip_norm < 5
    with self.session(use_gpu=True):
      x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
      x1 = constant_op.constant([1.0, -2.0])
      # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
      clip_norm = 4.0

      # Answers are the original tensors scaled by 4.0/5.0
      np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
      np_ans_1 = [0.8, -1.6]

      ans, norm = clip_ops.clip_by_global_norm((x0, None, x1, None), clip_norm)
      self.assertTrue(ans[1] is None)
      self.assertTrue(ans[3] is None)
      tf_ans_1 = ans[0].eval()
      tf_ans_2 = ans[2].eval()
      tf_norm = self.evaluate(norm)

    self.assertAllClose(tf_norm, 5.0)
    self.assertAllClose(np_ans_0, tf_ans_1)
    self.assertAllClose(np_ans_1, tf_ans_2)
Example #38
 def apply_update(self, optimizer, grads_and_vars):
     (grads, vars) = zip(*grads_and_vars)
     
     # Gradient clipping
     if CustomTrainer.GRADIENT_CLIP in self.train_hypers:
         grads, global_norm = clip_ops.clip_by_global_norm(grads,
                                 self.train_hypers[CustomTrainer.GRADIENT_CLIP])
     # Gradient noise
     if CustomTrainer.GRADIENT_NOISE in self.train_hypers:
         sigma_sqr = self.train_hypers[CustomTrainer.GRADIENT_NOISE]
         if CustomTrainer.GRADIENT_NOISE_DECAY in self.train_hypers:
             sigma_sqr /= tf.pow(1.0 + tf.to_float(self.global_step),
                                 self.train_hypers[CustomTrainer.GRADIENT_NOISE_DECAY])
         grads_tmp = []
         for g in grads:
             if g is not None:
                 noisy_grad = g + tf.sqrt(sigma_sqr)*tf.random_normal(tf.shape(g))
                 grads_tmp.append(noisy_grad)
             else:
                 grads_tmp.append(g)
         grads = grads_tmp
         
     train_op = optimizer.apply_gradients(zip(grads, vars), global_step=self.global_step)
     return train_op
Example #39
    def testClipByGlobalNormSupportsNone(self):
        # Norm clipping when clip_norm < 5
        with self.test_session():
            x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0],
                                      shape=[2, 3])
            x1 = constant_op.constant([1.0, -2.0])
            # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
            clip_norm = 4.0

            # Answers are the original tensors scaled by 4.0/5.0
            np_ans_0 = [[-1.6, 0.0, 0.0], [3.2, 0.0, 0.0]]
            np_ans_1 = [0.8, -1.6]

            ans, norm = clip_ops.clip_by_global_norm((x0, None, x1, None),
                                                     clip_norm)
            self.assertTrue(ans[1] is None)
            self.assertTrue(ans[3] is None)
            tf_ans_1 = ans[0].eval()
            tf_ans_2 = ans[2].eval()
            tf_norm = norm.eval()

        self.assertAllClose(tf_norm, 5.0)
        self.assertAllClose(np_ans_0, tf_ans_1)
        self.assertAllClose(np_ans_1, tf_ans_2)
Example #40
  def testModelWithBuckets(self):
    """Larger tests that does full sequence-to-sequence model training."""
    # We learn to copy 10 symbols in 2 buckets: length 4 and length 8.
    classes = 10
    buckets = [(4, 4), (8, 8)]
    perplexities = [[], []]  # Results for each bucket.
    random_seed.set_random_seed(111)
    random.seed(111)
    np.random.seed(111)

    with self.test_session() as sess:
      # We use sampled softmax so we keep output projection separate.
      w = variable_scope.get_variable("proj_w", [24, classes])
      w_t = array_ops.transpose(w)
      b = variable_scope.get_variable("proj_b", [classes])

      # Here comes a sample Seq2Seq model using GRU cells.
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
              state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24,
              output_projection=(w, b))

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]

        def SampledLoss(labels, inputs):
          labels = array_ops.reshape(labels, [-1, 1])
          return nn_impl.sampled_softmax_loss(
              weights=w_t,
              biases=b,
              labels=labels,
              inputs=inputs,
              num_sampled=8,
              num_classes=classes)

        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            softmax_loss_function=SampledLoss)

      # Now we construct the copy model.
      batch_size = 8
      inp = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      out = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      weights = [
          array_ops.ones_like(
              inp[0], dtype=dtypes.float32) for _ in range(8)
      ]
      with variable_scope.variable_scope("root"):
        _, losses = SampleGRUSeq2Seq(inp, out, weights)
        updates = []
        params = variables.all_variables()
        optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
        for i in range(len(buckets)):
          full_grads = gradients_impl.gradients(losses[i], params)
          grads, _ = clip_ops.clip_by_global_norm(full_grads, 30.0)
          update = optimizer.apply_gradients(zip(grads, params))
          updates.append(update)
        sess.run([variables.global_variables_initializer()])
      steps = 6
      for _ in range(steps):
        bucket = random.choice(np.arange(len(buckets)))
        length = buckets[bucket][0]
        i = [
            np.array(
                [np.random.randint(9) + 1 for _ in range(batch_size)],
                dtype=np.int32) for _ in range(length)
        ]
        # 0 is our "GO" symbol here.
        o = [np.array([0] * batch_size, dtype=np.int32)] + i
        feed = {}
        for i1, i2, o1, o2 in zip(inp[:length], i[:length], out[:length],
                                  o[:length]):
          feed[i1.name] = i2
          feed[o1.name] = o2
        if length < 8:  # For the 4-bucket, we need the 5th as target.
          feed[out[length].name] = o[length]
        res = sess.run([updates[bucket], losses[bucket]], feed)
        perplexities[bucket].append(math.exp(float(res[1])))
      for bucket in range(len(buckets)):
        if len(perplexities[bucket]) > 1:  # Assert that perplexity went down.
          self.assertLess(perplexities[bucket][-1],  # 10% margin of error.
                          1.1 * perplexities[bucket][0])
Example #41
def _linear_classifier_model_fn(features, targets, mode, params):
    """Estimator's linear model_fn."""
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    feature_columns = params["feature_columns"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)

    if not isinstance(features, dict):
        features = {"": features}

    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    feat_values = features.values() if isinstance(features,
                                                  dict) else [features]
    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas, min_slice_size=64 << 20)
    with variable_scope.variable_op_scope(feat_values,
                                          "linear",
                                          partitioner=partitioner) as scope:
        logits, _, _ = (layers.weighted_sum_from_feature_columns(
            columns_to_tensors=features,
            feature_columns=feature_columns,
            num_outputs=num_label_columns,
            weight_collections=["linear"],
            scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(math_ops.to_float(
                features[weight_column_name]),
                                              shape=(-1, ))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        my_vars = ops.get_collection("linear")
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(
            optimizer.apply_gradients(zip(grads, my_vars),
                                      global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
Example #42
def _clip_gradients_by_norm(grads_and_vars, clip_gradients):
    """Clips gradients by global norm."""
    gradients, variables = zip(*grads_and_vars)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(
        gradients, clip_gradients)
    return list(zip(clipped_gradients, variables))
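A minimal sketch of calling _clip_gradients_by_norm from a TF 2.x GradientTape training step; the toy Keras model, data, and clip value are illustrative assumptions.

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.SGD(0.01)
x, y = tf.ones([4, 3]), tf.zeros([4, 1])

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
grads = tape.gradient(loss, model.trainable_variables)
# Pair each gradient with its variable, clip by global norm, then apply.
clipped = _clip_gradients_by_norm(zip(grads, model.trainable_variables),
                                  clip_gradients=1.0)
optimizer.apply_gradients(clipped)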
Example #43
 def _finish(self, update_ops, name_scope):
   """"""
   
   caches = [update_op[0] for update_op in update_ops]
   update_ops = [update_op[1:] for update_op in update_ops]
   if self._noise is not None:
     for cache in caches:
       s_t, x_tm1 = cache[:2]
       s_t += random_ops.random_normal(x_tm1.initialized_value().get_shape(), stddev=self._noise)
       cache[0] = s_t
   
   if self._clip is not None:
     S_t = [cache[0] for cache in caches]
     S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
     for cache, s_t in zip(caches, S_t):
       cache[0] = s_t
   
   new_update_ops = []
   for cache, update_op in zip(caches, update_ops):
     if len(cache) == 3:
       s_t, x_tm1 = cache[:2]
       with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
         x_t = state_ops.assign_sub(x_tm1, s_t, use_locking=self._use_locking)
         cache.append(x_t)
     else:
       s_t_, x_tm1, idxs = cache[:3]
       with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
         x_t = state_ops.scatter_sub(x_tm1, idxs, s_t_, use_locking=self._use_locking)
         cache.append(x_t)
     new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))
   
   with ops.control_dependencies(new_update_ops):
     more_update_ops = []
     if self._save_step:
       for cache in caches:
         if len(cache) == 4:
           s_t, x_tm1 = cache[:2]
           s_tm1 = self.get_slot(x_tm1, 's')
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             new_step_and_grads = []
             s_t = state_ops.assign(s_tm1, -s_t, use_locking=self._use_locking)
         else:
           s_t_, x_tm1, idxs = cache[:3]
           s_tm1 = self.get_slot(x_tm1, 's')
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             s_t = state_ops.scatter_update(s_tm1, idxs, -s_t_, use_locking=self._use_locking)
         more_update_ops.append(s_t)
     if self._save_grad:
       for cache in caches:
         if len(cache) == 4:
           x_tm1, g_t = cache[1:3]
           g_tm1 = self.get_slot(x_tm1, 'g')
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             new_step_and_grads = []
             g_t = state_ops.assign(g_tm1, g_t, use_locking=self._use_locking)
         else:
           x_tm1, idxs, g_t_ = cache[1:4]
           g_tm1 = self.get_slot(x_tm1, 'g')
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             g_t = state_ops.scatter_update(g_tm1, idxs, g_t_, use_locking=self._use_locking)
         more_update_ops.append(g_t)
     
     if self._chi > 0:
       for cache in caches:
         if len(cache) == 4:
           _, x_tm1, _, x_t = cache
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             x_and_t = self._dense_moving_average(x_tm1, x_t, 'x', self._chi)
             more_update_ops.append(control_flow_ops.group(*x_and_t))
         else:
           _, x_tm1, idxs, _, x_t = cache
           with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
             x_t_ = array_ops.gather(x_t, idxs)
             x_and_t = self._sparse_moving_average(x_tm1, idxs, x_t_, 'x', self._chi)
             more_update_ops.append(control_flow_ops.group(*x_and_t))
   
   return control_flow_ops.group(*(new_update_ops + more_update_ops), name=name_scope)
Example #44
    def _finish(self, update_ops, name_scope):
        """"""

        caches = [update_op[0] for update_op in update_ops]
        update_ops = [update_op[1:] for update_op in update_ops]
        if self._noise is not None:
            for cache in caches:
                s_t, x_tm1 = cache[:2]
                s_t += random_ops.random_normal(
                    x_tm1.initialized_value().get_shape(), stddev=self._noise)
                cache[0] = s_t

        if self._clip > 0:
            S_t = [cache[0] for cache in caches]
            S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
            for cache, s_t in zip(caches, S_t):
                cache[0] = s_t

        new_update_ops = []
        for cache, update_op in zip(caches, update_ops):
            if len(cache) == 3:
                s_t, x_tm1 = cache[:2]
                with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                        x_tm1.device):
                    x_t = state_ops.assign_sub(x_tm1,
                                               s_t,
                                               use_locking=self._use_locking)
                    cache.append(x_t)
            else:
                s_t_, x_tm1, idxs = cache[:3]
                with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                        x_tm1.device):
                    x_t = state_ops.scatter_sub(x_tm1,
                                                idxs,
                                                s_t_,
                                                use_locking=self._use_locking)
                    cache.append(x_t)
            new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))

        with ops.control_dependencies(new_update_ops):
            more_update_ops = []
            if self._save_step:
                for cache in caches:
                    if len(cache) == 4:
                        s_t, x_tm1 = cache[:2]
                        s_tm1 = self.get_slot(x_tm1, 's')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            new_step_and_grads = []
                            s_t = state_ops.assign(
                                s_tm1, -s_t, use_locking=self._use_locking)
                    else:
                        s_t_, x_tm1, idxs = cache[:3]
                        s_tm1 = self.get_slot(x_tm1, 's')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            s_t = state_ops.scatter_update(
                                s_tm1,
                                idxs,
                                -s_t_,
                                use_locking=self._use_locking)
                    more_update_ops.append(s_t)
            if self._save_grad:
                for cache in caches:
                    if len(cache) == 4:
                        x_tm1, g_t = cache[1:3]
                        g_tm1 = self.get_slot(x_tm1, 'g')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            new_step_and_grads = []
                            g_t = state_ops.assign(
                                g_tm1, g_t, use_locking=self._use_locking)
                    else:
                        x_tm1, idxs, g_t_ = cache[1:4]
                        g_tm1 = self.get_slot(x_tm1, 'g')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            g_t = state_ops.scatter_update(
                                g_tm1,
                                idxs,
                                g_t_,
                                use_locking=self._use_locking)
                    more_update_ops.append(g_t)

            if self._chi > 0:
                for cache in caches:
                    if len(cache) == 4:
                        _, x_tm1, _, x_t = cache
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            x_and_t = self._dense_moving_average(
                                x_tm1, x_t, 'x', self._chi)
                            more_update_ops.append(
                                control_flow_ops.group(*x_and_t))
                    else:
                        _, x_tm1, idxs, _, x_t = cache
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            x_t_ = array_ops.gather(x_t, idxs)
                            x_and_t = self._sparse_moving_average(
                                x_tm1, idxs, x_t_, 'x', self._chi)
                            more_update_ops.append(
                                control_flow_ops.group(*x_and_t))

        return control_flow_ops.group(*(new_update_ops + more_update_ops),
                                      name=name_scope)
Example #45
def build_multi_tower_graph(images,
                            sketches,
                            images_d,
                            image_paired_class_ids,
                            image_paired_class_ids_d,
                            text_vocab_indiceses,
                            LSTM_hybrid,
                            vocab_size,
                            batch_size,
                            num_gpu,
                            batch_portion,
                            training,
                            learning_rates,
                            counter,
                            max_iter_step,
                            ld=10,
                            data_format='NCHW',
                            distance_map=True,
                            optimizer='Adam',
                            block_type='MRU'):
    """
    :param images: [batch_size, 3, H, W]
    :param sketches:  [batch_size, 3, H, W]
    :param images_d:  [batch_size, 3, H, W]
    :param image_paired_class_ids: [batch_size, ], class_number
    :param image_paired_class_ids_d: [batch_size, ]
    :param text_vocab_indiceses: [batch_size, 15]
    :return:
    """
    models.set_param(data_format=data_format)

    with tf.device('/cpu:0'):
        images_list = split_inputs(images, batch_size, batch_portion,
                                   num_gpu)  # [num_gpu, [N, C, H, W]]
        images_d_list = split_inputs(images_d, batch_size, batch_portion,
                                     num_gpu)
        sketches_list = split_inputs(sketches, batch_size, batch_portion,
                                     num_gpu)
        image_paired_class_ids_list = split_inputs(image_paired_class_ids,
                                                   batch_size, batch_portion,
                                                   num_gpu)
        image_paired_class_ids_d_list = split_inputs(image_paired_class_ids_d,
                                                     batch_size, batch_portion,
                                                     num_gpu)
        text_vocab_indiceses_list = split_inputs(text_vocab_indiceses,
                                                 batch_size, batch_portion,
                                                 num_gpu)

    lr_g = learning_rates['generator']
    lr_d = learning_rates['discriminator']
    optimizer = get_optimizer(optimizer)
    decay = tf.maximum(
        0.2, 1. - (tf.cast(counter, tf.float32) / max_iter_step * 0.9))
    tf.summary.scalar('learning_rate_g', lr_g * decay)
    optim_g = optimizer(learning_rate=lr_g * decay)
    optim_d = optimizer(learning_rate=lr_d * decay)

    tower_grads_g = []
    tower_grads_d = []
    for i in range(num_gpu):
        with tf.name_scope('%s_%d' % ('GPU', i)) as scope:
            loss_g, loss_d, grad_g, grad_d \
                = build_single_graph(images_list[i],
                                     sketches_list[i],
                                     images_d_list[i],
                                     image_paired_class_ids_list[i],
                                     image_paired_class_ids_d_list[i],
                                     text_vocab_indiceses_list[i],
                                     batch_size * batch_portion[i],
                                     training,
                                     LSTM_hybrid=LSTM_hybrid,
                                     vocab_size=vocab_size,
                                     ld=ld, data_format=data_format,
                                     distance_map=distance_map,
                                     optim_g=optim_g,
                                     optim_d=optim_d,
                                     block_type=block_type)

            tower_grads_g.append(grad_g)
            tower_grads_d.append(grad_d)

    assert len(tower_grads_g) == len(tower_grads_d)
    if len(tower_grads_d) == 1:
        # Single tower: nothing to average.
        ave_grad_g = tower_grads_g[0]
        ave_grad_d = tower_grads_d[0]
    else:
        ave_grad_g, ave_grad_d = average_gradients(
            (tower_grads_g, tower_grads_d))

    # Apply gradients
    # Hack to force initialization of optimizer variables.
    tf.get_variable_scope()._reuse = False

    if Config.sn:
        # Get the update ops
        spectral_norm_update_ops = tf.get_collection(
            Config.SPECTRAL_NORM_UPDATE_OPS)
    else:
        spectral_norm_update_ops = [tf.no_op()]
        assign_ops = tf.no_op()

    # Clip gradients if using WGAN/DRAGAN
    global_grad_norm_G = None
    global_grad_norm_G_clipped = None
    global_grad_norm_D = None
    global_grad_norm_D_clipped = None

    if not Config.sn:
        max_grad_norm_G = 50.
        max_grad_norm_D = 100.
        hard_clip_norm_G = 5.
        hard_clip_norm_D = 10.

        ave_grad_g_tensors, ave_grad_g_vars = list(zip(*ave_grad_g))
        global_grad_norm_G = clip_ops.global_norm(ave_grad_g_tensors)
        ave_grad_g_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_g_tensors, max_grad_norm_G, global_grad_norm_G)
        ave_grad_g_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_G)
            for t in ave_grad_g_tensors
        ]
        ave_grad_g = list(zip(ave_grad_g_tensors, ave_grad_g_vars))

        ave_grad_d_tensors, ave_grad_d_vars = list(zip(*ave_grad_d))
        global_grad_norm_D = clip_ops.global_norm(ave_grad_d_tensors)
        ave_grad_d_tensors, _ = clip_ops.clip_by_global_norm(
            ave_grad_d_tensors, max_grad_norm_D, global_grad_norm_D)
        ave_grad_d_tensors = [
            clip_ops.clip_by_norm(t, hard_clip_norm_D)
            for t in ave_grad_d_tensors
        ]
        ave_grad_d = list(zip(ave_grad_d_tensors, ave_grad_d_vars))
    with tf.control_dependencies(spectral_norm_update_ops):
        opt_g = optimize(ave_grad_g,
                         optim_g,
                         None,
                         'gradient_norm',
                         global_norm=global_grad_norm_G,
                         global_norm_clipped=global_grad_norm_G_clipped,
                         appendix='_G')
    opt_d = optimize(ave_grad_d,
                     optim_d,
                     None,
                     'gradient_norm',
                     global_norm=global_grad_norm_D,
                     global_norm_clipped=global_grad_norm_D_clipped,
                     appendix='_D')

    summaries = gather_summaries()
    loss_g, loss_d = gather_losses()

    # Generator output from last tower
    return opt_g, opt_d, loss_g, loss_d, summaries
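
When spectral normalization is disabled, the block above clips in two stages: first a soft clip of the whole gradient list to its global norm, then a hard per-tensor clip. The stand-alone sketch below isolates that pattern with the same generator thresholds; the gradient values and variables are made-up placeholders, not tensors from this graph.

import tensorflow.compat.v1 as tf
from tensorflow.python.ops import clip_ops

# Placeholder (gradient, variable) pairs standing in for ave_grad_g above.
grads_and_vars = [(tf.constant([30., 40.]), tf.Variable([1., 1.])),
                  (tf.constant([0.5, 0.5]), tf.Variable([2., 2.]))]
max_grad_norm = 50.   # soft, global-norm threshold (max_grad_norm_G above)
hard_clip_norm = 5.   # hard, per-tensor threshold (hard_clip_norm_G above)

tensors, variables = zip(*grads_and_vars)
global_norm = clip_ops.global_norm(tensors)             # ~50.0 here
tensors, _ = clip_ops.clip_by_global_norm(tensors, max_grad_norm, global_norm)
tensors = [clip_ops.clip_by_norm(t, hard_clip_norm) for t in tensors]
clipped_grads_and_vars = list(zip(tensors, variables))
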
Ejemplo n.º 46
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of tf.Optimizer that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be an instantiation of a tf.Optimizer
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    clip_gradients: float or None, clips gradients by this value.
    moving_average_decay: float or None, takes into account previous loss
                          to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
                            Tensors, returns Tensor. Can be used to implement
                            any learning rate decay function.
                            For example: tf.train.exponential_decay.
    variables: list of variables to optimize, or None to use all trainable
      variables.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Learning rate variable, with possible decay.
  if isinstance(learning_rate, ops.Tensor) and len(learning_rate.get_shape()) == 0:
    lr = learning_rate
  elif isinstance(learning_rate, float):
    lr = vs.get_variable("learning_rate",
                         [],
                         trainable=False,
                         initializer=init_ops.constant_initializer(learning_rate))
  else:
    raise ValueError("Learning rate should be 0d Tensor or float. Got %s" %
        str(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer, given specified parameters.
  if isinstance(optimizer, six.string_types):
    if optimizer not in OPTIMIZER_CLS_NAMES:
      raise ValueError("Optimizer name should be one of [%s], you provided %s."
                       % (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
    opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
  elif isinstance(optimizer, type) and issubclass(optimizer,
                                                  optimizer_.Optimizer):
    opt = optimizer(learning_rate=lr)
  elif isinstance(optimizer, optimizer_.Optimizer):
    opt = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string, "
                     "subclass of Optimizer or instance of "
                     "subclass of Optimizer. Got %s." % str(optimizer))

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  gradients = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    gradients, variables = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients,
                                                        clip_gradients)
    gradients = list(zip(clipped_gradients, variables))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient

    if grad_values is not None:
      logging_ops.histogram_summary(variable.name, variable)
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")
  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss)

  return train_tensor
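
A hedged usage sketch (TF1-style graph mode): the weights and loss below are trivial stand-ins, and passing `clip_gradients` is what routes the gradients through the `clip_ops.clip_by_global_norm` branch above.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

# Trivial stand-in loss so the sketch is self-contained; any 0-d tensor works.
weights = tf.Variable([1.0, 2.0, 3.0])
my_loss = tf.reduce_mean(tf.square(weights))
global_step = tf.train.get_or_create_global_step()

train_op = optimize_loss(my_loss,
                         global_step,
                         learning_rate=0.01,
                         optimizer='Adam',
                         clip_gradients=5.0,
                         moving_average_decay=0.9)
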
Ejemplo n.º 47
0
 def clip_grads(grads_and_vars):
   """Clips the gradients in (gradient, variable) pairs by global norm.

   `clip_norm` is captured from the enclosing scope.
   """
   gradients, variables = zip(*grads_and_vars)
   gradients = clip_ops.clip_by_global_norm(gradients, clip_norm)[0]
   grads_and_vars = list(zip(gradients, variables))
   return grads_and_vars
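
A minimal sketch of how such a hook is typically wired between `compute_gradients` and `apply_gradients` (TF1-style graph mode; the loss, learning rate and clip value are made up, and `make_clip_grads` simply binds `clip_norm` in a closure, as the fragment above assumes):

import tensorflow.compat.v1 as tf
from tensorflow.python.ops import clip_ops

tf.disable_eager_execution()

def make_clip_grads(clip_norm):
  """Returns a clipping hook like the one above, with clip_norm bound."""
  def clip_grads(grads_and_vars):
    gradients, variables = zip(*grads_and_vars)
    gradients = clip_ops.clip_by_global_norm(gradients, clip_norm)[0]
    return list(zip(gradients, variables))
  return clip_grads

weights = tf.Variable([1.0, 2.0, 3.0])          # placeholder parameters
loss = tf.reduce_mean(tf.square(weights))       # placeholder loss
optimizer = tf.train.GradientDescentOptimizer(0.1)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(make_clip_grads(1.0)(grads_and_vars))
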
Ejemplo n.º 48
0
def _linear_classifier_model_fn(features, targets, mode, params):
  """Linear classifier model_fn.

  Args:
    features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`).
    targets: `Tensor` of shape [batch_size, 1] or [batch_size] target labels of
      dtype `int32` or `int64` in the range `[0, n_classes)`.
    mode: Defines whether this is training, evaluation or prediction.
      See `ModeKeys`.
    params: A dict of hyperparameters.
      The following hyperparameters are expected:
      * feature_columns: An iterable containing all the feature columns used by
          the model.
      * n_classes: number of target classes.
      * weight_column_name: A string defining the weight feature column, or
          None if there are no weights.
      * optimizer: string, `Optimizer` object, or callable that defines the
          optimizer to use for training.
      * gradient_clip_norm: A float > 0. If provided, gradients are
          clipped to their global norm with this clipping ratio.
      * enable_centered_bias: A bool. If True, estimator will learn a centered
          bias variable for each class. Rest of the model structure learns the
          residual after centered bias.
      * num_ps_replicas: The number of parameter server replicas.
      * joint_weights: If True, the weights for all columns will be stored in a
        single (possibly partitioned) variable. It's more efficient, but it's
        incompatible with SDCAOptimizer, and requires all feature columns are
        sparse and use the 'sum' combiner.

  Returns:
    predictions: A dict of `Tensor` objects.
    loss: A scalar containing the loss of the step.
    train_op: The op for training.

  Raises:
    ValueError: If mode is not any of the `ModeKeys`.
  """
  feature_columns = params["feature_columns"]
  n_classes = params["n_classes"]
  weight_column_name = params["weight_column_name"]
  optimizer = params["optimizer"]
  gradient_clip_norm = params.get("gradient_clip_norm", None)
  enable_centered_bias = params.get("enable_centered_bias", True)
  num_ps_replicas = params.get("num_ps_replicas", 0)
  joint_weights = params.get("joint_weights", False)

  if not isinstance(features, dict):
    features = {"": features}

  parent_scope = "linear"
  num_label_columns = 1 if n_classes == 2 else n_classes
  loss_fn = _softmax_cross_entropy_loss
  if n_classes == 2:
    loss_fn = _log_loss_with_two_classes

  partitioner = partitioned_variables.min_max_variable_partitioner(
      max_partitions=num_ps_replicas,
      min_slice_size=64 << 20)
  with variable_scope.variable_op_scope(
      features.values(), parent_scope, partitioner=partitioner) as scope:
    if joint_weights:
      logits, _, _ = (
          layers.joint_weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=[parent_scope],
              scope=scope))
    else:
      logits, _, _ = (
          layers.weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=[parent_scope],
              scope=scope))

  if enable_centered_bias:
    logits = nn.bias_add(logits, _centered_bias(num_label_columns))

  loss = None
  if mode != estimator.ModeKeys.INFER:
    loss = loss_fn(logits, targets)
    if weight_column_name:
      weight_tensor = array_ops.reshape(
          math_ops.to_float(features[weight_column_name]), shape=(-1,))
      loss = _weighted_loss(loss, weight_tensor)
    else:
      loss = math_ops.reduce_mean(loss, name="loss")
    logging_ops.scalar_summary("loss", loss)

  train_ops = []
  if mode == estimator.ModeKeys.TRAIN:
    global_step = contrib_variables.get_global_step()

    my_vars = ops.get_collection("linear")
    grads = gradients.gradients(loss, my_vars)
    if gradient_clip_norm:
      grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
    train_ops.append(optimizer.apply_gradients(
        zip(grads, my_vars), global_step=global_step))
    if enable_centered_bias:
      train_ops.append(
          _centered_bias_step(targets, loss_fn, num_label_columns))

  predictions = {}
  if n_classes == 2:
    predictions[_LOGISTIC] = math_ops.sigmoid(logits)
    logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
  predictions[_PROBABILITIES] = nn.softmax(logits)
  predictions[_CLASSES] = math_ops.argmax(logits, 1)

  return predictions, loss, control_flow_ops.group(*train_ops)
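
`gradient_clip_norm` is read from `params`, so clipping is enabled simply by including it in the hyperparameter dict. An illustrative dict follows, assuming the same contrib-era `layers` and `tf` imports as the snippet; the column name and all values are made up.

# Illustrative only; `layers.real_valued_column` is the contrib feature-column
# API of this model_fn's era, and every value here is a placeholder.
params = {
    "feature_columns": [layers.real_valued_column("age")],
    "n_classes": 2,
    "weight_column_name": None,
    "optimizer": tf.train.FtrlOptimizer(learning_rate=0.1),
    "gradient_clip_norm": 5.0,   # routes grads through clip_by_global_norm
    "enable_centered_bias": True,
    "num_ps_replicas": 0,
    "joint_weights": False,
}
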
Ejemplo n.º 49
0
def _linear_classifier_model_fn(features, targets, mode, params):
  """Estimator's linear model_fn."""
  n_classes = params["n_classes"]
  weight_column_name = params["weight_column_name"]
  feature_columns = params["feature_columns"]
  optimizer = params["optimizer"]
  gradient_clip_norm = params.get("gradient_clip_norm", None)
  enable_centered_bias = params.get("enable_centered_bias", True)
  num_ps_replicas = params.get("num_ps_replicas", 0)
  joint_weights = params.get("joint_weights", False)

  if not isinstance(features, dict):
    features = {"": features}

  num_label_columns = 1 if n_classes == 2 else n_classes
  loss_fn = _softmax_cross_entropy_loss
  if n_classes == 2:
    loss_fn = _log_loss_with_two_classes

  feat_values = (features.values() if isinstance(features, dict)
                 else [features])
  partitioner = partitioned_variables.min_max_variable_partitioner(
      max_partitions=num_ps_replicas,
      min_slice_size=64 << 20)
  with variable_scope.variable_op_scope(
      feat_values, "linear", partitioner=partitioner) as scope:
    if joint_weights:
      logits, _, _ = (
          layers.joint_weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=["linear"],
              scope=scope))
    else:
      logits, _, _ = (
          layers.weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=["linear"],
              scope=scope))

  if enable_centered_bias:
    logits = nn.bias_add(logits, _centered_bias(num_label_columns))

  loss = None
  if mode != estimator.ModeKeys.INFER:
    loss = loss_fn(logits, targets)
    if weight_column_name:
      weight_tensor = array_ops.reshape(
          math_ops.to_float(features[weight_column_name]), shape=(-1,))
      loss = _weighted_loss(loss, weight_tensor)
    else:
      loss = math_ops.reduce_mean(loss, name="loss")
    logging_ops.scalar_summary("loss", loss)

  train_ops = []
  if mode == estimator.ModeKeys.TRAIN:
    global_step = contrib_variables.get_global_step()

    my_vars = ops.get_collection("linear")
    grads = gradients.gradients(loss, my_vars)
    if gradient_clip_norm:
      grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
    train_ops.append(optimizer.apply_gradients(
        zip(grads, my_vars), global_step=global_step))
    if enable_centered_bias:
      train_ops.append(
          _centered_bias_step(targets, loss_fn, num_label_columns))

  predictions = {}
  if n_classes == 2:
    predictions[_LOGISTIC] = math_ops.sigmoid(logits)
    logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
  predictions[_PROBABILITIES] = nn.softmax(logits)
  predictions[_CLASSES] = math_ops.argmax(logits, 1)

  return predictions, loss, control_flow_ops.group(*train_ops)
Ejemplo n.º 50
0
 def _testNonFiniteClippingByGlobalNorm(self, inputs, max_norm):
     clipped = clip_ops.clip_by_global_norm(inputs, max_norm)
     result, _ = self.evaluate(clipped)
     self.assertTrue(np.all(np.isnan(result)))
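
This test depends on the documented behaviour that a non-finite global norm makes `clip_by_global_norm` return NaN for every entry, so the error cannot go unnoticed. A small eager-mode (TF 2.x) sketch with made-up values:

import tensorflow as tf

# One infinite entry makes the global norm inf; per the docs, every clipped
# entry (including the finite ones) then comes back as NaN.
grads = [tf.constant([3.0, 4.0]), tf.constant([float('inf'), 0.0])]
clipped, norm = tf.clip_by_global_norm(grads, clip_norm=1.0)
print(norm.numpy())                  # inf
print([t.numpy() for t in clipped])  # all NaN
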
Ejemplo n.º 51
0
def _clip_gradients_by_norm(grads_and_vars, clip_gradients):
  """Clips gradients by global norm."""
  gradients, variables = zip(*grads_and_vars)
  clipped_gradients, _ = clip_ops.clip_by_global_norm(gradients,
                                                      clip_gradients)
  return list(zip(clipped_gradients, variables))
Ejemplo n.º 52
0
 def _testClipTensorByGlobalNorm(self, inputs, max_norm, expected):
     clipped = clip_ops.clip_by_global_norm(inputs, max_norm)
     result, _ = self.evaluate(clipped)
     self.assertAllClose(result, expected)
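
For the finite case the arithmetic is easy to check by hand: with inputs [1, 2] and [3, 4] the global norm is sqrt(1 + 4 + 9 + 16) = sqrt(30) ≈ 5.477, so clipping to 4.0 rescales every entry by 4.0 / 5.477 ≈ 0.730. A worked eager-mode sketch (values chosen here, not taken from the test suite):

import numpy as np
import tensorflow as tf

inputs = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]
max_norm = 4.0

clipped, global_norm = tf.clip_by_global_norm(inputs, max_norm)
print(global_norm.numpy())            # ~5.4772 (= sqrt(30))
print([t.numpy() for t in clipped])   # ~[[0.73, 1.46], [2.19, 2.92]]
print(np.sqrt(sum(np.sum(t.numpy() ** 2) for t in clipped)))  # ~4.0
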
Ejemplo n.º 53
0
def _linear_classifier_model_fn(features, targets, mode, params):
    """Linear classifier model_fn.

  Args:
    features: `Tensor` or dict of `Tensor` (depends on data passed to `fit`).
    targets: `Tensor` of shape [batch_size, 1] or [batch_size] target labels of
      dtype `int32` or `int64` in the range `[0, n_classes)`.
    mode: Defines whether this is training, evaluation or prediction.
      See `ModeKeys`.
    params: A dict of hyperparameters.
      The following hyperparameters are expected:
      * feature_columns: An iterable containing all the feature columns used by
          the model.
      * n_classes: number of target classes.
      * weight_column_name: A string defining the weight feature column, or
          None if there are no weights.
      * optimizer: string, `Optimizer` object, or callable that defines the
          optimizer to use for training.
      * gradient_clip_norm: A float > 0. If provided, gradients are
          clipped to their global norm with this clipping ratio.
      * enable_centered_bias: A bool. If True, estimator will learn a centered
          bias variable for each class. Rest of the model structure learns the
          residual after centered bias.
      * num_ps_replicas: The number of parameter server replicas.
      * joint_weights: If True, the weights for all columns will be stored in a
        single (possibly partitioned) variable. It's more efficient, but it's
        incompatible with SDCAOptimizer, and requires all feature columns are
        sparse and use the 'sum' combiner.

  Returns:
    predictions: A dict of `Tensor` objects.
    loss: A scalar containing the loss of the step.
    train_op: The op for training.

  Raises:
    ValueError: If mode is not any of the `ModeKeys`.
  """
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm", None)
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)
    joint_weights = params.get("joint_weights", False)

    if not isinstance(features, dict):
        features = {"": features}

    parent_scope = "linear"
    num_label_columns = 1 if n_classes == 2 else n_classes
    loss_fn = _softmax_cross_entropy_loss
    if n_classes == 2:
        loss_fn = _log_loss_with_two_classes

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas, min_slice_size=64 << 20)
    with variable_scope.variable_op_scope(features.values(),
                                          parent_scope,
                                          partitioner=partitioner) as scope:
        if joint_weights:
            logits, _, _ = (layers.joint_weighted_sum_from_feature_columns(
                columns_to_tensors=features,
                feature_columns=feature_columns,
                num_outputs=num_label_columns,
                weight_collections=[parent_scope],
                scope=scope))
        else:
            logits, _, _ = (layers.weighted_sum_from_feature_columns(
                columns_to_tensors=features,
                feature_columns=feature_columns,
                num_outputs=num_label_columns,
                weight_collections=[parent_scope],
                scope=scope))

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(math_ops.to_float(
                features[weight_column_name]),
                                              shape=(-1, ))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")
        logging_ops.scalar_summary("loss", loss)

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        my_vars = ops.get_collection("linear")
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(
            optimizer.apply_gradients(zip(grads, my_vars),
                                      global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
Ejemplo n.º 54
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  clip_gradients=None,
                  moving_average_decay=0.9,
                  learning_rate_decay_fn=None,
                  variables=None):
  """Given loss and parameters for optimizer, returns a training op.

  Args:
    loss: Tensor, 0 dimensional.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string (a key in OPTIMIZER_CLS_NAMES) or a callable optimizer
      class, used as the trainer.
    clip_gradients: float or None, clips gradients by this value.
    moving_average_decay: float or None, takes into account previous loss
                          to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes learning_rate and global_step
                            Tensors, returns Tensor. Can be used to implement
                            any learning rate decay function.
                            For example: tf.train.exponential_decay.
    variables: list of variables to optimize, or None to use all trainable
      variables.

  Returns:
    Training op.

  Raises:
    ValueError: if optimizer is wrong type.
  """
  # Moving average of the loss with decay.
  if moving_average_decay is not None:
    # Generate moving averages of the loss.
    loss_averages = train.ExponentialMovingAverage(moving_average_decay,
                                                   name="avg")
    loss_averages_op = loss_averages.apply([loss])
    logging_ops.scalar_summary("loss/mean", loss_averages.average(loss))
    loss = control_flow_ops.with_dependencies([loss_averages_op], loss)

  # Convert optimizer into the optimizer class.
  if isinstance(optimizer, str):
    opt_cls = OPTIMIZER_CLS_NAMES[optimizer]
  elif callable(optimizer):
    opt_cls = optimizer
  else:
    raise ValueError("Unrecognized optimizer: should be string or function.")

  # Learning rate variable, with possible decay.
  lr = vs.get_variable("learning_rate",
                       [],
                       trainable=False,
                       initializer=init_ops.constant_initializer(learning_rate))
  if learning_rate_decay_fn is not None:
    lr = learning_rate_decay_fn(lr, global_step)

  # Create optimizer.
  opt = opt_cls(learning_rate=lr)

  # All trainable variables, if specific variables are not specified.
  if variables is None:
    variables = vars_.trainable_variables()

  # Compute gradients and clip them if provided.
  gradients = opt.compute_gradients(loss, variables)
  if clip_gradients is not None:
    # compute_gradients returns (gradient, variable) pairs; unpack them before
    # clipping by global norm, then re-pair with their variables.
    grad_tensors, grad_vars = zip(*gradients)
    clipped_gradients, _ = clip_ops.clip_by_global_norm(grad_tensors,
                                                        clip_gradients)
    gradients = list(zip(clipped_gradients, grad_vars))

  # Add scalar summary for loss.
  logging_ops.scalar_summary("loss", loss)

  # Add histograms for variables, gradients and gradient norms.
  for gradient, variable in gradients:
    if isinstance(gradient, ops.IndexedSlices):
      grad_values = gradient.values
    else:
      grad_values = gradient

    if grad_values is not None:
      logging_ops.histogram_summary(variable.name, variable)
      logging_ops.histogram_summary(variable.name + "/gradients", grad_values)
      logging_ops.histogram_summary(variable.name + "/gradient_norm",
                                    clip_ops.global_norm([grad_values]))

  # Create gradient updates.
  grad_updates = opt.apply_gradients(gradients,
                                     global_step=global_step,
                                     name="train")
  # Make sure total_loss is valid.
  final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

  # Ensure the train_tensor computes grad_updates.
  train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss)

  return train_tensor
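
This earlier variant only accepts a string or callable optimizer and always creates a `learning_rate` variable, so it is typically paired with a decay function. A hedged usage sketch (TF1-style graph mode; the loss and the schedule are placeholders):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

def lr_decay(lr, global_step):
  # Illustrative schedule: halve the learning rate every 10,000 steps.
  return tf.train.exponential_decay(lr, global_step, 10000, 0.5)

weights = tf.Variable([1.0, 2.0, 3.0])        # placeholder parameters
my_loss = tf.reduce_mean(tf.square(weights))  # placeholder loss
global_step = tf.train.get_or_create_global_step()

train_op = optimize_loss(my_loss,
                         global_step,
                         learning_rate=0.1,
                         optimizer='SGD',
                         clip_gradients=1.0,
                         learning_rate_decay_fn=lr_decay)
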