Example #1
  def linear_logit_fn(features):
    """Linear model logit_fn.

    Args:
      features: This is the first item returned from the `input_fn`
                passed to `train`, `evaluate`, and `predict`. This should be a
                single `Tensor` or `dict` of same.

    Returns:
      A `Tensor` representing the logits.
    """
    cols_to_vars = {}
    logits = feature_column_lib.linear_model(
        features=features,
        feature_columns=feature_columns,
        units=units,
        sparse_combiner=sparse_combiner,
        cols_to_vars=cols_to_vars)
    bias = cols_to_vars.pop('bias')
    if units > 1:
      summary.histogram('bias', bias)
    else:
      # If units == 1, the bias value is a length-1 list of a scalar Tensor,
      # so we should provide a scalar summary.
      summary.scalar('bias', bias[0][0])
    summary.scalar('fraction_of_zero_weights',
                   _compute_fraction_of_zero(cols_to_vars))
    return logits
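The example above wires histogram and scalar summaries into a model function. For orientation, here is a minimal, self-contained sketch of the same summary plumbing, assuming the TF 1.x `tensorflow.compat.v1` API; the variable name and log directory are illustrative only.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

weights = tf.get_variable('weights', shape=[10, 5])
tf.summary.histogram('weights', weights)                    # histogram of all weight values
tf.summary.scalar('weights_mean', tf.reduce_mean(weights))  # a single scalar per step

merged = tf.summary.merge_all()
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  writer = tf.summary.FileWriter('/tmp/summary_demo', sess.graph)
  writer.add_summary(sess.run(merged), global_step=0)
  writer.close()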
Example #2
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:

        for weight in layer.weights:
          tf_summary.histogram(weight.name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = w_img.get_shape()
            if len(shape) > 1 and shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
            if len(shape) == 1:
              w_img = array_ops.expand_dims(w_img, 0)
            w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
            tf_summary.image(weight.name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)
Example #3
def add_gradients_summaries(grads_and_vars):
  """Add summaries to gradients.

  Args:
    grads_and_vars: A list of gradient to variable pairs (tuples).

  Returns:
    The list of created summaries.
  """
  summaries = []
  for grad, var in grads_and_vars:
    if grad is not None:
      if isinstance(grad, ops.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad
      summaries.append(
          summary.histogram(var.op.name + '/gradient', grad_values))
      summaries.append(
          summary.histogram(var.op.name + '/gradient_norm',
                            clip_ops.global_norm([grad_values])))
    else:
      logging.info('Var %s has no gradient', var.op.name)

  return summaries
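One plausible way to drive `add_gradients_summaries` is to hand it the `(gradient, variable)` pairs produced by an optimizer's `compute_gradients`. The sketch below assumes `tf` is `tensorflow.compat.v1` and that a scalar `loss` tensor already exists.

# Hypothetical usage sketch; `loss` is assumed to be an existing scalar Tensor.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = optimizer.compute_gradients(loss)
add_gradients_summaries(grads_and_vars)  # histograms for each gradient and its norm
train_op = optimizer.apply_gradients(grads_and_vars)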
Example #4
def add_gan_model_summaries(gan_model):
  """Adds typical GANModel summaries.

  Args:
    gan_model: A GANModel tuple.
  """
  with ops.name_scope('generator_variables'):
    for var in gan_model.generator_variables:
      summary.histogram(var.name, var)
  with ops.name_scope('discriminator_variables'):
    for var in gan_model.discriminator_variables:
      summary.histogram(var.name, var)
Example #5
  def linear_logit_fn(features):
    """Linear model logit_fn.

    Args:
      features: This is the first item returned from the `input_fn`
                passed to `train`, `evaluate`, and `predict`. This should be a
                single `Tensor` or `dict` of same.

    Returns:
      A `Tensor` representing the logits.
    """
    if feature_column_v2.is_feature_column_v2(feature_columns):
      shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
      linear_model = feature_column_v2.LinearModel(
          feature_columns=feature_columns,
          units=units,
          sparse_combiner=sparse_combiner,
          shared_state_manager=shared_state_manager)
      logits = linear_model(features)
      bias = linear_model.bias_variable

      # We'd like to get all the non-bias variables associated with this
      # LinearModel. This includes the shared embedding variables as well.
      variables = linear_model.variables
      variables.remove(bias)
      variables.extend(shared_state_manager.variables)

      # Expand (potential) Partitioned variables
      bias = _get_expanded_variable_list([bias])
      variables = _get_expanded_variable_list(variables)
    else:
      linear_model = feature_column._LinearModel(  # pylint: disable=protected-access
          feature_columns=feature_columns,
          units=units,
          sparse_combiner=sparse_combiner,
          name='linear_model')
      logits = linear_model(features)
      cols_to_vars = linear_model.cols_to_vars()
      bias = cols_to_vars.pop('bias')
      variables = cols_to_vars.values()

    if units > 1:
      summary.histogram('bias', bias)
    else:
      # If units == 1, the bias value is a length-1 list of a scalar Tensor,
      # so we should provide a scalar summary.
      summary.scalar('bias', bias[0][0])
    summary.scalar('fraction_of_zero_weights',
                   _compute_fraction_of_zero(variables))
    return logits
Example #6
 def testMergeSummary(self):
   with self.cached_session() as sess:
     const = constant_op.constant(10.0)
     summ1 = summary.histogram("h", const)
     summ2 = logging_ops.scalar_summary("c", const)
     merge = summary.merge([summ1, summ2])
     value = sess.run(merge)
   self.assertEqual([], merge.get_shape())
   self.assertProtoEquals("""
     value {
       tag: "h"
       histo {
         min: 10.0
         max: 10.0
         num: 1.0
         sum: 10.0
         sum_squares: 100.0
         bucket_limit: 9.93809490288
         bucket_limit: 10.9319043932
         bucket_limit: 1.7976931348623157e+308
         bucket: 0.0
         bucket: 1.0
         bucket: 0.0
       }
     }
     value { tag: "c" simple_value: 10.0 }
   """, self._AsSummary(value))
Example #7
  def rnn_logit_fn(features, mode):
    """Recurrent Neural Network logit_fn.

    Args:
      features: This is the first item returned from the `input_fn`
                passed to `train`, `evaluate`, and `predict`. This should be a
                single `Tensor` or `dict` of same.
      mode: Optional. Specifies if this is training, evaluation or prediction. See
            `ModeKeys`.

    Returns:
      A `Tensor` representing the logits.
    """
    with variable_scope.variable_scope(
        'sequence_input_layer',
        values=tuple(six.itervalues(features)),
        partitioner=input_layer_partitioner):
      sequence_input, sequence_length = seq_fc.sequence_input_layer(
          features=features, feature_columns=sequence_feature_columns)
      summary.histogram('sequence_length', sequence_length)

      if context_feature_columns:
        context_input = feature_column_lib.input_layer(
            features=features,
            feature_columns=context_feature_columns)
        sequence_input = seq_fc.concatenate_context_input(
            context_input, sequence_input)

    cell = rnn_cell_fn(mode)
    # Ignore output state.
    rnn_outputs, _ = rnn.dynamic_rnn(
        cell=cell,
        inputs=sequence_input,
        sequence_length=sequence_length,
        dtype=dtypes.float32,
        time_major=False)
    last_activations = _select_last_activations(rnn_outputs, sequence_length)

    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
      logits = core_layers.dense(
          last_activations,
          units=output_units,
          activation=None,
          kernel_initializer=init_ops.glorot_uniform_initializer())
    return logits
Example #8
  def _make_histogram_ops(self, model):
    """Defines histogram ops when histogram_freq > 0."""
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [
                grad.values if is_indexed_slices(grad) else grad
                for grad in grads
            ]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          if isinstance(layer.output, list):
            for i, output in enumerate(layer.output):
              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
          else:
            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
Example #9
def add_gan_model_summaries(gan_model):
  """Adds typical GANModel summaries.

  Args:
    gan_model: A GANModel tuple.
  """
  if isinstance(gan_model, namedtuples.CycleGANModel):
    with ops.name_scope('cyclegan_x2y_summaries'):
      add_gan_model_summaries(gan_model.model_x2y)
    with ops.name_scope('cyclegan_y2x_summaries'):
      add_gan_model_summaries(gan_model.model_y2x)
    return

  with ops.name_scope('generator_variables'):
    for var in gan_model.generator_variables:
      summary.histogram(var.name, var)
  with ops.name_scope('discriminator_variables'):
    for var in gan_model.discriminator_variables:
      summary.histogram(var.name, var)
Example #10
 def testHistogramSummary(self):
   with self.cached_session() as s:
     i = array_ops.ones((5, 4, 4, 3))
     with ops.name_scope('outer'):
       summ_op = summary_lib.histogram('inner', i)
     summary_str = s.run(summ_op)
   summary = summary_pb2.Summary()
   summary.ParseFromString(summary_str)
   self.assertEqual(len(summary.value), 1)
   self.assertEqual(summary.value[0].tag, 'outer/inner')
Example #11
def linear_regression(x, y, init_mean=None, init_stddev=1.0):
  """Creates linear regression TensorFlow subgraph.

  Args:
    x: tensor or placeholder for input features.
    y: tensor or placeholder for labels.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

  Side effects:
    The variables linear_regression.weights and linear_regression.bias are
    initialized as follows.  If init_mean is not None, then initialization
    will be done using a random normal initializer with the given init_mean
    and init_stddev.  (These may be set to 0.0 each if a zero initialization
    is desirable for convex use cases.)  If init_mean is None, then the
    uniform_unit_scaling_initializer will be used.
  """
  with vs.variable_scope('linear_regression'):
    scope_name = vs.get_variable_scope().name
    summary.histogram('%s.x' % scope_name, x)
    summary.histogram('%s.y' % scope_name, y)
    dtype = x.dtype.base_dtype
    y_shape = y.get_shape()
    if len(y_shape) == 1:
      output_shape = 1
    else:
      output_shape = y_shape[1]
    # Set up the requested initialization.
    if init_mean is None:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], output_shape], dtype=dtype)
      bias = vs.get_variable('bias', [output_shape], dtype=dtype)
    else:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], output_shape],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
      bias = vs.get_variable(
          'bias', [output_shape],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
    summary.histogram('%s.weights' % scope_name, weights)
    summary.histogram('%s.bias' % scope_name, bias)
    return losses_ops.mean_squared_error_regressor(x, y, weights, bias)
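A minimal call to `linear_regression` could look like the following sketch, assuming TF 1.x placeholders with `tf` as `tensorflow.compat.v1`; the feature and label shapes are illustrative.

# Hypothetical usage sketch (TF 1.x API).
x = tf.placeholder(tf.float32, shape=[None, 3], name='features')
y = tf.placeholder(tf.float32, shape=[None, 1], name='labels')
predictions, loss = linear_regression(x, y, init_mean=0.0, init_stddev=0.1)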
Example #12
def add_histogram_summary(tensor, name=None, prefix=None):
  """Adds a histogram summary for the given tensor.

  Args:
    tensor: A variable or op tensor.
    name: The optional name for the summary.
    prefix: An optional prefix for the summary names.

  Returns:
    A scalar `Tensor` of type `string` whose contents are the serialized
    `Summary` protocol buffer.
  """
  return summary.histogram(
      _get_summary_name(tensor, name, prefix), tensor)
Example #13
  def generate_run(self, run_name):
    if run_name == self._RUN_WITH_HISTOGRAM:
      (use_histogram, use_scalars) = (True, False)
    elif run_name == self._RUN_WITH_SCALARS:
      (use_histogram, use_scalars) = (False, True)
    else:
      assert False, 'Invalid run name: %r' % run_name
    ops.reset_default_graph()
    sess = session.Session()
    placeholder = array_ops.placeholder(dtypes.float32, shape=[3])
    if use_histogram:
      summary.histogram(self._HISTOGRAM_TAG, placeholder)
    if use_scalars:
      summary.scalar(self._SCALAR_TAG, math_ops.reduce_mean(placeholder))
    summ = summary.merge_all()

    subdir = os.path.join(self.logdir, run_name)
    writer = summary.FileWriter(subdir)
    writer.add_graph(sess.graph)
    for step in xrange(self._STEPS):
      feed_dict = {placeholder: [1 + step, 2 + step, 3 + step]}
      s = sess.run(summ, feed_dict=feed_dict)
      writer.add_summary(s, global_step=step)
    writer.close()
Example #14
def _add_histogram_summary(tensor, tag=None):
  """Add a summary operation for the histogram of a tensor.

  Args:
    tensor: The tensor to summarize.
    tag: The tag to use, if None then use tensor's op's name.

  Returns:
    The created histogram summary.

  Raises:
    ValueError: If the tag is already in use.
  """
  tag = tag or '%s_summary' % tensor.op.name
  return summary.histogram(tag, tensor)
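Called without a tag, the helper above derives one from the tensor's op name. The sketch below is hypothetical and assumes an existing `features` tensor and `tf` as `tensorflow.compat.v1`.

# Hypothetical usage sketch; `features` is assumed to be an existing Tensor.
activations = tf.nn.relu(features)
_add_histogram_summary(activations)  # tag defaults to '<op_name>_summary'
_add_histogram_summary(activations, tag='relu_activations')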
Example #15
  def testClassicSummaryOpsErrorOut(self):
    x = constant_op.constant(42)
    x_summary = summary.scalar('x', x)
    y = constant_op.constant([1, 3, 3, 7])
    y_summary = summary.histogram('hist', y)

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge([x_summary, y_summary])

    with self.assertRaisesRegexp(
        RuntimeError,
        r'Merging tf\.summary\.\* ops is not compatible with eager execution'):
      summary.merge_all()
Example #16
 def testMergeAllSummaries(self):
   with ops.Graph().as_default():
     const = constant_op.constant(10.0)
     summ1 = summary.histogram("h", const)
     summ2 = summary.scalar("o", const, collections=["foo_key"])
     summ3 = summary.scalar("c", const)
     merge = summary.merge_all()
     self.assertEqual("MergeSummary", merge.op.type)
     self.assertEqual(2, len(merge.op.inputs))
     self.assertEqual(summ1, merge.op.inputs[0])
     self.assertEqual(summ3, merge.op.inputs[1])
     merge = summary.merge_all("foo_key")
     self.assertEqual("MergeSummary", merge.op.type)
     self.assertEqual(1, len(merge.op.inputs))
     self.assertEqual(summ2, merge.op.inputs[0])
     self.assertTrue(summary.merge_all("bar_key") is None)
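The collection key shown in the test can also be used to keep a group of summaries out of the default merge. A small sketch under the same TF 1.x API, with a hypothetical `val_loss` tensor:

# Hypothetical sketch; `val_loss` is assumed to be an existing scalar Tensor.
summary.scalar('val_loss', val_loss, collections=['validation'])
val_merged = summary.merge_all('validation')  # merges only the 'validation' collection
train_merged = summary.merge_all()            # default GraphKeys.SUMMARIES collection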
Example #17
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)
Example #18
    def set_model(self, model):
        self.model = model
        self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:

                for weight in layer.weights:
                    tf_summary.histogram(weight.name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = w_img.get_shape()
                        if len(shape) > 1 and shape[0] > shape[1]:
                            w_img = array_ops.transpose(w_img)
                        if len(shape) == 1:
                            w_img = array_ops.expand_dims(w_img, 0)
                        w_img = array_ops.expand_dims(
                            array_ops.expand_dims(w_img, 0), -1)
                        tf_summary.image(weight.name, w_img)

                if hasattr(layer, 'output'):
                    tf_summary.histogram('{}_out'.format(layer.name),
                                         layer.output)
        self.merged = tf_summary.merge_all()

        if self.write_graph:
            self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
        else:
            self.writer = tf_summary.FileWriter(self.log_dir)

        if self.embeddings_freq:
            self.saver = saver_lib.Saver()

            embeddings_layer_names = self.embeddings_layer_names

            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            embeddings = {
                layer.name: layer.weights[0]
                for layer in self.model.layers
                if layer.name in embeddings_layer_names
            }

            embeddings_metadata = {}

            if not isinstance(self.embeddings_metadata, str):
                embeddings_metadata = self.embeddings_metadata
            else:
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings.keys()
                }

            config = projector.ProjectorConfig()
            self.embeddings_logs = []

            for layer_name, tensor in embeddings.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                self.embeddings_logs.append(
                    os.path.join(self.log_dir, layer_name + '.ckpt'))

                if layer_name in embeddings_metadata:
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Example #19
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          tf_summary.histogram(weight.name, weight)
          if self.write_grads:
            grads = model.optimizer.get_gradients(model.total_loss, weight)
            tf_summary.histogram('{}_grad'.format(weight.name), grads)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(weight.name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)

    if self.embeddings_freq:
      embeddings_layer_names = self.embeddings_layer_names

      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      embeddings = {
          layer.name: layer.weights[0]
          for layer in self.model.layers if layer.name in embeddings_layer_names
      }

      self.saver = saver_lib.Saver(list(embeddings.values()))

      embeddings_metadata = {}

      if not isinstance(self.embeddings_metadata, str):
        embeddings_metadata = self.embeddings_metadata
      else:
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings.keys()
        }

      config = projector.ProjectorConfig()
      self.embeddings_ckpt_path = os.path.join(self.log_dir,
                                               'keras_embedding.ckpt')

      for layer_name, tensor in embeddings.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if layer_name in embeddings_metadata:
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
Example #20
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True):
    """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      clipping is applied to prevent the gradient norm from exceeding this
      value. Alternatively, a callable can be provided, e.g. adaptive_clipping.
      This callable takes a `list` of `(gradients, variables)` `tuple`s and
      returns the same thing with the gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation, used to scope operations and summaries.
    summaries: List of internal quantities to visualize on TensorBoard. If not
               set, only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.

  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = contrib_framework.get_global_step()
    else:
        contrib_framework.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are run before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            if (isinstance(learning_rate, ops.Tensor)
                    and learning_rate.get_shape().ndims == 0):
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    raise ValueError("Invalid learning_rate %s." %
                                     learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. "
                    "Got %s of type %s" %
                    (str(learning_rate), str(type(learning_rate))))
        if summaries is None:
            summaries = ["loss", "learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))
        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError(
                    "global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)
            if "learning_rate" in summaries:
                summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if learning_rate is not None:
                opt = optimizer(lr)
            else:
                opt = optimizer()
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers."
                )

        if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
            summary.scalar("global_norm/gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Optionally clip gradients by global norm.
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError("Unknown type %s for clip_gradients" %
                             type(clip_gradients))

        # Add scalar summary for loss.
        if "loss" in summaries:
            summary.scalar("loss", loss)

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name, grad_values)
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))

        if clip_gradients is not None and ("global_gradient_norm" in summaries
                                           or "gradient_norm" in summaries):
            summary.scalar("global_norm/clipped_gradient_norm",
                           clip_ops.global_norm(list(zip(*gradients))[0]))

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

        return train_tensor
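A typical call into this contrib-style `optimize_loss` might look like the sketch below; the `loss` tensor is assumed to exist, `tf` is `tensorflow.compat.v1`, and the hyperparameters are illustrative.

# Hypothetical usage sketch; `loss` is assumed to be an existing scalar Tensor.
train_op = optimize_loss(
    loss=loss,
    global_step=tf.train.get_or_create_global_step(),
    learning_rate=0.01,
    optimizer='Adam',
    clip_gradients=5.0,
    summaries=['loss', 'learning_rate', 'gradients', 'gradient_norm'])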
Example #21
def optimize_loss(loss,
                  learning_rate,
                  optimizer,
                  optimizer_params,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  LARC_nu=None,
                  LARC_mode='clip',
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
    """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      clipping is applied to prevent the gradient norm from exceeding this
      value. Alternatively, a callable can be provided, e.g. adaptive_clipping.
      This callable takes a `list` of `(gradients, variables)` `tuple`s and
      returns the same thing with the gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation, used to scope operations and summaries.
    summaries: List of internal quantities to visualize on TensorBoard. If not
               set, only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    LARC_mode: 'scale' or 'clip'.
    LARC_nu: If not None, LARC re-scaling will be applied
             (see https://arxiv.org/pdf/1708.03888.pdf) with nu=LARC_nu.
    automatic_loss_scaling: if not None, use the corresponding automatic
                            loss scaling algorithm. Must be one of 'Backoff'
                            or 'LogMax'. `dtype` must be "mixed" to use ALS.
  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
    loss = ops.convert_to_tensor(loss)
    contrib_framework.assert_scalar(loss)
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()
    else:
        tf.train.assert_global_step(global_step)
    with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
        # Make sure update ops are run before computing loss.
        if update_ops:
            loss = control_flow_ops.with_dependencies(list(update_ops), loss)

        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            if isinstance(learning_rate, ops.Tensor) and \
               learning_rate.get_shape().ndims == 0:
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    raise ValueError("Invalid learning_rate %s." %
                                     learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. "
                    "Got %s of type %s" %
                    (str(learning_rate), str(type(learning_rate))))

        if summaries is None:
            summaries = ["learning_rate", "global_gradient_norm"]
        else:
            for summ in summaries:
                if summ not in OPTIMIZER_SUMMARIES:
                    raise ValueError(
                        "Summaries should be one of [%s], you provided %s." %
                        (", ".join(OPTIMIZER_SUMMARIES), summ))
        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError(
                    "global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)
            if "learning_rate" in summaries:
                summary.scalar("learning_rate", lr)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr,
                                                 **optimizer_params)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr, **optimizer_params)
        elif isinstance(optimizer, optimizer_.Optimizer):
            opt = optimizer
        elif callable(optimizer):
            if learning_rate is not None:
                opt = optimizer(lr, **optimizer_params)
            else:
                opt = optimizer(**optimizer_params)
            if not isinstance(opt, optimizer_.Optimizer):
                raise ValueError(
                    "Unrecognized optimizer: function should return "
                    "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))
        if on_horovod:
            import horovod.tensorflow as hvd
            opt = hvd.DistributedOptimizer(opt)
        # All trainable variables, if specific variables are not specified.
        if variables is None:
            variables = vars_.trainable_variables()

        if automatic_loss_scaling is not None:
            if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
                raise ValueError(
                    "Unknown automatic loss scaling algorithm: %s." %
                    automatic_loss_scaling)
            if dtype != "mixed":
                raise ValueError(
                    "Automatic loss scaling can be used only with "
                    "dtype=mixed.")
            loss_scaler = AutomaticLossScaler(algorithm=automatic_loss_scaling)
        else:
            loss_scaler = None

        if dtype == 'mixed':
            opt = MixedPrecisionOptimizerWrapper(
                opt,
                automatic_loss_scaler=loss_scaler,
            )

        # Compute gradients.
        gradients = opt.compute_gradients(
            loss if loss_scale == 1.0 else loss * loss_scale,
            variables,
            colocate_gradients_with_ops=colocate_gradients_with_ops)

        if loss_scale != 1.0:
            gradients = _multiply_gradients_const(gradients, 1.0 / loss_scale)

        # Optionally add gradient noise.
        if gradient_noise_scale is not None:
            gradients = _add_scaled_noise_to_gradients(gradients,
                                                       gradient_noise_scale)

        # Multiply some gradients.
        if gradient_multipliers is not None:
            gradients = _multiply_gradients(gradients, gradient_multipliers)
            if not gradients:
                raise ValueError(
                    "Empty list of (gradient, var) pairs encountered. This is most "
                    "likely to be caused by an improper value of gradient_multipliers."
                )

        if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
            summary.scalar(
                "global_norm/gradient_norm",
                clip_ops.global_norm(
                    list(
                        map(lambda x: tf.cast(x, tf.float32),
                            list(zip(*gradients))[0]))),
            )

        # Optionally clip gradients by global norm.
        if clip_gradients is not None and LARC_nu is not None:
            raise AttributeError(
                "LARC and gradient norm clipping should not be used together")
        if isinstance(clip_gradients, float):
            gradients = _clip_gradients_by_norm(gradients, clip_gradients)
        elif callable(clip_gradients):
            gradients = clip_gradients(gradients)
        elif clip_gradients is not None:
            raise ValueError("Unknown type %s for clip_gradients" %
                             type(clip_gradients))

        # Add histograms for variables, gradients and gradient norms.
        for gradient, variable in gradients:
            if isinstance(gradient, ops.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, ops.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(":", "_")
                if "gradients" in summaries:
                    summary.histogram("gradients/%s" % var_name,
                                      mask_nans(grad_values))
                if "gradient_norm" in summaries:
                    summary.scalar("gradient_norm/%s" % var_name,
                                   clip_ops.global_norm([grad_values]))
                if "variables" in summaries:
                    summary.histogram("variables/%s" % var_name, var_values)
                if "variable_norm" in summaries:
                    summary.scalar("variable_norm/%s" % var_name,
                                   clip_ops.global_norm([var_values]))

        if clip_gradients is not None and ("global_gradient_norm" in summaries
                                           or "gradient_norm" in summaries):
            summary.scalar(
                "global_norm/clipped_gradient_norm",
                clip_ops.global_norm(
                    list(
                        map(lambda x: tf.cast(x, tf.float32),
                            list(zip(*gradients))[0]))),
            )

        # LARC gradient re-scaling
        if LARC_nu is not None and isinstance(LARC_nu, float):
            for idx, (g, v) in enumerate(gradients):
                var_dtype = v.dtype
                v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
                g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

                zero_const = tf.constant(0.0, dtype=tf.float32)
                # this condition is necessary for two reasons.
                # First, g_norm can be zero. In that case we can not use the usual
                # formula, but it does not matter what larc_local_lr is, since it will
                # be multiplied by g = 0.
                # Second, v_norm can be zero. In that case, we want to switch to the
                # usual gradient descent since the usual LARC update will always be 0.
                # Thus, we make larc_local_lr = 1e-5 so that
                # we move a little bit away from zero and can use LARC again
                larc_local_lr = tf.cond(
                    pred=tf.logical_and(
                        tf.greater(v_norm, zero_const),
                        tf.greater(g_norm, zero_const),
                    ),
                    true_fn=lambda: LARC_nu * v_norm / g_norm,
                    false_fn=lambda: tf.Print(
                        tf.constant(1e-5, dtype=tf.float32), [],
                        "g_norm = 0 or v_norm = 0 for {}".format(v.name)),
                )
                if LARC_mode == 'clip':
                    summary.scalar(
                        'switched_to_global/{}'.format(v.name),
                        tf.cast(tf.greater(larc_local_lr, lr), tf.int32))
                    larc_local_lr = tf.cond(
                        pred=tf.less(larc_local_lr, lr),
                        true_fn=lambda: larc_local_lr / lr,
                        false_fn=lambda: tf.constant(1.0, dtype=tf.float32),
                    )
                larc_local_lr = tf.saturate_cast(larc_local_lr, var_dtype)
                summary.scalar('larc_local_lr/{}'.format(v.name),
                               larc_local_lr)
                summary.scalar('larc_effective_lr/{}'.format(v.name),
                               larc_local_lr * tf.cast(lr, var_dtype))
                gradients[idx] = (larc_local_lr * g, v)

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # Ensure the train_tensor computes grad_updates.
        train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

        return train_tensor
Example #22
def _add_hidden_layer_summary(value, tag):
  summary.scalar('%s/fraction_of_zero_values' % tag, nn.zero_fraction(value))
  summary.histogram('%s/activation' % tag, value)
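This helper is normally invoked right after a hidden layer is constructed, so its activations show up under a per-layer tag. A hypothetical sketch, assuming a TF 1.x dense layer, `tf` as `tensorflow.compat.v1`, and an existing `net` tensor:

# Hypothetical usage sketch.
hidden = tf.layers.dense(net, units=128, activation=tf.nn.relu)
_add_hidden_layer_summary(hidden, tag='hiddenlayer_0')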
Example #23
 def testHistogramSummaryTypes(self):
     for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
                   dtypes.float32, dtypes.float64):
         const = constant_op.constant(10, dtype=dtype)
         summary_lib.histogram('h', const)
Example #24
 def testHistogramSummaryTypes(self):
   with ops.Graph().as_default():
     for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
                   dtypes.float32, dtypes.float64):
       const = constant_op.constant(10, dtype=dtype)
       summary.histogram("h", const)
Example #25
  def build_controller(self):
    """RL optimization interface.

    Returns:
      ops: A dictionary holding handles of the model used for training.
    """

    self._global_step = training_util.get_or_create_global_step()
    ops = {}
    ops["loss"] = 0

    failing_signal = self.compute_reward(self.hparams.failing_signal)

    ctr = {}

    with tf_ops.name_scope("controller_{}".format(self.ctrl_id)):
      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["reward"] = {"value": [], "ph": [], "update": []}
        ctr["ready"] = {"value": [], "ph": [], "update": []}
        ctr["best_reward"] = {"value": [], "update": []}
        for i in range(self.hparams.num_children):
          reward_value = variable_scope.get_local_variable(
              "reward_{}".format(i),
              initializer=0.0,
              dtype=dtypes.float32,
              trainable=False)
          reward_ph = array_ops.placeholder(
              dtypes.float32, shape=(), name="reward_ph_{}".format(i))
          reward_update = state_ops.assign(
              reward_value, reward_ph, use_locking=True)
          ctr["reward"]["value"].append(reward_value)
          ctr["reward"]["ph"].append(reward_ph)
          ctr["reward"]["update"].append(reward_update)
          best_reward = variable_scope.get_local_variable(
              "best_reward_{}".format(i),
              initializer=failing_signal,
              dtype=dtypes.float32,
              trainable=False)
          ctr["best_reward"]["value"].append(best_reward)
          ctr["best_reward"]["update"].append(
              state_ops.assign(best_reward,
                               math_ops.minimum(best_reward, reward_update)))

          ready_value = variable_scope.get_local_variable(
              "ready_{}".format(i),
              initializer=True,
              dtype=dtypes.bool,
              trainable=False)
          ready_ph = array_ops.placeholder(
              dtypes.bool, shape=(), name="ready_ph_{}".format(i))
          ready_update = state_ops.assign(
              ready_value, ready_ph, use_locking=True)
          ctr["ready"]["value"].append(ready_value)
          ctr["ready"]["ph"].append(ready_ph)
          ctr["ready"]["update"].append(ready_update)

      ctr["grouping_y_preds"], ctr["grouping_log_probs"] = self.get_groupings()
      summary.histogram(
          "grouping_actions",
          array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0],
                          [1, array_ops.shape(self.op_embeddings)[0]]))

      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["baseline"] = variable_scope.get_local_variable(
            "baseline",
            initializer=failing_signal
            if self.hparams.start_with_failing_signal else 0.0,
            dtype=dtypes.float32,
            trainable=False)

      new_baseline = self.hparams.bl_dec * ctr["baseline"] + (
          1 - self.hparams.bl_dec) * math_ops.reduce_mean(
              ctr["reward"]["value"])
      if not self.hparams.always_update_baseline:
        baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal)
        selected_reward = array_ops.boolean_mask(ctr["reward"]["value"],
                                                 baseline_mask)
        selected_baseline = control_flow_ops.cond(
            math_ops.reduce_any(baseline_mask),
            lambda: math_ops.reduce_mean(selected_reward),
            lambda: constant_op.constant(0, dtype=dtypes.float32))
        ctr["pos_reward"] = selected_baseline
        pos_ = math_ops.less(
            constant_op.constant(0, dtype=dtypes.float32), selected_baseline)
        selected_baseline = self.hparams.bl_dec * ctr["baseline"] + (
            1 - self.hparams.bl_dec) * selected_baseline
        selected_baseline = control_flow_ops.cond(
            pos_, lambda: selected_baseline, lambda: ctr["baseline"])
        new_baseline = control_flow_ops.cond(
            math_ops.less(self.global_step,
                          self.hparams.stop_updating_after_steps),
            lambda: new_baseline, lambda: selected_baseline)
      ctr["baseline_update"] = state_ops.assign(
          ctr["baseline"], new_baseline, use_locking=True)

      ctr["y_preds"], ctr["log_probs"] = self.get_placements()
      summary.histogram("actions", ctr["y_preds"]["sample"])
      mask = math_ops.less(ctr["reward"]["value"], failing_signal)
      ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"]
      ctr["loss"] *= (
          ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"])

      selected_loss = array_ops.boolean_mask(ctr["loss"], mask)
      selected_loss = control_flow_ops.cond(
          math_ops.reduce_any(mask),
          lambda: math_ops.reduce_mean(-selected_loss),
          lambda: constant_op.constant(0, dtype=dtypes.float32))

      ctr["loss"] = control_flow_ops.cond(
          math_ops.less(self.global_step,
                        self.hparams.stop_updating_after_steps),
          lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss)

      ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"])
      summary.scalar("loss", ctr["loss"])
      summary.scalar("avg_reward", ctr["reward_s"])
      summary.scalar("best_reward_so_far", best_reward)
      summary.scalar(
          "advantage",
          math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"]))

    with variable_scope.variable_scope(
        "optimizer", reuse=variable_scope.AUTO_REUSE):
      (ctr["train_op"], ctr["lr"], ctr["grad_norm"],
       ctr["grad_norms"]) = self._get_train_ops(
           ctr["loss"],
           tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES),
           self.global_step,
           grad_bound=self.hparams.grad_bound,
           lr_init=self.hparams.lr,
           lr_dec=self.hparams.lr_dec,
           start_decay_step=self.hparams.start_decay_step,
           decay_steps=self.hparams.decay_steps,
           optimizer_type=self.hparams.optimizer_type)

    summary.scalar("gradnorm", ctr["grad_norm"])
    summary.scalar("lr", ctr["lr"])
    ctr["summary"] = summary.merge_all()
    ops["controller"] = ctr

    self.ops = ops
    return ops
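The baseline used above is an exponential moving average of the per-child rewards, and the REINFORCE-style loss scales log-probabilities by reward minus baseline. A small numpy sketch of that update with made-up numbers:

import numpy as np

bl_dec = 0.9                          # plays the role of hparams.bl_dec
baseline = 1.5                        # current value of ctr["baseline"]
rewards = np.array([1.2, 1.4, 1.1])   # hypothetical per-child rewards

# new_baseline = bl_dec * baseline + (1 - bl_dec) * mean(rewards)
new_baseline = bl_dec * baseline + (1.0 - bl_dec) * rewards.mean()
# The loss weights log-probs by (reward - baseline), the advantage.
advantage = rewards - baseline
print(new_baseline, advantage)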
Example #26
0
  def set_model(self, model):
    """Sets Keras model and creates summary ops."""

    self.model = model
    self.sess = K.get_session()
    # only make histogram summary op if it hasn't already been made
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if self.write_grads:
          for weight in layer.trainable_weights:
            mapped_weight_name = weight.name.replace(':', '_')
            grads = model.optimizer.get_gradients(model.total_loss, weight)

            def is_indexed_slices(grad):
              return type(grad).__name__ == 'IndexedSlices'

            grads = [grad.values if is_indexed_slices(grad) else grad
                     for grad in grads]
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)

        if hasattr(layer, 'output'):
          if isinstance(layer.output, list):
            for i, output in enumerate(layer.output):
              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
          else:
            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = self._writer_class(self.log_dir, self.sess.graph)
    else:
      self.writer = self._writer_class(self.log_dir)

    # If both embedding_freq and embeddings_data are available, we will
    # visualize embeddings.
    if self.embeddings_freq and self.embeddings_data is not None:
      self.embeddings_data = standardize_input_data(self.embeddings_data,
                                                    model.input_names)

      # If embedding_layer_names are not provided, get all of the embedding
      # layers from the model.
      embeddings_layer_names = self.embeddings_layer_names
      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name
            for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      self.assign_embeddings = []
      embeddings_vars = {}

      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
      self.step = step = array_ops.placeholder(dtypes.int32)

      for layer in self.model.layers:
        if layer.name in embeddings_layer_names:
          embedding_input = self.model.get_layer(layer.name).output
          embedding_size = np.prod(embedding_input.shape[1:])
          embedding_input = array_ops.reshape(embedding_input,
                                              (step, int(embedding_size)))
          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
          embedding = variables.Variable(
              array_ops.zeros(shape), name=layer.name + '_embedding')
          embeddings_vars[layer.name] = embedding
          batch = state_ops.assign(embedding[batch_id:batch_id + step],
                                   embedding_input)
          self.assign_embeddings.append(batch)

      self.saver = saver.Saver(list(embeddings_vars.values()))

      # Create embeddings_metadata dictionary
      if isinstance(self.embeddings_metadata, str):
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings_vars.keys()
        }
      else:
        # If embedding_metadata is already a dictionary
        embeddings_metadata = self.embeddings_metadata

      try:
        from tensorboard.plugins import projector
      except ImportError:
        raise ImportError('Failed to import TensorBoard. Please make sure that '
                          'TensorBoard integration is complete.')

      # TODO(psv): Add integration tests to test embedding visualization
      # with TensorBoard callback. We are unable to write a unit test for this
      # because TensorBoard dependency assumes TensorFlow package is installed.
      config = projector.ProjectorConfig()
      for layer_name, tensor in embeddings_vars.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if (embeddings_metadata is not None and
            layer_name in embeddings_metadata):
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
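For context, `set_model` above is invoked by the Keras `TensorBoard` callback; a hedged configuration sketch showing which constructor arguments feed the fields used above (the values and log directory are illustrative):

from tensorflow import keras

# Illustrative setup; histogram_freq, write_images and embeddings_freq drive
# the histogram, image and projector branches in set_model above.
tb_callback = keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=1,
    write_graph=True,
    write_images=False,
    embeddings_freq=1)
# model.fit(x, y, epochs=5, callbacks=[tb_callback])  # hypothetical model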
Example #27
0
 def build_controller(self):
    """RL optimization interface.

    Returns:
      ops: A dictionary holding handles of the model used for training.
    """

    self._global_step = training_util.get_or_create_global_step()
    ops = {}
    ops["loss"] = 0

    failing_signal = self.compute_reward(self.hparams.failing_signal)

    ctr = {}

    with tf_ops.name_scope("controller_{}".format(self.ctrl_id)):
      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["reward"] = {"value": [], "ph": [], "update": []}
        ctr["ready"] = {"value": [], "ph": [], "update": []}
        ctr["best_reward"] = {"value": [], "update": []}
        for i in range(self.hparams.num_children):
          reward_value = variable_scope.get_local_variable(
              "reward_{}".format(i),
              initializer=0.0,
              dtype=dtypes.float32,
              trainable=False)
          reward_ph = array_ops.placeholder(
              dtypes.float32, shape=(), name="reward_ph_{}".format(i))
          reward_update = state_ops.assign(
              reward_value, reward_ph, use_locking=True)
          ctr["reward"]["value"].append(reward_value)
          ctr["reward"]["ph"].append(reward_ph)
          ctr["reward"]["update"].append(reward_update)
          best_reward = variable_scope.get_local_variable(
              "best_reward_{}".format(i),
              initializer=failing_signal,
              dtype=dtypes.float32,
              trainable=False)
          ctr["best_reward"]["value"].append(best_reward)
          ctr["best_reward"]["update"].append(
              state_ops.assign(best_reward,
                               math_ops.minimum(best_reward, reward_update)))

          ready_value = variable_scope.get_local_variable(
              "ready_{}".format(i),
              initializer=True,
              dtype=dtypes.bool,
              trainable=False)
          ready_ph = array_ops.placeholder(
              dtypes.bool, shape=(), name="ready_ph_{}".format(i))
          ready_update = state_ops.assign(
              ready_value, ready_ph, use_locking=True)
          ctr["ready"]["value"].append(ready_value)
          ctr["ready"]["ph"].append(ready_ph)
          ctr["ready"]["update"].append(ready_update)

      ctr["grouping_y_preds"], ctr["grouping_log_probs"] = self.get_groupings()
      summary.histogram(
          "grouping_actions",
          array_ops.slice(ctr["grouping_y_preds"]["sample"], [0, 0],
                          [1, array_ops.shape(self.op_embeddings)[0]]))

      with variable_scope.variable_scope("controller_{}".format(self.ctrl_id)):
        ctr["baseline"] = variable_scope.get_local_variable(
            "baseline",
            initializer=failing_signal
            if self.hparams.start_with_failing_signal else 0.0,
            dtype=dtypes.float32,
            trainable=False)

      new_baseline = self.hparams.bl_dec * ctr["baseline"] + (
          1 - self.hparams.bl_dec) * math_ops.reduce_mean(
              ctr["reward"]["value"])
      if not self.hparams.always_update_baseline:
        baseline_mask = math_ops.less(ctr["reward"]["value"], failing_signal)
        selected_reward = array_ops.boolean_mask(ctr["reward"]["value"],
                                                 baseline_mask)
        selected_baseline = control_flow_ops.cond(
            math_ops.reduce_any(baseline_mask),
            lambda: math_ops.reduce_mean(selected_reward),
            lambda: constant_op.constant(0, dtype=dtypes.float32))
        ctr["pos_reward"] = selected_baseline
        pos_ = math_ops.less(
            constant_op.constant(0, dtype=dtypes.float32), selected_baseline)
        selected_baseline = self.hparams.bl_dec * ctr["baseline"] + (
            1 - self.hparams.bl_dec) * selected_baseline
        selected_baseline = control_flow_ops.cond(
            pos_, lambda: selected_baseline, lambda: ctr["baseline"])
        new_baseline = control_flow_ops.cond(
            math_ops.less(self.global_step,
                          self.hparams.stop_updating_after_steps),
            lambda: new_baseline, lambda: selected_baseline)
      ctr["baseline_update"] = state_ops.assign(
          ctr["baseline"], new_baseline, use_locking=True)

      ctr["y_preds"], ctr["log_probs"] = self.get_placements()
      summary.histogram("actions", ctr["y_preds"]["sample"])
      mask = math_ops.less(ctr["reward"]["value"], failing_signal)
      ctr["loss"] = ctr["reward"]["value"] - ctr["baseline"]
      ctr["loss"] *= (
          ctr["log_probs"]["sample"] + ctr["grouping_log_probs"]["sample"])

      selected_loss = array_ops.boolean_mask(ctr["loss"], mask)
      selected_loss = control_flow_ops.cond(
          math_ops.reduce_any(mask),
          lambda: math_ops.reduce_mean(-selected_loss),
          lambda: constant_op.constant(0, dtype=dtypes.float32))

      ctr["loss"] = control_flow_ops.cond(
          math_ops.less(self.global_step,
                        self.hparams.stop_updating_after_steps),
          lambda: math_ops.reduce_mean(-ctr["loss"]), lambda: selected_loss)

      ctr["reward_s"] = math_ops.reduce_mean(ctr["reward"]["value"])
      summary.scalar("loss", ctr["loss"])
      summary.scalar("avg_reward", ctr["reward_s"])
      summary.scalar("best_reward_so_far", best_reward)
      summary.scalar(
          "advantage",
          math_ops.reduce_mean(ctr["reward"]["value"] - ctr["baseline"]))

    with variable_scope.variable_scope(
        "optimizer", reuse=variable_scope.AUTO_REUSE):
      (ctr["train_op"], ctr["lr"], ctr["grad_norm"],
       ctr["grad_norms"]) = self._get_train_ops(
           ctr["loss"],
           tf_ops.get_collection(tf_ops.GraphKeys.TRAINABLE_VARIABLES),
           self.global_step,
           grad_bound=self.hparams.grad_bound,
           lr_init=self.hparams.lr,
           lr_dec=self.hparams.lr_dec,
           start_decay_step=self.hparams.start_decay_step,
           decay_steps=self.hparams.decay_steps,
           optimizer_type=self.hparams.optimizer_type)

    summary.scalar("gradnorm", ctr["grad_norm"])
    summary.scalar("lr", ctr["lr"])
    ctr["summary"] = summary.merge_all()
    ops["controller"] = ctr

    self.ops = ops
    return ops
Example #28
0
 def _add_hidden_layer_summary(self, value, tag):
   # TODO(zakaria): Move this code to tf.learn and add test.
   summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
   summary.histogram("%s/activation" % tag, value)
Example #29
0
 def _add_hidden_layer_summary(self, value, tag):
     summary.scalar('%s/fraction_of_zero_values' % tag,
                    tf.nn.zero_fraction(value))
     summary.histogram('%s/activation' % tag, value)
Example #30
0
def _add_hidden_layer_summary(value, tag):
  summary.scalar("%s_fraction_of_zero_values" % tag, nn.zero_fraction(value))
  summary.histogram("%s_activation" % tag, value)
Example #31
0
 def testHistogramSummaryTypes(self):
   for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
                 dtypes.float32, dtypes.float64):
     const = constant_op.constant(10, dtype=dtype)
     summary_lib.histogram('h', const)
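A short end-to-end sketch of what these tests exercise, assuming TF 1.x graph mode: build a histogram summary, merge, and write it to an event file (the log directory is hypothetical):

import tensorflow as tf

with tf.Graph().as_default():
  values = tf.random_normal([1000])        # stand-in data for the histogram
  tf.summary.histogram('h', values)
  merged = tf.summary.merge_all()
  with tf.Session() as sess:
    writer = tf.summary.FileWriter('/tmp/hist_demo', sess.graph)
    writer.add_summary(sess.run(merged), global_step=0)
    writer.close()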
Example #32
0
 def _add_hidden_layer_summary(self, value, tag):
     # TODO(zakaria): Move this code to tf.learn and add test.
     summary.scalar("%s/fraction_of_zero_values" % tag,
                    nn.zero_fraction(value))
     summary.histogram("%s/activation" % tag, value)
Example #33
0
def optimize_loss(loss,
                  optimizer,
                  optimizer_params,
                  learning_rate_decay_fn,
                  global_step=None,
                  dtype=tf.float32,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  larc_params=None,
                  loss_scale=1.0,
                  automatic_loss_scaling=None,
                  on_horovod=False):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers include:

  - by string specifying the name of the optimizer. See OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - by function taking learning rate `Tensor` as argument and returning an
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - by a subclass of `Optimizer` having a single-argument constructor
      (the argument is the learning rate), such as AdamOptimizer or
      AdagradOptimizer. E.g. `optimize_loss(...,
      optimizer=tf.train.AdagradOptimizer)`.
  - by an instance of a subclass of `Optimizer`.
      E.g., `optimize_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it has
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      clipping is applied to prevent the norm of the gradients from exceeding
      this value. Alternatively, a callable can be provided, e.g.
      adaptive_clipping. This callable takes a `list` of
      `(gradients, variables)` `tuple`s and returns the same thing with the
      gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.
    larc_params: dict or None. If not None, LARC gradient re-scaling
                 (https://arxiv.org/pdf/1708.03888.pdf) is applied. Expected
                 keys: 'larc_eta' (required, float), and optionally
                 'larc_mode' ('clip' or 'scale'), 'min_update', 'epsilon'.
    automatic_loss_scaling: if not None, use the corresponding automatic
                            loss scaling algorithm. Must be one of 'Backoff'
                            or 'LogMax'. `dtype` must be "mixed" to use ALS.
  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` has the wrong type.
        * `clip_gradients` is neither float nor callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty.
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = tf.train.get_or_create_global_step()
  else:
    tf.train.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    if summaries is None:
      summaries = ["learning_rate", "global_gradient_norm"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    if global_step is None:
      raise ValueError("global_step is required for learning_rate_decay_fn.")
    lr = learning_rate_decay_fn(global_step)

    if "learning_rate" in summaries:
      summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr, **optimizer_params)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr, **optimizer_params)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if lr is not None:
        opt = optimizer(lr, **optimizer_params)
      else:
        opt = optimizer(**optimizer_params)
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))
    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    if automatic_loss_scaling is not None:
      if automatic_loss_scaling not in AutomaticLossScaler.SUPPORTED_ALGOS:
        raise ValueError("Unknown automatic loss scaling algorithm: %s."
                         % automatic_loss_sclaing)
      if dtype != "mixed":
        raise ValueError("Automatic loss scaling can be used only with "
                         "dtype=mixed.")
      loss_scale = AutomaticLossScaler(algorithm=automatic_loss_scaling)

    if dtype == 'mixed':
      opt = MixedPrecisionOptimizerWrapper(opt, loss_scale=loss_scale)
    if on_horovod:
      opt = DistributedOptimizer(opt)

    # Compute gradients.
    gradients = opt.compute_gradients(
      loss, variables,
      colocate_gradients_with_ops=colocate_gradients_with_ops,
    )

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
      summary.scalar(
        "global_norm/gradient_norm",
        clip_ops.global_norm(list(map(
          lambda x: tf.cast(x, tf.float32),
          list(zip(*gradients))[0])
        )),
      )

    # Optionally clip gradients by global norm.
    if clip_gradients is not None and larc_params is not None:
      raise AttributeError(
        "LARC and gradient norm clipping should not be used together"
      )
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if isinstance(variable, ops.IndexedSlices):
        var_values = variable.values
      else:
        var_values = variable

      if grad_values is not None:
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, mask_nans(grad_values))
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))
        if "variables" in summaries:
          summary.histogram("variables/%s" % var_name, var_values)
        if "variable_norm" in summaries:
          summary.scalar("variable_norm/%s" % var_name,
                         clip_ops.global_norm([var_values]))

    if clip_gradients is not None and ("global_gradient_norm" in summaries or
                                       "gradient_norm" in summaries):
      summary.scalar(
        "global_norm/clipped_gradient_norm",
        clip_ops.global_norm(list(map(
          lambda x: tf.cast(x, tf.float32),
          list(zip(*gradients))[0])
        )),
      )

    # LARC gradient re-scaling
    if larc_params is not None:
      check_params(
        config=larc_params,
        required_dict={'larc_eta': float},
        optional_dict={
          'larc_mode': ['clip', 'scale'],
          'min_update': float,
          'epsilon': float
        },
      )
      larc_eta = larc_params['larc_eta']
      larc_mode = larc_params.get('larc_mode', 'clip')
      min_update = larc_params.get('min_update', 1e-7)
      eps = larc_params.get('epsilon', 1e-7)

      for idx, (g, v) in enumerate(gradients):
        var_dtype = v.dtype
        v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
        g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

        if larc_mode == 'clip':
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (lr * (g_norm + eps)),
            min_update,
          )
          if "larc_summaries" in summaries:
            summary.scalar('larc_clip_on/{}'.format(v.name),
                           tf.cast(tf.less(larc_grad_update, 1.0), tf.int32))
          larc_grad_update = tf.minimum(larc_grad_update, 1.0)
        else:
          larc_grad_update = tf.maximum(
            larc_eta * v_norm / (g_norm + eps),
            min_update,
          )
        larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
        gradients[idx] = (larc_grad_update * g, v)

        # adding additional summary
        if "larc_summaries" in summaries:
          summary.scalar('larc_grad_update/{}'.format(v.name), larc_grad_update)
          summary.scalar("larc_final_lr/{}".format(v.name),
                         tf.cast(lr, var_dtype) * larc_grad_update)

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
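A numpy sketch of the LARC re-scaling applied above, covering both 'clip' and 'scale' modes; the numbers fed to it are illustrative and the defaults mirror min_update/epsilon in the code:

import numpy as np

def larc_scale(v, g, lr, larc_eta, larc_mode='clip',
               min_update=1e-7, eps=1e-7):
  # Illustrative re-implementation of the per-variable LARC factor above.
  v_norm = np.linalg.norm(v)
  g_norm = np.linalg.norm(g)
  if larc_mode == 'clip':
    # Ratio of the LARC-local LR to the global LR, clipped at 1.
    scale = max(larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
    scale = min(scale, 1.0)
  else:  # 'scale' mode ignores the global LR
    scale = max(larc_eta * v_norm / (g_norm + eps), min_update)
  return scale  # the gradient actually applied is scale * g

# Hypothetical variable/gradient pair with a large gradient norm.
print(larc_scale(np.array([2.0, 0.0]), np.array([30.0, 40.0]),
                 lr=0.1, larc_eta=0.002))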
Example #34
0
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:
        for weight in layer.weights:
          mapped_weight_name = weight.name.replace(':', '_')
          tf_summary.histogram(mapped_weight_name, weight)
          if self.write_grads:
            grads = model.optimizer.get_gradients(model.total_loss, weight)
            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = K.int_shape(w_img)
            if len(shape) == 2:  # dense layer kernel case
              if shape[0] > shape[1]:
                w_img = array_ops.transpose(w_img)
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
            elif len(shape) == 3:  # convnet case
              if K.image_data_format() == 'channels_last':
                # switch to channels_first to display
                # every kernel as a separate image
                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
                shape = K.int_shape(w_img)
              w_img = array_ops.reshape(w_img,
                                        [shape[0], shape[1], shape[2], 1])
            elif len(shape) == 1:  # bias case
              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
            else:
              # not possible to handle 3D convnets etc.
              continue

            shape = K.int_shape(w_img)
            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
            tf_summary.image(mapped_weight_name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)

    if self.embeddings_freq:
      embeddings_layer_names = self.embeddings_layer_names

      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      embeddings = {
          layer.name: layer.weights[0]
          for layer in self.model.layers if layer.name in embeddings_layer_names
      }

      self.saver = saver_lib.Saver(list(embeddings.values()))

      embeddings_metadata = {}

      if not isinstance(self.embeddings_metadata, str):
        embeddings_metadata = self.embeddings_metadata
      else:
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings.keys()
        }

      config = projector.ProjectorConfig()
      self.embeddings_ckpt_path = os.path.join(self.log_dir,
                                               'keras_embedding.ckpt')

      for layer_name, tensor in embeddings.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        if layer_name in embeddings_metadata:
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
Example #35
0
def optimize_loss(loss,
                  global_step,
                  learning_rate,
                  optimizer,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=None,
                  colocate_gradients_with_ops=False,
                  increment_global_step=True):
  """Given loss and parameters for optimizer, returns a training op.

  Various ways of passing optimizers, include:

  - string, name of the optimizer like 'SGD', 'Adam', see OPTIMIZER_CLS_NAMES
      for full list. E.g. `optimize_loss(..., optimizer='Adam')`.
  - function, takes learning rate `Tensor` as argument and must return
      `Optimizer` instance. E.g. `optimize_loss(...,
      optimizer=lambda lr: tf.train.MomentumOptimizer(lr, momentum=0.5))`.
    Alternatively, if `learning_rate` is `None`, the function takes no
    arguments. E.g. `optimize_loss(..., learning_rate=None,
      optimizer=lambda: tf.train.MomentumOptimizer(0.5, momentum=0.5))`.
  - class, subclass of `Optimizer` that takes only one required argument -
      learning rate, such as AdamOptimizer, AdagradOptimizer.
      E.g. `optimize_loss(..., optimizer=tf.train.AdagradOptimizer)`.
  - object, instance of subclass of `Optimizer`.
      E.g., `optimizer_loss(..., optimizer=tf.train.AdagradOptimizer(0.5))`.

  Args:
    loss: Scalar `Tensor`.
    global_step: Scalar int `Tensor`, step counter to update on each step
                 unless `increment_global_step` is `False`. If not supplied,
                 it will be fetched from the default graph (see
                 `tf.train.get_global_step` for details). If it's
                 not been created, no step will be incremented with each weight
                 update. `learning_rate_decay_fn` requires `global_step`.
    learning_rate: float or `Tensor`, magnitude of update per each training
                   step. Can be `None`.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of `tf.Optimizer` that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of `tf.Optimizer`
                 sub-class and have `compute_gradients` and `apply_gradients`
                 functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float, callable or `None`. If a float is provided, global
      clipping is applied to prevent the norm of the gradients from exceeding
      this value. Alternatively, a callable can be provided, e.g.
      adaptive_clipping. This callable takes a `list` of
      `(gradients, variables)` `tuple`s and returns the same thing with the
      gradients modified.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: `tf.train.exponential_decay`.
                            Ignored if `learning_rate` is not supplied.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection. The order of execution
                between `update_ops` and `loss` is non-deterministic.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
    colocate_gradients_with_ops: If True, try colocating gradients with the
                                 corresponding op.
    increment_global_step: Whether to increment `global_step`. If your model
      calls `optimize_loss` multiple times per training step (e.g. to optimize
      different parts of the model), use this arg to avoid incrementing
      `global_step` more times than necessary.

  Returns:
    Training op.

  Raises:
    ValueError: if:
        * `loss` is an invalid type or shape.
        * `global_step` is an invalid type or shape.
        * `learning_rate` is an invalid type or value.
        * `optimizer` is wrong type.
        * `clip_gradients` is not float or callable.
        * `learning_rate` and `learning_rate_decay_fn` are supplied, but no
          `global_step` is available.
        * `gradients` is empty
  """
  loss = ops.convert_to_tensor(loss)
  contrib_framework.assert_scalar(loss)
  if global_step is None:
    global_step = contrib_framework.get_global_step()
  else:
    contrib_framework.assert_global_step(global_step)
  with vs.variable_scope(name, "OptimizeLoss", [loss, global_step]):
    # Update ops take UPDATE_OPS collection if not provided.
    if update_ops is None:
      update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
    # Make sure update ops are ran before computing loss.
    if update_ops:
      loss = control_flow_ops.with_dependencies(list(update_ops), loss)

    # Learning rate variable, with possible decay.
    lr = None
    if learning_rate is not None:
      if (isinstance(learning_rate, ops.Tensor) and
          learning_rate.get_shape().ndims == 0):
        lr = learning_rate
      elif isinstance(learning_rate, float):
        if learning_rate < 0.0:
          raise ValueError("Invalid learning_rate %s.", learning_rate)
        lr = vs.get_variable(
            "learning_rate", [],
            trainable=False,
            initializer=init_ops.constant_initializer(learning_rate))
      else:
        raise ValueError("Learning rate should be 0d Tensor or float. "
                         "Got %s of type %s" % (str(learning_rate),
                                                str(type(learning_rate))))
    if summaries is None:
      summaries = ["loss", "learning_rate"]
    else:
      for summ in summaries:
        if summ not in OPTIMIZER_SUMMARIES:
          raise ValueError("Summaries should be one of [%s], you provided %s." %
                           (", ".join(OPTIMIZER_SUMMARIES), summ))
    if learning_rate is not None and learning_rate_decay_fn is not None:
      if global_step is None:
        raise ValueError("global_step is required for learning_rate_decay_fn.")
      lr = learning_rate_decay_fn(lr, global_step)
      if "learning_rate" in summaries:
        summary.scalar("learning_rate", lr)

    # Create optimizer, given specified parameters.
    if isinstance(optimizer, six.string_types):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is string (%s)." % optimizer)
      if optimizer not in OPTIMIZER_CLS_NAMES:
        raise ValueError(
            "Optimizer name should be one of [%s], you provided %s." %
            (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
      opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
    elif (isinstance(optimizer, type) and
          issubclass(optimizer, optimizer_.Optimizer)):
      if lr is None:
        raise ValueError("Learning rate is None, but should be specified if "
                         "optimizer is class (%s)." % optimizer)
      opt = optimizer(learning_rate=lr)
    elif isinstance(optimizer, optimizer_.Optimizer):
      opt = optimizer
    elif callable(optimizer):
      if learning_rate is not None:
        opt = optimizer(lr)
      else:
        opt = optimizer()
      if not isinstance(opt, optimizer_.Optimizer):
        raise ValueError("Unrecognized optimizer: function should return "
                         "subclass of Optimizer. Got %s." % str(opt))
    else:
      raise ValueError("Unrecognized optimizer: should be string, "
                       "subclass of Optimizer, instance of "
                       "subclass of Optimizer or function with one argument. "
                       "Got %s." % str(optimizer))

    # All trainable variables, if specific variables are not specified.
    if variables is None:
      variables = vars_.trainable_variables()

    # Compute gradients.
    gradients = opt.compute_gradients(
        loss,
        variables,
        colocate_gradients_with_ops=colocate_gradients_with_ops)

    # Optionally add gradient noise.
    if gradient_noise_scale is not None:
      gradients = _add_scaled_noise_to_gradients(gradients,
                                                 gradient_noise_scale)

    # Multiply some gradients.
    if gradient_multipliers is not None:
      gradients = _multiply_gradients(gradients, gradient_multipliers)
      if not gradients:
        raise ValueError(
            "Empty list of (gradient, var) pairs encountered. This is most "
            "likely to be caused by an improper value of gradient_multipliers.")

    if "gradient_norm" in summaries:
      summary.scalar("global_norm/gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Optionally clip gradients by global norm.
    if isinstance(clip_gradients, float):
      gradients = _clip_gradients_by_norm(gradients, clip_gradients)
    elif callable(clip_gradients):
      gradients = clip_gradients(gradients)
    elif clip_gradients is not None:
      raise ValueError(
          "Unknown type %s for clip_gradients" % type(clip_gradients))

    # Add scalar summary for loss.
    if "loss" in summaries:
      summary.scalar("loss", loss)

    # Add histograms for variables, gradients and gradient norms.
    for gradient, variable in gradients:
      if isinstance(gradient, ops.IndexedSlices):
        grad_values = gradient.values
      else:
        grad_values = gradient

      if grad_values is not None:
        var_name = variable.name.replace(":", "_")
        if "gradients" in summaries:
          summary.histogram("gradients/%s" % var_name, grad_values)
        if "gradient_norm" in summaries:
          summary.scalar("gradient_norm/%s" % var_name,
                         clip_ops.global_norm([grad_values]))

    if clip_gradients is not None and "gradient_norm" in summaries:
      summary.scalar("global_norm/clipped_gradient_norm",
                     clip_ops.global_norm(list(zip(*gradients))[0]))

    # Create gradient updates.
    grad_updates = opt.apply_gradients(
        gradients,
        global_step=global_step if increment_global_step else None,
        name="train")

    # Ensure the train_tensor computes grad_updates.
    train_tensor = control_flow_ops.with_dependencies([grad_updates], loss)

    return train_tensor
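A hedged usage sketch of this optimize_loss variant, passing the optimizer by name with global-norm gradient clipping and a decay function; `loss` is assumed to be a scalar Tensor produced elsewhere, and the hyperparameter values are illustrative:

import tensorflow as tf

# `loss` is a scalar Tensor defined by the surrounding model (assumption).
global_step = tf.train.get_or_create_global_step()
train_op = optimize_loss(
    loss,
    global_step=global_step,
    learning_rate=0.01,
    optimizer='Adam',                 # looked up in OPTIMIZER_CLS_NAMES
    clip_gradients=5.0,               # global-norm clipping
    learning_rate_decay_fn=lambda lr, step: tf.train.exponential_decay(
        lr, step, decay_steps=1000, decay_rate=0.96),
    summaries=['loss', 'learning_rate', 'gradient_norm'])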
Example #36
0
    def set_model(self, model):
        """Sets Keras model and creates summary ops."""

        self.model = model
        self.sess = K.get_session()
        # only make histogram summary op if it hasn't already been made
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf_summary.histogram(mapped_weight_name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = array_ops.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = array_ops.transpose(w_img,
                                                            perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = array_ops.reshape(w_img,
                                                      [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf_summary.image(mapped_weight_name, w_img)

                if self.write_grads:
                    for weight in layer.trainable_weights:
                        mapped_weight_name = weight.name.replace(':', '_')
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
                        tf_summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)

                if hasattr(layer, 'output'):
                    if isinstance(layer.output, list):
                        for i, output in enumerate(layer.output):
                            tf_summary.histogram(
                                '{}_out_{}'.format(layer.name, i), output)
                    else:
                        tf_summary.histogram('{}_out'.format(layer.name),
                                             layer.output)
        self.merged = tf_summary.merge_all()

        if self.write_graph:
            self.writer = self._writer_class(self.log_dir, self.sess.graph)
        else:
            self.writer = self._writer_class(self.log_dir)

        # If both embedding_freq and embeddings_data are available, we will
        # visualize embeddings.
        if self.embeddings_freq and self.embeddings_data is not None:
            self.embeddings_data = standardize_input_data(
                self.embeddings_data, model.input_names)

            # If embedding_layer_names are not provided, get all of the embedding
            # layers from the model.
            embeddings_layer_names = self.embeddings_layer_names
            if not embeddings_layer_names:
                embeddings_layer_names = [
                    layer.name for layer in self.model.layers
                    if type(layer).__name__ == 'Embedding'
                ]

            self.assign_embeddings = []
            embeddings_vars = {}

            self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
            self.step = step = array_ops.placeholder(dtypes.int32)

            for layer in self.model.layers:
                if layer.name in embeddings_layer_names:
                    embedding_input = self.model.get_layer(layer.name).output
                    embedding_size = np.prod(embedding_input.shape[1:])
                    embedding_input = array_ops.reshape(
                        embedding_input, (step, int(embedding_size)))
                    shape = (self.embeddings_data[0].shape[0],
                             int(embedding_size))
                    embedding = variables.Variable(array_ops.zeros(shape),
                                                   name=layer.name +
                                                   '_embedding')
                    embeddings_vars[layer.name] = embedding
                    batch = state_ops.assign(
                        embedding[batch_id:batch_id + step], embedding_input)
                    self.assign_embeddings.append(batch)

            self.saver = saver.Saver(list(embeddings_vars.values()))

            # Create embeddings_metadata dictionary
            if isinstance(self.embeddings_metadata, str):
                embeddings_metadata = {
                    layer_name: self.embeddings_metadata
                    for layer_name in embeddings_vars.keys()
                }
            else:
                # If embedding_metadata is already a dictionary
                embeddings_metadata = self.embeddings_metadata

            try:
                from tensorboard.plugins import projector
            except ImportError:
                raise ImportError(
                    'Failed to import TensorBoard. Please make sure that '
                    'TensorBoard integration is complete.')

            # TODO(psv): Add integration tests to test embedding visualization
            # with TensorBoard callback. We are unable to write a unit test for this
            # because TensorBoard dependency assumes TensorFlow package is installed.
            config = projector.ProjectorConfig()
            for layer_name, tensor in embeddings_vars.items():
                embedding = config.embeddings.add()
                embedding.tensor_name = tensor.name

                if (embeddings_metadata is not None
                        and layer_name in embeddings_metadata):
                    embedding.metadata_path = embeddings_metadata[layer_name]

            projector.visualize_embeddings(self.writer, config)
Example #37
0
def optimize_loss(losses,
                  global_step,
                  learning_rate,
                  optimizer,
                  num_gpus=1,
                  gradient_noise_scale=None,
                  gradient_multipliers=None,
                  clip_gradients=None,
                  learning_rate_decay_fn=None,
                  update_ops=None,
                  variables=None,
                  name=None,
                  summaries=["global_gradient_norm"],
                  colocate_gradients_with_ops=False,
                  increment_global_step=True,
                  use_tpu=False,
                  use_horovod=False):
    """Given loss and parameters for optimizer, returns a training op.
  Args:
    losses: list of 0-dimensional loss `Tensor`s.
    global_step: Tensor, step counter for each update.
    learning_rate: float or Tensor, magnitude of update per each training step.
    optimizer: string, class or optimizer instance, used as trainer.
               string should be name of optimizer, like 'SGD',
                 'Adam', 'Adagrad'. Full list in OPTIMIZER_CLS_NAMES constant.
               class should be sub-class of tf.Optimizer that implements
                 `compute_gradients` and `apply_gradients` functions.
               optimizer instance should be instantiation of tf.Optimizer sub-class
                 and have `compute_gradients` and `apply_gradients` functions.
    gradient_noise_scale: float or None, adds 0-mean normal noise scaled by this
                          value.
    gradient_multipliers: dict of variables or variable names to floats.
                          If present, gradients for specified
                          variables will be multiplied by given constant.
    clip_gradients: float or `None`, clips gradients by this value.
    moving_average_decay: Deprecated. float or None, takes into account previous
                          loss to make learning smoother due to outliers.
    learning_rate_decay_fn: function, takes `learning_rate` and `global_step`
                            `Tensor`s, returns `Tensor`.
                            Can be used to implement any learning rate decay
                            functions.
                            For example: tf.train.exponential_decay.
    update_ops: list of update `Operation`s to execute at each step. If `None`,
                uses elements of UPDATE_OPS collection.
    variables: list of variables to optimize or
               `None` to use all trainable variables.
    name: The name for this operation is used to scope operations and summaries.
    summaries: List of internal quantities to visualize on tensorboard. If not
               set only the loss and the learning rate will be reported. The
               complete list is in OPTIMIZER_SUMMARIES.
  Returns:
    Training op.
  Raises:
    ValueError: if optimizer is wrong type.
  """
    with vs.variable_scope(name, "OptimizeLoss", losses + [global_step]):
        # Update ops take UPDATE_OPS collection if not provided.
        if update_ops is None:
            update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))

        #--from https://github.com/tensorflow/tensorflow/blob/28c3c5dd38e3b397c2cf0acdaa6388dcbf0349f7/tensorflow/contrib/layers/python/layers/optimizers.py
        # Learning rate variable, with possible decay.
        lr = None
        if learning_rate is not None:
            if ((isinstance(learning_rate, ops.Tensor)
                 or isinstance(learning_rate, tf.Variable))
                    and learning_rate.get_shape().ndims == 0):
                #print('------------------optimize_loss learning rate do nothhing', learning_rate)
                lr = learning_rate
            elif isinstance(learning_rate, float):
                if learning_rate < 0.0:
                    raise ValueError("Invalid learning_rate %s.",
                                     learning_rate)
                lr = vs.get_variable(
                    "learning_rate", [],
                    trainable=False,
                    initializer=init_ops.constant_initializer(learning_rate))
            else:
                raise ValueError(
                    "Learning rate should be 0d Tensor or float. "
                    "Got %s of type %s" %
                    (str(learning_rate), str(type(learning_rate))))

        if learning_rate is not None and learning_rate_decay_fn is not None:
            if global_step is None:
                raise ValueError(
                    "global_step is required for learning_rate_decay_fn.")
            lr = learning_rate_decay_fn(lr, global_step)

        # Create optimizer, given specified parameters.
        if isinstance(optimizer, six.string_types):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is string (%s)." % optimizer)
            if optimizer not in OPTIMIZER_CLS_NAMES:
                raise ValueError(
                    "Optimizer name should be one of [%s], you provided %s." %
                    (", ".join(OPTIMIZER_CLS_NAMES), optimizer))
            opt = OPTIMIZER_CLS_NAMES[optimizer](learning_rate=lr)
        elif (isinstance(optimizer, type)
              and issubclass(optimizer, optimizer_.Optimizer)):
            if lr is None:
                raise ValueError(
                    "Learning rate is None, but should be specified if "
                    "optimizer is class (%s)." % optimizer)
            opt = optimizer(learning_rate=lr)
        elif isinstance(optimizer, optimizer_.Optimizer):
            #print('------------------optimize_loss optimizer do nothing', optimizer)
            opt = optimizer
        elif callable(optimizer):
            if learning_rate is not None:
                opt = optimizer(lr)
            else:
                opt = optimizer()
            if not isinstance(opt, optimizer_.Optimizer):
                pass
                # TODO all tf.keras.optimizers
                #raise ValueError("Unrecognized optimizer: function should return "
                #                 "subclass of Optimizer. Got %s." % str(opt))
        else:
            raise ValueError(
                "Unrecognized optimizer: should be string, "
                "subclass of Optimizer, instance of "
                "subclass of Optimizer or function with one argument. "
                "Got %s." % str(optimizer))

        if use_tpu:
            opt = tf.contrib.tpu.CrossShardOptimizer(opt)
        assert not (use_tpu and use_horovod)
        if use_horovod:
            if FLAGS.torch:
                import horovod.torch as hvd
            else:
                import horovod.tensorflow as hvd
            #https://blog.csdn.net/qq_16234613/article/details/96186398
            # we enable compression only for fp16
            #import horovod.tensorflow as hvd
            #from horovod.tensorflow.compression import Compression
            # if use_fp16:
            #     compression = Compression.fp16
            # else:
            #     compression = Compression.none
            # opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True,
            #                                compression=compression)
            opt = hvd.DistributedOptimizer(opt)

            ## Just calling opt.minimize (0.1 epoch, 64 * 8 GPUs) gives AUC 0.717;
            ## keeping it commented and using the explicit gradients path below gives AUC 0.7186.
            #return opt.minimize(losses[0], global_step=global_step if increment_global_step else None)

        if num_gpus > 1:
            # Calculate the gradients for each model tower.
            # TODO: verify the code below is correct; right now a single GPU going through
            # this path is slower than tf.contrib's optimize_loss (4.5 batch/s -> 3 batch/s).
            tower_grads = []
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % ('tower', i)) as name_scope:
                        # All trainable variables, if specific variables are not specified.

                        # TODO: does fetching trainable_variables here affect speed?
                        if variables is None:
                            variables = vars_.trainable_variables()
                        # Compute gradients.
                        loss = losses[i]
                        # if update_ops:
                        #   loss = control_flow_ops.with_dependencies(list(update_ops), loss)
                        #print('------------',)
                        try:
                            gradients = opt.compute_gradients(
                                loss,
                                variables,
                                colocate_gradients_with_ops=
                                colocate_gradients_with_ops)
                        except Exception:
                            # try:
                            #   gradients = opt.compute_gradients(loss)
                            # except Exception:
                            gradients = opt.get_updates(loss, params=variables)

                        # TODO/FIXME: some gradients may be None, e.g. after adding
                        # another predictor to the graph:
                        # [(None, <tf.Variable 'dual_bow/model_init/emb:0' shape=(29285, 256) dtype=float32_ref>),
                        #  (None, <tf.Variable 'dual_bow/main/dual_textsim/encode/text_mlp/linear/weights:0' shape=(256, 256) dtype=float32_ref>),
                        #  (<tensorflow.python.framework.ops.IndexedSlices object at 0x1b72ff50>, <tf.Variable 'seq2seq/model_init/emb:0' shape=(29285, 256) dtype=float32_ref>)]
                        # Hack for now: drop None gradients. TODO: why do dual_bow
                        # variables show up when computing gradients of a loss that
                        # seems unrelated to the seq2seq loss?
                        gradients = [x for x in gradients if x[0] is not None]
                        # Optionally add gradient noise.
                        if gradient_noise_scale is not None:
                            gradients = _add_scaled_noise_to_gradients(
                                gradients, gradient_noise_scale)
                        # Multiply some gradients.
                        if gradient_multipliers is not None:
                            gradients = _multiply_gradients(
                                gradients, gradient_multipliers)
                        # Optionally clip gradients by global norm.
                        if clip_gradients is not None:
                            gradients = _clip_gradients_by_norm(
                                gradients, clip_gradients)

                        #print('-------gradients', gradients)
                        tower_grads.append(gradients)

            gradients = average_gradients(tower_grads)
            if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
                summary.scalar("global_norm/gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

            # Add histograms for variables, gradients and gradient norms.
            for gradient, variable in gradients:
                if isinstance(gradient, ops.IndexedSlices):
                    grad_values = gradient.values
                else:
                    grad_values = gradient

                if grad_values is not None:
                    var_name = variable.name.replace(":", "_")
                    if "gradients" in summaries:
                        summary.histogram("gradients/%s" % var_name,
                                          grad_values)
                    if "gradient_norm" in summaries:
                        summary.scalar("gradient_norm/%s" % var_name,
                                       clip_ops.global_norm([grad_values]))

            if clip_gradients is not None and (
                    "global_gradient_norm" in summaries
                    or "gradient_norm" in summaries):
                summary.scalar("global_norm/clipped_gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))
        else:
            loss = losses[0]

            ## opt.minimize would be similar but skips gradient clipping and the other
            ## processing below; left commented: AUC 0.72739, enabled instead: 0.72634.
            #return opt.minimize(losses[0], global_step=global_step if increment_global_step else None)

            # All trainable variables, if specific variables are not specified.
            if variables is None:
                variables = vars_.trainable_variables()

            # Compute gradients.
            try:
                gradients = opt.compute_gradients(
                    loss,
                    variables,
                    colocate_gradients_with_ops=colocate_gradients_with_ops)
            except Exception:
                # TODO not work for keras
                gradients = opt.get_updates(loss=loss, params=variables)

            # Optionally add gradient noise.
            if gradient_noise_scale is not None:
                gradients = _add_scaled_noise_to_gradients(
                    gradients, gradient_noise_scale)

            # Multiply some gradients.
            if gradient_multipliers is not None:
                gradients = _multiply_gradients(gradients,
                                                gradient_multipliers)
                if not gradients:
                    raise ValueError(
                        "Empty list of (gradient, var) pairs encountered. This is most "
                        "likely to be caused by an improper value of gradient_multipliers."
                    )

            if "global_gradient_norm" in summaries or "gradient_norm" in summaries:
                summary.scalar("global_norm/gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

            # Optionally clip gradients by global norm.
            if isinstance(clip_gradients, float):
                gradients = _clip_gradients_by_norm(gradients, clip_gradients)
            elif callable(clip_gradients):
                gradients = clip_gradients(gradients)
            elif clip_gradients is not None:
                raise ValueError("Unknown type %s for clip_gradients" %
                                 type(clip_gradients))

            # # Add scalar summary for loss.
            # if "loss" in summaries:
            #   summary.scalar("loss", loss)

            # Add histograms for variables, gradients and gradient norms.
            for gradient, variable in gradients:
                if isinstance(gradient, ops.IndexedSlices):
                    grad_values = gradient.values
                else:
                    grad_values = gradient

                if grad_values is not None:
                    var_name = variable.name.replace(":", "_")
                    if "gradients" in summaries:
                        summary.histogram("gradients/%s" % var_name,
                                          grad_values)
                    if "gradient_norm" in summaries:
                        summary.scalar("gradient_norm/%s" % var_name,
                                       clip_ops.global_norm([grad_values]))

            if clip_gradients is not None and (
                    "global_gradient_norm" in summaries
                    or "gradient_norm" in summaries):
                summary.scalar("global_norm/clipped_gradient_norm",
                               clip_ops.global_norm(list(zip(*gradients))[0]))

        # Create gradient updates.
        grad_updates = opt.apply_gradients(
            gradients,
            global_step=global_step if increment_global_step else None,
            name="train")

        # IMPORTANT this is needed for momentum!
        if update_ops:
            grad_updates = [grad_updates]
            #print('--------------------1', grad_updates)
            grad_updates.extend(update_ops)
            #print('-------------------2', update_ops)
            grad_updates = tf.group(*grad_updates)
            #print('-----------grad updates', grad_updates)

        # # Make sure total_loss is valid.
        # final_loss = array_ops.check_numerics(loss, "Loss is inf or nan")

        # # Ensure the train_tensor computes grad_updates.
        # train_tensor = control_flow_ops.with_dependencies([grad_updates], final_loss)

        #return train_tensor
        return grad_updates
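A minimal usage sketch for the optimize_loss variant above (TF 1.x graph mode). The leading positional parameters (per-tower loss list, global step, learning rate, optimizer name) are assumed from the docstring and the body's use of `losses`; the toy loss, shapes and hyperparameters are illustrative only, and optimize_loss is assumed to be importable from the surrounding module.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Toy linear-regression loss; names and shapes are illustrative.
x = tf.placeholder(tf.float32, [None, 3])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.get_variable('w', [3, 1])
b = tf.get_variable('b', [1], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) + b - y))

global_step = tf.train.get_or_create_global_step()
# Assumed call order: optimize_loss(losses, global_step, learning_rate, optimizer, ...);
# single-GPU path with global-norm clipping and the default gradient-norm summary.
train_op = optimize_loss(
    [loss], global_step, 0.01, 'Adam',
    num_gpus=1,
    clip_gradients=5.0,
    summaries=['global_gradient_norm'])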
Example #38
0
    def set_model(self, model):
        """Sets Keras model and creates summary ops."""

        self.model = model
        self.sess = K.get_session()
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf_summary.histogram(mapped_weight_name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = array_ops.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = array_ops.transpose(w_img,
                                                            perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = array_ops.reshape(w_img,
                                                      [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf_summary.image(mapped_weight_name, w_img)

                if self.write_grads:
                    for weight in layer.trainable_weights:
                        mapped_weight_name = weight.name.replace(':', '_')
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
                        tf_summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)

                if hasattr(layer, 'output'):
                    tf_summary.histogram('{}_out'.format(layer.name),
                                         layer.output)
        self.merged = tf_summary.merge_all()

        if self.write_graph:
            self.writer = self._writer_class(self.log_dir, self.sess.graph)
        else:
            self.writer = self._writer_class(self.log_dir)
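In practice set_model is invoked by Keras itself; a user only constructs the TensorBoard callback and passes it to fit(). A hedged sketch of that usage (standard tf.keras API; the tiny model and data are illustrative only):

import numpy as np
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Dense(4, activation='relu', input_shape=(8,)),
    keras.layers.Dense(1)])
model.compile(optimizer='sgd', loss='mse')

x = np.random.rand(64, 8).astype('float32')
y = np.random.rand(64, 1).astype('float32')

# histogram_freq > 0 triggers the weight/activation histograms set up above;
# validation data is required for them to be written.
tb = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1,
                                 write_images=True)
model.fit(x, y, epochs=2, validation_data=(x, y), callbacks=[tb])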
Example #39
0
def logistic_regression(x,
                        y,
                        class_weight=None,
                        init_mean=None,
                        init_stddev=1.0):
  """Creates logistic regression TensorFlow subgraph.

  Args:
    x: tensor or placeholder for input features,
       shape should be [batch_size, n_features].
    y: tensor or placeholder for labels (one-hot),
       shape should be [batch_size, n_classes].
    class_weight: tensor, [n_classes], where for each class
                  it has weight of the class. If not provided
                  will check if graph contains tensor `class_weight:0`.
                  If that is not provided either all ones are used.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

  Side effects:
    The variables linear_regression.weights and linear_regression.bias are
    initialized as follows.  If init_mean is not None, then initialization
    will be done using a random normal initializer with the given init_mean
    and init_stddev.  (These may be set to 0.0 each if a zero initialization
    is desirable for convex use cases.)  If init_mean is None, then the
    uniform_unit_scaling_initializer will be used.
  """
  with vs.variable_scope('logistic_regression'):
    scope_name = vs.get_variable_scope().name
    summary.histogram('%s.x' % scope_name, x)
    summary.histogram('%s.y' % scope_name, y)
    dtype = x.dtype.base_dtype
    # Set up the requested initialization.
    if init_mean is None:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], y.get_shape()[-1]], dtype=dtype)
      bias = vs.get_variable('bias', [y.get_shape()[-1]], dtype=dtype)
    else:
      weights = vs.get_variable(
          'weights', [x.get_shape()[1], y.get_shape()[-1]],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
      bias = vs.get_variable(
          'bias', [y.get_shape()[-1]],
          initializer=init_ops.random_normal_initializer(
              init_mean, init_stddev, dtype=dtype),
          dtype=dtype)
    summary.histogram('%s.weights' % scope_name, weights)
    summary.histogram('%s.bias' % scope_name, bias)
    # If no class weight provided, try to retrieve one from pre-defined
    # tensor name in the graph.
    if not class_weight:
      try:
        class_weight = ops.get_default_graph().get_tensor_by_name(
            'class_weight:0')
      except KeyError:
        pass

    return losses_ops.softmax_classifier(
        x, y, weights, bias, class_weight=class_weight)
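A hedged usage sketch for the subgraph above (TF 1.x graph mode): build placeholders, call logistic_regression, and attach an optimizer. The feature/class counts and init arguments are illustrative only, and logistic_regression is assumed to be in scope.

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

n_features, n_classes = 10, 3
x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, n_classes])  # one-hot labels

# Returns (predictions, loss) as described in the docstring above.
predictions, loss = logistic_regression(x, y, init_mean=0.0, init_stddev=0.01)
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)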
Example #40
0
  def set_model(self, model):
    self.model = model
    self.sess = K.get_session()
    if self.histogram_freq and self.merged is None:
      for layer in self.model.layers:

        for weight in layer.weights:
          tf_summary.histogram(weight.name, weight)
          if self.write_images:
            w_img = array_ops.squeeze(weight)
            shape = w_img.get_shape()
            if len(shape) > 1 and shape[0] > shape[1]:
              w_img = array_ops.transpose(w_img)
            if len(shape) == 1:
              w_img = array_ops.expand_dims(w_img, 0)
            w_img = array_ops.expand_dims(array_ops.expand_dims(w_img, 0), -1)
            tf_summary.image(weight.name, w_img)

        if hasattr(layer, 'output'):
          tf_summary.histogram('{}_out'.format(layer.name), layer.output)
    self.merged = tf_summary.merge_all()

    if self.write_graph:
      self.writer = tf_summary.FileWriter(self.log_dir, self.sess.graph)
    else:
      self.writer = tf_summary.FileWriter(self.log_dir)

    if self.embeddings_freq:
      self.saver = saver_lib.Saver()

      embeddings_layer_names = self.embeddings_layer_names

      if not embeddings_layer_names:
        embeddings_layer_names = [
            layer.name for layer in self.model.layers
            if type(layer).__name__ == 'Embedding'
        ]

      embeddings = {
          layer.name: layer.weights[0]
          for layer in self.model.layers if layer.name in embeddings_layer_names
      }

      embeddings_metadata = {}

      if not isinstance(self.embeddings_metadata, str):
        embeddings_metadata = self.embeddings_metadata
      else:
        embeddings_metadata = {
            layer_name: self.embeddings_metadata
            for layer_name in embeddings.keys()
        }

      config = projector.ProjectorConfig()
      self.embeddings_logs = []

      for layer_name, tensor in embeddings.items():
        embedding = config.embeddings.add()
        embedding.tensor_name = tensor.name

        self.embeddings_logs.append(
            os.path.join(self.log_dir, layer_name + '.ckpt'))

        if layer_name in embeddings_metadata:
          embedding.metadata_path = embeddings_metadata[layer_name]

      projector.visualize_embeddings(self.writer, config)
Example #41
0
  def testHistogramSummaryTypes(self):
    with ops.Graph().as_default():
      for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16,
                    dtypes.int32, dtypes.float32, dtypes.float64):
        const = constant_op.constant(10, dtype=dtype)
        summary.histogram("h", const)
Example #42
0
def logistic_regression(x,
                        y,
                        class_weight=None,
                        init_mean=None,
                        init_stddev=1.0):
    """Creates logistic regression TensorFlow subgraph.

  Args:
    x: tensor or placeholder for input features,
       shape should be [batch_size, n_features].
    y: tensor or placeholder for labels (one-hot),
       shape should be [batch_size, n_classes].
    class_weight: tensor, [n_classes], where for each class
                  it has weight of the class. If not provided
                  will check if graph contains tensor `class_weight:0`.
                  If that is not provided either all ones are used.
    init_mean: the mean value to use for initialization.
    init_stddev: the standard deviation to use for initialization.

  Returns:
    Predictions and loss tensors.

  Side effects:
    The variables linear_regression.weights and linear_regression.bias are
    initialized as follows.  If init_mean is not None, then initialization
    will be done using a random normal initializer with the given init_mean
    and init_stddev.  (These may be set to 0.0 each if a zero initialization
    is desirable for convex use cases.)  If init_mean is None, then the
    uniform_unit_scaling_initializer will be used.
  """
    with vs.variable_scope('logistic_regression'):
        scope_name = vs.get_variable_scope().name
        summary.histogram('%s.x' % scope_name, x)
        summary.histogram('%s.y' % scope_name, y)
        dtype = x.dtype.base_dtype
        # Set up the requested initialization.
        if init_mean is None:
            weights = vs.get_variable(
                'weights',
                [x.get_shape()[1], y.get_shape()[-1]], dtype=dtype)
            bias = vs.get_variable('bias', [y.get_shape()[-1]], dtype=dtype)
        else:
            weights = vs.get_variable(
                'weights',
                [x.get_shape()[1], y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(init_mean,
                                                               init_stddev,
                                                               dtype=dtype),
                dtype=dtype)
            bias = vs.get_variable(
                'bias', [y.get_shape()[-1]],
                initializer=init_ops.random_normal_initializer(init_mean,
                                                               init_stddev,
                                                               dtype=dtype),
                dtype=dtype)
        summary.histogram('%s.weights' % scope_name, weights)
        summary.histogram('%s.bias' % scope_name, bias)
        # If no class weight provided, try to retrieve one from pre-defined
        # tensor name in the graph.
        if not class_weight:
            try:
                class_weight = ops.get_default_graph().get_tensor_by_name(
                    'class_weight:0')
            except KeyError:
                pass

        return losses_ops.softmax_classifier(x,
                                             y,
                                             weights,
                                             bias,
                                             class_weight=class_weight)
Example #43
0
def _add_layer_summary(value, tag):
  summary.scalar("%s/fraction_of_zero_values" % tag, nn.zero_fraction(value))
  summary.histogram("%s/activation" % tag, value)
Example #44
0
    def _make_histogram_ops(self, model):
        """Defines histogram ops when histogram_freq > 0."""
        # only make histogram summary op if it hasn't already been made
        if self.histogram_freq and self.merged is None:
            for layer in self.model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf_summary.histogram(mapped_weight_name, weight)
                    if self.write_images:
                        w_img = array_ops.squeeze(weight)
                        shape = K.int_shape(w_img)
                        if len(shape) == 2:  # dense layer kernel case
                            if shape[0] > shape[1]:
                                w_img = array_ops.transpose(w_img)
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [1, shape[0], shape[1], 1])
                        elif len(shape) == 3:  # convnet case
                            if K.image_data_format() == 'channels_last':
                                # switch to channels_first to display
                                # every kernel as a separate image
                                w_img = array_ops.transpose(w_img,
                                                            perm=[2, 0, 1])
                                shape = K.int_shape(w_img)
                            w_img = array_ops.reshape(
                                w_img, [shape[0], shape[1], shape[2], 1])
                        elif len(shape) == 1:  # bias case
                            w_img = array_ops.reshape(w_img,
                                                      [1, shape[0], 1, 1])
                        else:
                            # not possible to handle 3D convnets etc.
                            continue

                        shape = K.int_shape(w_img)
                        assert len(shape) == 4 and shape[-1] in [1, 3, 4]
                        tf_summary.image(mapped_weight_name, w_img)

                if self.write_grads:
                    for weight in layer.trainable_weights:
                        mapped_weight_name = weight.name.replace(':', '_')
                        grads = model.optimizer.get_gradients(
                            model.total_loss, weight)

                        def is_indexed_slices(grad):
                            return type(grad).__name__ == 'IndexedSlices'

                        grads = [
                            grad.values if is_indexed_slices(grad) else grad
                            for grad in grads
                        ]
                        tf_summary.histogram(
                            '{}_grad'.format(mapped_weight_name), grads)

                if hasattr(layer, 'output'):
                    if isinstance(layer.output, list):
                        for i, output in enumerate(layer.output):
                            tf_summary.histogram(
                                '{}_out_{}'.format(layer.name, i), output)
                    else:
                        tf_summary.histogram('{}_out'.format(layer.name),
                                             layer.output)