def _maybe_make_cross_shard_optimizer(opt):
    if callable(opt):
        if not isinstance(opt(), tpu_optimizer.CrossShardOptimizer):
            return lambda: tpu_optimizer.CrossShardOptimizer(opt())
    elif not isinstance(opt, tpu_optimizer.CrossShardOptimizer):
        return tpu_optimizer.CrossShardOptimizer(opt)
    return opt
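A minimal usage sketch (not part of the original listing; the optimizers and hyperparameters below are placeholders) showing how the helper above behaves for a plain optimizer, an already wrapped optimizer, and an optimizer-building callable:

# Assumes the same `tf` and `tpu_optimizer` imports as the snippets below.
sgd = tf.train.GradientDescentOptimizer(learning_rate=0.01)
wrapped = _maybe_make_cross_shard_optimizer(sgd)
assert isinstance(wrapped, tpu_optimizer.CrossShardOptimizer)
# Already-wrapped optimizers are returned unchanged.
assert _maybe_make_cross_shard_optimizer(wrapped) is wrapped

# A callable that builds a plain optimizer becomes a callable that builds
# the cross-shard wrapped optimizer instead.
make_opt = _maybe_make_cross_shard_optimizer(
    lambda: tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9))
opt = make_opt()  # a CrossShardOptimizer wrapping a MomentumOptimizer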
Example n. 2
def model_fn(features, labels, mode, params):
    """TPUEstimatorSpec for the Squeezenet model."""
    ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    logits = squeezenet(features,
                        is_training=is_training,
                        num_classes=params["num_classes"])

    loss = tf.reduce_mean(
        tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels))

    global_batch_size = params["num_shards"] * params["batch_size"]
    decay_steps = 1300 * 1000 * params["num_epochs"] // global_batch_size
    learning_rate = tf.train.polynomial_decay(
        params["lr"],
        global_step=tf.train.get_or_create_global_step(),
        end_learning_rate=params["min_lr"],
        decay_steps=decay_steps,
        power=1.0,
        cycle=False)

    # TODO(power): Hack copied from resnet: remove when summaries are working.
    lr_repeat = tf.reshape(
        tf.tile(tf.expand_dims(learning_rate, 0), [
            params["batch_size"],
        ]), [params["batch_size"], 1])

    if params["optimizer"] == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif params["optimizer"] == "rmsprop":
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              momentum=params["momentum"],
                                              epsilon=1.0)
    else:
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=params["momentum"],
                                               use_nesterov=True)

    if params["use_tpu"]:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, tf.train.get_global_step())

    param_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=ProfileOptionBuilder.trainable_variables_parameter())
    fl_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=tf.profiler.ProfileOptionBuilder.float_operation())

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        eval_metrics=(metric_fn, [labels, logits, lr_repeat]),
        predictions={
            "classes": tf.argmax(input=logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        },
    )
Example n. 3
def model_fn(features, labels, mode, params):
    del params

    #with G.as_default():
    transfer = tf.nn.softplus

    hidden = tf.layers.dense(inputs=features, units=200, activation=transfer)
    reconstruction = tf.layers.dense(inputs=hidden, units=784)

    # for an autoencoder, the cost/loss is not just part of training
    loss_op = 0.5 * tf.reduce_sum(
        tf.pow(tf.subtract(reconstruction, labels), 2.0))

    learning_rate = tf.train.exponential_decay(FLAGS.learning_rate,
                                               tf.train.get_global_step(),
                                               100000, 0.96)
    if FLAGS.use_tpu:
        opt = tpu_optimizer.CrossShardOptimizer(
            tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
    else:
        opt = tf.train.AdamOptimizer()
    train_op = opt.minimize(loss_op, global_step=tf.train.get_global_step())
    #return tf.estimator.EstimatorSpec(mode=mode, loss=loss_op, train_op=train_op)
    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss_op,
                                          train_op=train_op)
Example n. 4
def model_fn(features, labels, mode, params):

    _optimizer_fn = train_utils.modelParams.optimizer_fn(train_utils.optimizer,
                                                         FLAGS.learning_rate)

    if FLAGS.use_tpu:
        _optimizer_fn = tpu_optimizer.CrossShardOptimizer(_optimizer_fn)

    _loss_fn = train_utils.modelParams.loss_fn(train_utils.loss_fn, FLAGS.loss_fn)

    _model_graph = train_utils.modelParams.get_model(models, FLAGS.training_model)

    if mode == tf.estimator.ModeKeys.TRAIN:
        _logits = _model_graph(features, FLAGS.num_classes)

        _loss = _loss_fn(_logits, labels)

        _train_op = _optimizer_fn.minimize(_loss,
                                           global_step=tf.train.get_global_step())

        return tpu_estimator.TPUEstimatorSpec(
            mode=mode,
            loss=_loss,
            train_op=_train_op,
            predictions={
                "class": tf.argmax(_logits, axis=-1),
                "probabilities": _logits
            })
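Example n. 5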
def model_fn(features, labels, mode, params):
    del params  # unused
    num_classes = 8

    x = Input(tensor=features)
    #x = InputLayer(input_shape=(img_size_flat,))
    #x = Reshape(img_shape)(x)

    # #model.add(Dropout(0.5, input_shape=(48, 48, 1)))
    x = Conv2D(kernel_size=5, strides=1, filters=32, padding='same',
               activation='relu')(x)
    x = Conv2D(kernel_size=5, strides=1, filters=32, padding='same',
               activation='relu')(x)
    x = MaxPooling2D(pool_size=2, strides=2)(x)

    x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
               activation='relu')(x)
    x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
               activation='relu')(x)
    x = Conv2D(kernel_size=10, strides=1, filters=64, padding='same',
               activation='relu')(x)
    x = MaxPooling2D(pool_size=2, strides=2)(x)

    x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
               activation='relu')(x)
    x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
               activation='relu')(x)
    x = Conv2D(kernel_size=15, strides=1, filters=128, padding='same',
               activation='relu')(x)
    x = MaxPooling2D(pool_size=2, strides=2)(x)

    x = Flatten()(x)

    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    # # Last fully-connected / dense layer with softmax-activation
    # # for use in classification.
    logits = Dense(num_classes)(x)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={
            "classes": tf.argmax(input=logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
            }
        )
Example n. 6
def model_fn(features, labels, mode, params):
    """Define a CIFAR model in Keras."""
    del params  # unused
    #layers = tf.keras.layers
    #models = tf.keras.models

    # Pass our input tensor to initialize the Keras input layer.
    #mdl = layers.Input(tensor=features)
    #mdl = layers.Dense(2048, activation="relu")(mdl)
    #op = layers.Dense(3862, activation="softmax")(features)

    op = tf.contrib.layers.fully_connected(inputs=features,
                                           num_outputs=3862,
                                           activation_fn=tf.nn.softmax)

    # Instead of constructing a Keras model for training, build our loss function
    # and optimizer in Tensorflow.
    #
    # N.B.  This construction omits some features that are important for more
    # complex models (e.g. regularization, batch-norm).  Once
    # `model_to_estimator` support is added for TPUs, it should be used instead.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=op,
                                                   labels=tf.cast(
                                                       labels, tf.float32)))
    optimizer = tf.train.AdamOptimizer(0.01)
    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          predictions={"probabilities": op})
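Example n. 7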
def train(features, labels, hparams, get_features_fn, embedding_fn,
          embedding_weights_initializer, bias_weights_initializer,
          global_rating_bias_initializer):
    """Constructs the matrix factorization model training graph."""
    (query_movie_ids, query_movie_ratings, query_genre_ids, query_genre_freqs,
     query_genre_ratings, candidate_movie_id,
     candidate_genre_id) = (get_features_fn(features))

    model_scores, _, _ = movie_candidate_score(
        query_movie_ids, query_movie_ratings, query_genre_ids,
        query_genre_freqs, query_genre_ratings, candidate_movie_id,
        candidate_genre_id, hparams, embedding_fn,
        embedding_weights_initializer, bias_weights_initializer,
        global_rating_bias_initializer)

    loss = tf.losses.mean_squared_error(features[LABEL_RATING_SCORE],
                                        model_scores)

    optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[hparams.optimizer](
        learning_rate=hparams.learning_rate)
    if hparams.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        summaries=[],
        global_step=tf.contrib.framework.get_global_step(),
        optimizer=optimizer,
        learning_rate=None)

    return EstimatorSpec(mode=TRAIN,
                         predictions=model_scores,
                         loss=loss,
                         train_op=train_op)
Example n. 8
def model_fn(features, labels, mode, params):
  """TPUEstimator model_fn for MobileNet."""
  logits = predict_fn(features, mode, params)

  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels))

  # decay once per epoch
  steps_per_epoch = params["num_batches_per_epoch"]

  if params["decay_mode"] == "piecewise":
    learning_rate = params["learning_rate"] * tf.train.piecewise_constant(
        tf.train.get_or_create_global_step(),
        [steps_per_epoch * 30, steps_per_epoch * 60], [1.0, 0.1, 0.01])
  else:
    learning_rate = tf.train.exponential_decay(
        params["learning_rate"],
        tf.train.get_or_create_global_step(),
        decay_rate=params["learning_rate_decay"],
        decay_steps=steps_per_epoch,
    )

  if params["optimizer"] == "rmsprop":
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate=learning_rate,
        momentum=params["momentum"],
        epsilon=params["rmsprop_epsilon"],
        decay=params["rmsprop_decay"],
    )
  else:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
        momentum=params["momentum"],
        use_nesterov=True)

  if params["use_tpu"]:
    optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

  # Batch norm requires update_ops to be added as a train_op dependency.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss, tf.train.get_global_step())

  # TODO(power): Hack copied from resnet: remove when summaries are working.
  lr_repeat = tf.reshape(
      tf.tile(tf.expand_dims(learning_rate, 0), [
          params["batch_size"],
      ]), [params["batch_size"], 1])

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      predictions={
          "classes": tf.argmax(input=logits, axis=1),
          "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
      },
      eval_metrics=(metric_fn, [labels, logits, lr_repeat]),
  )
Example n. 9
def model_fn(features, labels, mode, params):
    output_size = params['output_size']
    net = features

    if FLAGS.data_type == 'float32':
        network = resnet_model.resnet_v1(resnet_layers,
                                         block_fn,
                                         num_classes=output_size,
                                         data_format='channels_last',
                                         filters=filters)

        net = network(inputs=features, is_training=True)
    else:
        with tf.variable_scope('cg', custom_getter=get_custom_getter()):
            network = resnet_model.resnet_v1(resnet_layers,
                                             block_fn,
                                             num_classes=output_size,
                                             data_format='channels_last',
                                             filters=filters)

            net = network(inputs=features, is_training=True)
            net = tf.cast(net, tf.float32)

    onehot_labels = tf.one_hot(labels, output_size)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=net)

    learning_rate = tf.train.exponential_decay(0.1, tf.train.get_global_step(),
                                               25000, 0.97)
    if opt == 'sgd':
        tf.logging.info('Using SGD optimizer')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    elif opt == 'momentum':
        tf.logging.info('Using Momentum optimizer')
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=0.9)
    elif opt == 'rms':
        tf.logging.info('Using RMS optimizer')
        optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                              RMSPROP_DECAY,
                                              momentum=RMSPROP_MOMENTUM,
                                              epsilon=RMSPROP_EPSILON)
    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    param_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=ProfileOptionBuilder.trainable_variables_parameter())
    fl_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=tf.profiler.ProfileOptionBuilder.float_operation())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
Example n. 10
def _replicated_optimizer(opt):
    """Wrap the optimizer `opt` with CrossShardOptimizer if applicable."""
    if tpu_function.get_tpu_context().number_of_shards == 1:
        return opt

    if isinstance(opt, keras_optimizers.TFOptimizer):
        return tpu_optimizer.CrossShardOptimizer(opt.optimizer)
    else:
        return KerasCrossShardOptimizer(opt)
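A minimal sketch (a hypothetical helper, not part of the listing) applying the same shard-count guard to a plain tf.train optimizer, assuming the tpu_function and tpu_optimizer imports used above:

def _maybe_shard(opt):
    # Outside tpu.replicate, number_of_shards is None; with a single shard
    # there is nothing to aggregate, so return the optimizer unchanged.
    if tpu_function.get_tpu_context().number_of_shards in (None, 1):
        return opt
    # Otherwise wrap it so gradients are averaged across TPU shards.
    return tpu_optimizer.CrossShardOptimizer(opt)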
Example n. 11
def model_fn(features, labels, mode, params):
  """A simple CNN."""
  del params

  if mode != tf.estimator.ModeKeys.TRAIN:
    raise RuntimeError("mode {} is not supported yet".format(mode))

  conv1 = tf.layers.conv2d(
      inputs=features,
      filters=32,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)
  pool1 = tf.layers.max_pooling2d(
      inputs=conv1,
      pool_size=[2, 2],
      strides=2,
      padding="same")
  conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=64,
      kernel_size=[5, 5],
      padding="same",
      activation=tf.nn.relu)
  pool2 = tf.layers.max_pooling2d(
      inputs=conv2,
      pool_size=[2, 2],
      strides=2,
      padding="same")
  pool2_flat = tf.reshape(pool2, [-1, 8 * 8 * 64])

  dense = tf.layers.dense(
      inputs=pool2_flat,
      units=384,
      activation=tf.nn.relu)
  dropout = tf.layers.dropout(
      inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
  logits = tf.layers.dense(inputs=dropout, units=10)

  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
  loss = tf.losses.softmax_cross_entropy(
      onehot_labels=onehot_labels, logits=logits)

  learning_rate = tf.train.exponential_decay(
      FLAGS.learning_rate, tf.train.get_global_step(), 25000, 0.96)
  if FLAGS.use_tpu:
    optimizer = tpu_optimizer.CrossShardOptimizer(
        tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
  else:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op)
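Example n. 12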
    def get_train_op(self, loss):
        """Creates a training op.

        Args:
          loss: A float32 `Tensor` representing the total training loss.

        Returns:
          train_op: A slim.learning.create_train_op train_op.

        Raises:
          ValueError: If specified optimizer isn't supported.
        """
        # Get variables to train (defined in subclass).
        assert self.variables_to_train

        # Read the learning rate configuration.
        decay_steps = self._config.learning.decay_steps
        decay_factor = self._config.learning.decay_factor
        learning_rate = float(self._config.learning.learning_rate)

        # Define a learning rate schedule.
        global_step = slim.get_or_create_global_step()
        learning_rate = tf.train.exponential_decay(learning_rate,
                                                   global_step,
                                                   decay_steps,
                                                   decay_factor,
                                                   staircase=True)

        # Create an optimizer.
        opt_type = self._config.learning.optimizer
        if opt_type == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate)
        elif opt_type == 'momentum':
            opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        elif opt_type == 'rmsprop':
            opt = tf.train.RMSPropOptimizer(learning_rate,
                                            momentum=0.9,
                                            epsilon=1.0,
                                            decay=0.9)
        else:
            raise ValueError('Unsupported optimizer %s' % opt_type)

        if self._config.use_tpu:
            opt = tpu_optimizer.CrossShardOptimizer(opt)

        # Create a training op.
        # train_op = opt.minimize(loss, var_list=self.variables_to_train)
        train_op = slim.learning.create_train_op(
            loss,
            optimizer=opt,
            variables_to_train=self.variables_to_train,
            update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))

        return train_op
Example n. 13
def model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API."""
    del params

    if mode != tf.estimator.ModeKeys.TRAIN:
        raise RuntimeError('mode {} is not supported yet'.format(mode))

    num_labels = FLAGS.num_labels

    with slim.arg_scope(inception_v3_arg_scope(is_training=True)):
        logits, end_points = inception.inception_v3(
            features,
            num_labels,
            is_training=True,
            depth_multiplier=FLAGS.depth_multiplier)

    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32),
                               depth=num_labels)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(end_points['AuxLogits'],
                                        onehot_labels,
                                        label_smoothing=0.1,
                                        weights=0.4,
                                        scope='aux_loss')
    tf.losses.softmax_cross_entropy(logits,
                                    onehot_labels,
                                    label_smoothing=0.1,
                                    weights=1.0)
    loss = tf.losses.get_total_loss()

    if FLAGS.optimizer == 'sgd':
        tf.logging.info('Using SGD optimizer')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=FLAGS.learning_rate)
    elif FLAGS.optimizer == 'momentum':
        tf.logging.info('Using Momentum optimizer')
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=FLAGS.learning_rate, momentum=0.9)
    else:
        tf.logging.fatal('Unknown optimizer:', FLAGS.optimizer)

    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(
        loss, global_step=tf.train.get_or_create_global_step())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
Example n. 14
def model_fn(features, labels, mode, params):
    """Define a CIFAR model in Keras."""
    del params  # unused
    layers = tf.contrib.keras.layers

    # Pass our input tensor to initialize the Keras input layer.
    v = layers.Input(tensor=features)
    v = layers.Conv2D(filters=32,
                      kernel_size=5,
                      activation="relu",
                      padding="same")(v)
    v = layers.MaxPool2D(pool_size=2)(v)
    v = layers.Conv2D(filters=64,
                      kernel_size=5,
                      activation="relu",
                      padding="same")(v)
    v = layers.MaxPool2D(pool_size=2)(v)
    v = layers.Flatten()(v)
    fc1 = layers.Dense(units=512, activation="relu")(v)
    logits = layers.Dense(units=10)(fc1)

    # Instead of constructing a Keras model for training, build our loss function
    # and optimizer in Tensorflow.
    #
    # N.B.  This construction omits some features that are important for more
    # complex models (e.g. regularization, batch-norm).  Once
    # `model_to_estimator` support is added for TPUs, it should be used instead.
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels))
    optimizer = tf.train.AdamOptimizer()
    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          predictions={
                                              "classes":
                                              tf.argmax(input=logits, axis=1),
                                              "probabilities":
                                              tf.nn.softmax(
                                                  logits,
                                                  name="softmax_tensor")
                                          })
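Example n. 15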
    def _build_train_op(self, tpu_opt):
        """Build training specific ops for the graph."""
        self.lrn_rate = tf.constant(self.hps.lrn_rate, tf.float32)
        # Commented out for TPU execution:
        # tf.summary.scalar('learning_rate', self.lrn_rate)
        # Use tf.case() instead of a SessionRunHook to change the learning
        # rate (for TPU execution).
        def r1():
            return tf.constant(0.1)

        def r2():
            return tf.constant(0.01)

        def r3():
            return tf.constant(0.001)

        def r4():
            return tf.constant(0.0001)

        self.lrn_rate = tf.case(
            {
                tf.less(self.global_step, 40000): r1,
                tf.less(self.global_step, 60000): r2,
                tf.less(self.global_step, 80000): r3
            },
            default=r4,
            exclusive=False)

        trainable_variables = tf.trainable_variables()
        grads = tf.gradients(self.cost, trainable_variables)

        if self.hps.optimizer == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(self.lrn_rate)
        elif self.hps.optimizer == 'mom':
            optimizer = tf.train.MomentumOptimizer(self.lrn_rate, 0.9)
        # Added for TPU: wrap with the cross-shard optimizer for replicas.
        if tpu_opt:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        apply_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                             global_step=self.global_step,
                                             name='train_step')

        train_ops = [apply_op] + self._extra_train_ops
        self.train_op = tf.group(*train_ops)
Example n. 16
def model_fn(features, labels, mode, params):
    """Define a Densenet model."""
    logits = densenet_model.densenet_cifar_model(
        features,
        params["growth_rate"],
        params["layers"],
        is_training=(mode == tf.estimator.ModeKeys.TRAIN),
        num_blocks=params["blocks"])

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=labels))

    learning_rate = tf.train.exponential_decay(
        0.00001,
        tf.train.get_or_create_global_step(),
        decay_steps=200,
        decay_rate=0.5)

    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=0.9,
                                           use_nesterov=True)

    # N.B. We have to set this parameter manually below.
    if params["use_tpu"]:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, tf.train.get_global_step())

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions={
            "classes": tf.argmax(input=logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
        },
        eval_metrics=(metric_fn, [labels, logits]),
    )
Example n. 17
    def _model_fn_train(self, mode, total_loss, batches_per_epoch,
                        num_epochs_per_decay, initial_learning_rate,
                        learning_rate_decay_factor, rmsprop_decay,
                        rmsprop_momentum, rmsprop_epsilon,
                        moving_average_decay):
        """This is the TRAIN part of model_fn."""
        if mode != tf.estimator.ModeKeys.TRAIN:
            return None
        # Configure the learning rate using an exponential decay.
        global_step = tf.train.get_or_create_global_step()
        decay_steps = int(1.0 * batches_per_epoch * num_epochs_per_decay)
        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=learning_rate_decay_factor,
            staircase=True)
        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   0.0001 * initial_learning_rate,
                                   name='learning_rate')
        optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                              rmsprop_decay,
                                              momentum=rmsprop_momentum,
                                              epsilon=rmsprop_epsilon)
        if self.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(total_loss, global_step=global_step)

        # NB. In the inception code this was "tf.trainable_variables()
        # + tf.moving_average_variables()", but we've settled on just
        # tf.model_variables() in the existing production DV2.
        variables_to_average = tf.model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            decay=moving_average_decay, num_updates=global_step)
        with tf.control_dependencies([train_op
                                      ]), tf.name_scope('moving_average'):
            train_op = variable_averages.apply(variables_to_average)
        tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, train_op)
        return train_op
Example n. 18
def train(features, labels, hparams, embedding_weights_initializer,
          class_weights_initializer, class_biases_initializer, get_features_fn,
          embedding_fn):
    """Constructs the training graph."""
    (movie_ids_ratings, genre_ids_freqs,
     genre_ids_ratings) = (get_features_fn(features))

    query_embeddings = embed_query_features(movie_ids_ratings, genre_ids_freqs,
                                            genre_ids_ratings, hparams, TRAIN,
                                            embedding_weights_initializer,
                                            embedding_fn)

    class_weights, class_biases = class_weights_biases(
        hparams, class_weights_initializer, class_biases_initializer)

    scores = tf.matmul(query_embeddings,
                       tf.transpose(class_weights)) + class_biases

    target_one_hot = tf.one_hot(indices=features['candidate_movie_id_values'],
                                depth=MOVIE_VOCAB_SIZE,
                                on_value=1.0)

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=target_one_hot,
                                                logits=scores))

    optimizer = tf.contrib.layers.OPTIMIZER_CLS_NAMES[hparams.optimizer](
        learning_rate=hparams.learning_rate)
    if hparams.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        summaries=[],
        global_step=tf.contrib.framework.get_global_step(),
        optimizer=optimizer,
        learning_rate=None)

    return EstimatorSpec(mode=TRAIN,
                         predictions=scores,
                         loss=loss,
                         train_op=train_op)
Example n. 19
def model_fn(features, labels, mode, params):
    output_dim = params['output_dim']
    net = features

    shp = net.get_shape().as_list()

    flattened_shape = shp[1] * shp[2] * shp[3]

    net = tf.reshape(net, [shp[0], flattened_shape])

    net = tf.layers.dense(inputs=net, units=4, activation=tf.nn.relu)

    net = tf.layers.dropout(inputs=net, rate=0.5)

    net = tf.layers.dense(inputs=net, units=output_dim, activation=None)

    loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=net)

    learning_rate = tf.train.exponential_decay(0.01,
                                               tf.train.get_global_step(),
                                               25000, 0.97)
    if FLAGS.use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(
            tf.train.GradientDescentOptimizer(learning_rate=learning_rate))
    else:
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    param_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=ProfileOptionBuilder.trainable_variables_parameter())
    fl_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=tf.profiler.ProfileOptionBuilder.float_operation())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)
Example n. 20
def tpu_resnet_model_fn(features, labels, mode, params):
    """Our model_fn for ResNet to be used with our TPUEstimator."""
    del params

    model_result = resnet_model_common(features, labels, mode)

    def metric_fn(labels, logits):
        accuracy = tf.metrics.accuracy(tf.argmax(input=labels, axis=1),
                                       tf.argmax(input=logits, axis=1))
        return {'accuracy': accuracy}

    optimizer = tpu_optimizer.CrossShardOptimizer(model_result.optimizer)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(model_result.loss,
                                      global_step=tf.train.get_global_step())
    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=model_result.loss,
        predictions=model_result.predictions,
        train_op=train_op,
        eval_metrics=(metric_fn, [labels, model_result.logits]))
Example n. 21
    def _build_optimizer(self, learning_rate):
        """Build optimizer."""
        if self.hparams.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif self.hparams.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=self.hparams.momentum_rate)
        elif self.hparams.optimizer == 'rmsprop':
            tf.logging.info('Using RMSProp optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer:', self.hparams.optimizer)

        if self.hparams.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
        return optimizer
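A short sketch (hypothetical call site; `self`, `loss`, and `learning_rate` come from the surrounding class) of how an optimizer returned by _build_optimizer is typically consumed, following the same pattern as the other model_fn examples in this listing:

optimizer = self._build_optimizer(learning_rate)
# Batch norm requires UPDATE_OPS to run as a dependency of the train op.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())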
Example n. 22
def char_rnn_model(features, labels, mode, params):
  """Character level recurrent neural network model to predict classes."""
  byte_vectors = tf.one_hot(features[CHARS_FEATURE], 256, 1., 0.)
  byte_list = tf.unstack(byte_vectors, axis=1)

  cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
  _, encoding = tf.nn.static_rnn(cell, byte_list, dtype=tf.float32)

  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)

  predicted_classes = tf.argmax(logits, 1)
  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions={
            'class': predicted_classes,
            'prob': tf.nn.softmax(logits)
        })

  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    if FLAGS.use_tpu:
      # When using TPU, wrap the optimizer with CrossShardOptimizer which
      # handles synchronization details between different TPU cores. To the
      # user, this should look like regular synchronous training.
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

  eval_metric_ops = {
      'accuracy': tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes)
  }
  return tf.estimator.EstimatorSpec(
      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
Example n. 23
def model_fn(features, labels, mode, params):
    """Our model_fn for Densenet to be used with our Estimator."""
    tf.logging.info("model_fn")

    with tf.variable_scope('cg', custom_getter=get_custom_getter()):
        if FLAGS.network_depth == 169:
            logits = densenet_model.densenet_imagenet_169(
                features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
        elif FLAGS.network_depth == 201:
            logits = densenet_model.densenet_imagenet_201(
                features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
        elif FLAGS.network_depth == 121:
            logits = densenet_model.densenet_imagenet_121(
                features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
        else:
            tf.logging.info("Number of layers not supported, revert to 121")
            logits = densenet_model.densenet_imagenet_121(
                features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
        logits = tf.cast(logits, tf.float32)

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, _LABEL_CLASSES)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels)

    # Add weight decay to the loss. We exclude weight decay on the batch
    # normalization variables because it slightly improves accuracy.
    loss = cross_entropy + _WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if "batch_normalization" not in v.name
    ])

    global_step = tf.train.get_global_step()
    current_epoch = (tf.cast(global_step, tf.float32) /
                     params["batches_per_epoch"])
    learning_rate = learning_rate_schedule(current_epoch)

    # TODO(chrisying): this is a hack to get the LR and epoch for Tensorboard.
    # Reimplement this when TPU training summaries are supported.
    lr_repeat = tf.reshape(
        tf.tile(tf.expand_dims(learning_rate, 0), [
            params["batch_size"],
        ]), [params["batch_size"], 1])
    ce_repeat = tf.reshape(
        tf.tile(tf.expand_dims(current_epoch, 0), [
            params["batch_size"],
        ]), [params["batch_size"], 1])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=_MOMENTUM)
        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits, lr_repeat, ce_repeat):
            """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
            predictions = tf.argmax(logits, axis=1)
            accuracy = tf.metrics.accuracy(tf.argmax(labels, axis=1),
                                           predictions)
            lr = tf.metrics.mean(lr_repeat)
            ce = tf.metrics.mean(ce_repeat)
            return {
                "accuracy": accuracy,
                "learning_rate": lr,
                "current_epoch": ce
            }

        eval_metrics = (metric_fn, [labels, logits, lr_repeat, ce_repeat])

    param_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=ProfileOptionBuilder.trainable_variables_parameter())
    fl_stats = tf.profiler.profile(
        tf.get_default_graph(),
        options=tf.profiler.ProfileOptionBuilder.float_operation())

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metrics=eval_metrics)
Example n. 24
def model_fn(features, labels, mode, params):

    # inference will happen in another way
    assert mode != tf.estimator.ModeKeys.PREDICT

    network = lambda images, is_training: shufflenet(
        images,
        is_training,
        num_classes=params['num_classes'],
        depth_multiplier=params['depth_multiplier'])

    # tensor `features` is a half precision tensor with shape [height, width, 3, batch_size],
    # it represents RGB images with values in [0, 1]

    images = features
    images = tf.transpose(images, [3, 0, 1, 2])  # HWCN to NHWC
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    if params['use_bfloat16']:
        with bfloat16.bfloat16_scope():
            logits = network(images, is_training)
        logits = tf.to_float(logits)  # to full precision
    else:
        logits = network(images, is_training)

    with tf.name_scope('weight_decay'):
        add_weight_decay(params['weight_decay'])
        regularization_loss = tf.losses.get_regularization_loss()

    with tf.name_scope('cross_entropy'):
        one_hot_labels = tf.one_hot(labels, params['num_classes'])
        cross_entropy = tf.losses.softmax_cross_entropy(
            logits=logits,
            onehot_labels=one_hot_labels,
            label_smoothing=LABEL_SMOOTHING)

    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                               loss=total_loss,
                                               eval_metrics=(metric_fn,
                                                             [labels, logits]))

    assert mode == tf.estimator.ModeKeys.TRAIN
    with tf.variable_scope('learning_rate_schedule'):
        global_step = tf.train.get_global_step()
        learning_rate = get_learning_rate(global_step, params)

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops), tf.variable_scope('optimizer'):
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=MOMENTUM,
                                               use_nesterov=USE_NESTEROV)
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
        train_op = optimizer.minimize(total_loss, global_step)

    with tf.control_dependencies([train_op]), tf.name_scope('ema'):
        ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                num_updates=global_step)
        train_op = ema.apply(tf.trainable_variables())

    with tf.name_scope('train_accuracy_calculation'):
        predictions = tf.argmax(logits, axis=1, output_type=tf.int32)
        train_accuracy = tf.reduce_mean(tf.to_float(
            tf.equal(labels, predictions)),
                                        axis=0)

    tensors_to_summarize = [
        tf.reshape(global_step, [1]),
        tf.reshape(total_loss, [1]),
        tf.reshape(cross_entropy, [1]),
        tf.reshape(regularization_loss, [1]),
        tf.reshape(learning_rate, [1]),
        tf.reshape(train_accuracy, [1])
    ]

    def host_call_fn(global_step, total_loss, cross_entropy,
                     regularization_loss, learning_rate, train_accuracy):

        global_step = global_step[0]
        with summary.create_file_writer(
                params['model_dir'],
                max_queue=params['iterations_per_loop']).as_default():
            with summary.always_record_summaries():
                summary.scalar('entire_loss', total_loss[0], step=global_step)
                summary.scalar('cross_entropy_loss',
                               cross_entropy[0],
                               step=global_step)
                summary.scalar('regularization_loss',
                               regularization_loss[0],
                               step=global_step)
                summary.scalar('learning_rate',
                               learning_rate[0],
                               step=global_step)
                summary.scalar('train_accuracy',
                               train_accuracy[0],
                               step=global_step)
                return summary.all_summary_ops()

    return tf.contrib.tpu.TPUEstimatorSpec(mode=mode,
                                           loss=total_loss,
                                           train_op=train_op,
                                           host_call=(host_call_fn,
                                                      tensors_to_summarize))
Example n. 25
def model_fn(features, labels, mode, params):
    """Constructs DCGAN from individual generator and discriminator networks."""
    del labels  # Unconditional GAN does not use labels

    if mode == tf.estimator.ModeKeys.PREDICT:
        ###########
        # PREDICT #
        ###########
        # Pass only noise to PREDICT mode
        random_noise = features['random_noise']
        predictions = {
            'generated_images': model.generator(random_noise,
                                                is_training=False)
        }

        return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                              predictions=predictions)

    # Use params['batch_size'] for the batch size inside model_fn
    batch_size = params['batch_size']  # pylint: disable=unused-variable
    real_images = features['real_images']
    random_noise = features['random_noise']

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    generated_images = model.generator(random_noise, is_training=is_training)

    # Get logits from discriminator
    d_on_data_logits = tf.squeeze(model.discriminator(real_images))
    d_on_g_logits = tf.squeeze(model.discriminator(generated_images))

    # Calculate discriminator loss
    d_loss_on_data = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(d_on_data_logits), logits=d_on_data_logits)
    d_loss_on_gen = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(d_on_g_logits), logits=d_on_g_logits)

    d_loss = d_loss_on_data + d_loss_on_gen

    # Calculate generator loss
    g_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(d_on_g_logits), logits=d_on_g_logits)

    if mode == tf.estimator.ModeKeys.TRAIN:
        #########
        # TRAIN #
        #########
        d_loss = tf.reduce_mean(d_loss)
        g_loss = tf.reduce_mean(g_loss)
        d_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate,
                                             beta1=0.5)
        g_optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate,
                                             beta1=0.5)

        if FLAGS.use_tpu:
            d_optimizer = tpu_optimizer.CrossShardOptimizer(d_optimizer)
            g_optimizer = tpu_optimizer.CrossShardOptimizer(g_optimizer)

        with tf.control_dependencies(tf.get_collection(
                tf.GraphKeys.UPDATE_OPS)):
            d_step = d_optimizer.minimize(d_loss,
                                          var_list=tf.get_collection(
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='Discriminator'))
            g_step = g_optimizer.minimize(g_loss,
                                          var_list=tf.get_collection(
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='Generator'))

            increment_step = tf.assign_add(
                tf.train.get_or_create_global_step(), 1)
            joint_op = tf.group([d_step, g_step, increment_step])

            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  loss=g_loss,
                                                  train_op=joint_op)

    elif mode == tf.estimator.ModeKeys.EVAL:
        ########
        # EVAL #
        ########
        def _eval_metric_fn(d_loss, g_loss):
            # When using TPUs, this function is run on a different machine than the
            # rest of the model_fn and should not capture any Tensors defined there
            return {
                'discriminator_loss': tf.metrics.mean(d_loss),
                'generator_loss': tf.metrics.mean(g_loss)
            }

        return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                              loss=tf.reduce_mean(g_loss),
                                              eval_metrics=(_eval_metric_fn,
                                                            [d_loss, g_loss]))

    # Should never reach here
    raise ValueError('Invalid mode provided to model_fn')
Example n. 26
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model
  """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if FLAGS.data_format == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    network = resnet_model.resnet_v1(resnet_depth=FLAGS.resnet_depth,
                                     num_classes=LABEL_CLASSES,
                                     data_format=FLAGS.data_format)

    logits = network(inputs=features,
                     is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead of the
    # batch size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, LABEL_CLASSES)
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from global_step.
        global_step = tf.train.get_global_step()
        batches_per_epoch = NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        current_epoch = (tf.cast(global_step, tf.float32) / batches_per_epoch)
        learning_rate = learning_rate_schedule(current_epoch)

        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=MOMENTUM,
                                               use_nesterov=True)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                with summary.create_file_writer(FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', loss[0], step=gs)
                        summary.scalar('learning_rate', lr[0], step=gs)
                        summary.scalar('current_epoch', ce[0], step=gs)

                        return summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          host_call=host_call,
                                          eval_metrics=eval_metrics)
Example n. 27
def model_fn(features, labels, mode, params):
    """Mobilenet v1 model using Estimator API."""
    num_classes = FLAGS.num_classes
    training_active = (mode == tf.estimator.ModeKeys.TRAIN)
    eval_active = (mode == tf.estimator.ModeKeys.EVAL)

    features = tensor_transform_fn(features, params['input_perm'])

    with bfloat16.bfloat16_scope():
        if FLAGS.clear_update_collections:
            # updates_collections must be set to None in order to use fused batchnorm
            with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
                logits, end_points = mobilenet_v1.mobilenet_v1(
                    features,
                    num_classes,
                    is_training=training_active,
                    depth_multiplier=FLAGS.depth_multiplier)
        else:
            with arg_scope(mobilenet_v1.mobilenet_v1_arg_scope()):
                logits, end_points = mobilenet_v1.mobilenet_v1(
                    features,
                    num_classes,
                    is_training=training_active,
                    depth_multiplier=FLAGS.depth_multiplier)

        logits = tf.cast(logits, tf.float32)
        for k in end_points.keys():
            end_points[k] = tf.cast(end_points[k], tf.float32)

    predictions = {
        'classes': tf.argmax(input=logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    loss = tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                           logits=logits,
                                           weights=1.0,
                                           label_smoothing=0.1)
    #loss = tf.losses.get_total_loss(add_regularization_losses=True)
    loss += WEIGHT_DECAY * tf.add_n([
        tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    final_learning_rate = 0.0001 * initial_learning_rate

    train_op = None
    if training_active:
        batches_per_epoch = _NUM_TRAIN_IMAGES // FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=FLAGS.learning_rate_decay_epochs * batches_per_epoch,
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

    eval_metrics = None
    if eval_active:

        def metric_fn(labels, predictions):
            accuracy = tf.metrics.accuracy(
                labels, tf.argmax(input=predictions, axis=1))
            return {'accuracy': accuracy}

        if FLAGS.use_logits:
            eval_predictions = logits
        else:
            eval_predictions = end_points['Predictions']

        eval_metrics = (metric_fn, [labels, eval_predictions])

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metrics=eval_metrics)
Esempio n. 28
0
    def model_fn(features, labels, mode, config, params):
        """Estimator model function."""

        # labels, config and params are unused by this model_fn, so drop them.
        del labels
        del config
        del params

        tf.get_variable_scope().set_initializer(
            tf.variance_scaling_initializer(1.0,
                                            mode="fan_avg",
                                            distribution="uniform"))

        # PREDICTION (e.g. evaluate)
        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions, _, _ = model_params.estimator_prediction_fn(features)

            if include_features_in_predictions:
                predictions.update(features)

            if decode_keys:
                # Decode the raw ids into strings in prediction.
                def decode_host_call(tensor_dict):
                    for key in decode_keys:
                        predictions[key] = public_parsing_ops.decode(
                            tensor_dict[key], model_params.vocab_filename,
                            model_params.encoder_type)
                    return tensor_dict

                contrib_tpu.outside_compilation(decode_host_call, predictions)
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  predictions=predictions)

        # TRAINING
        training = mode == tf.estimator.ModeKeys.TRAIN
        # use_tpu defaults to False, so this branch is normally skipped.
        if use_tpu and model_params.use_bfloat16:
            with contrib_tpu.bfloat16_scope():
                loss, outputs = model_params.model()(features, training)
        else:
            XENT_loss, outputs = model_params.model()(features, training)
            # XENT_loss, outputs = model_params.model().double_sampling(features, training, model_params.batch_size,
            #                                                           features["targets"].get_shape().as_list()[1],
            #                                                           mixed=True)

        # TPU requires that all outputs have a batch dimension and cannot handle
        # scalars, so tile every scalar output up to a 1-dimensional vector.
        outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size)
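        # A minimal sketch of what _tile_scalar_to_batch_size could look like
        # (its implementation is not shown in this excerpt): tile any rank-0
        # entry of `outputs` up to a [batch_size] vector.
        # def _tile_scalar_to_batch_size(outputs, batch_size):
        #     for key, value in outputs.items():
        #         if value.shape.ndims == 0:
        #             outputs[key] = tf.tile(tf.reshape(value, [1]), [batch_size])
        #     return outputs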

        # Create optimizer and define learning rate
        if mode == tf.estimator.ModeKeys.TRAIN:
            init_lr = model_params.learning_rate
            global_step = tf.train.get_global_step()
            lr = init_lr / 0.01 * tf.rsqrt(
                tf.maximum(tf.to_float(global_step), 10000))
            if train_init_checkpoint:
                lr = tf.minimum(
                    tf.to_float(global_step + 1) / train_warmup_steps *
                    init_lr, lr)
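            # In other words: for global_step <= 10000, rsqrt(10000) = 0.01 and the
            # rate stays at init_lr; afterwards lr = init_lr * 100 / sqrt(global_step).
            # When fine-tuning from train_init_checkpoint, the tf.minimum above adds
            # a linear warmup over train_warmup_steps before this decay takes over.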

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0)
            if use_tpu:
                optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

            ###############################################################################################################
            ##### VARIABLES ###############################################################################################
            # Create index tensors to stack and get corresponding probabilities from logp
            # max_seq_len = outputs["targets"].get_shape().as_list()[1]
            # sequence_index = tf.constant(np.arange(0, max_seq_len))
            # batch_index = tf.constant(np.zeros(sequence_index.get_shape().as_list()[0]), dtype=tf.int64)

            ##### I.I.D SAMPLING ##########################################################################################
            """ Here we sample the tokens that are produced by teacher forcing. """
            # Normalise logits to log-prob, and compute Gumbel samples with location
            # logit_probs = tf.math.softmax(outputs["logits"], axis=2)  # should not be x <= 0
            # clipped_logit_probs = tf.clip_by_value(logit_probs, 1e-8, 1.0)
            # logp = tf.log(clipped_logit_probs)

            # RETURNS TEACHER FORCING SAMPLED TOKEN VARIATIONS
            # argmax_logp_index, soft_logp_index, topk_out, z = iid_sampling(logp, max_seq_len, greedy=True, soft=False,
            #                                                                topk=False, k=2)
            # topk_probs, topk_indices = topk_out
            # TEST SAMPLING METHODS PROVIDED BY PEGASUS
            # sampled_BxT = iid_process_logits(outputs["logits"], max_seq_len, model_params.batch_size,
            #                                  outputs["logits"].get_shape().as_list()[-1],
            #                                  top_k=0, top_p=0.9, temperature=1.0)
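            # A minimal sketch of one way such i.i.d. sampling can be done (the
            # iid_sampling helper itself is not shown here): Gumbel-max sampling
            # from logp, which draws from the same categorical distribution as
            # sampling from the softmax.
            # u = tf.random_uniform(tf.shape(logp), minval=1e-8, maxval=1.0)
            # gumbel_noise = -tf.log(-tf.log(u))
            # sampled_ids_BxT = tf.argmax(logp + gumbel_noise, axis=2)  # random sample
            # argmax_ids_BxT = tf.argmax(logp, axis=2)                  # greedy sample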

            ##### DECODER SAMPLING ########################################################################################
            """ Here we sample the tokens using the decoder. Beam size == 1. 
            PREDS: IDs
            LOGP: transformed logits
            SCORE: scalar score using RISK trick
            LOGP: [BxTxV] beam logp
            LOGITS: [BxTxV] beam logits
            the dictionary contains the following keys: {ids, logp_BxT, sent_score, logp_BxTxV}
      # Note: the logp_BxTxV are analogous to z -> should be used for RELAX, preds are the BxT of these -> b=H(z), and
      # logp are the corresponding values (score is normalised to sentence score).
      """
            # greedy_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 0.0}
            # random_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 1.0}
            # topk_beam_params = {"_beam": 3, "top_k": 10000, "top_p": 0.0, "temperature": 1.0}
            # topp_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.9, "temperature": 1.0}

            # greedy_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                                 beam_params=greedy_beam_params, sentence_score=False)
            # random_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                                 beam_params=random_beam_params, sentence_score=False)
            # topk_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                               beam_params=topk_beam_params, sentence_score=False)
            # topp_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                               beam_params=topp_beam_params, sentence_score=False)

            # BEAM SEARCH
            # greedy_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                             beam_params=greedy_beam_params)
            # random_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                             beam_params=random_beam_params)
            # topk_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                           beam_params=topk_beam_params)
            # topp_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                           beam_params=topp_beam_params)

            ##### RELAX VARIABLES #########################################################################################
            """ Here we create the variables for RELAX. Pass in the logp, logits, and z that has already been 
      sampled/created from manipulation. Will return z_tilde [BxTxV] and logp(b) [BxT]. """
            # TEACHER FORCING SAMPLING
            # z_tilde, logp_b = create_variables(z, logp, batch_index, sequence_index, clipped_logit_probs)

            # DECODER SAMPLING -> sample_b is already argmaxed in decode loop
            # z_tilde, logp_b = create_variables_from_samples(random_dict["logits_BxTxV"], random_dict["logp_BxTxV"],
            #                                                 random_dict["ids"], batch_index, sequence_index)

            ##### TEXT AND ROUGE ##########################################################################################
            """ Here we first convert sequences to text, and calculate corresponding rouge scores/losses. """
            # target_text = rouge_decoding(outputs["targets"], model_params)  # TARGET SAMPLES
            # argmax_pred_text = rouge_decoding(argmax_logp_index, model_params)  # ARGMAX SAMPLES
            # soft_pred_text = rouge_decoding(soft_logp_index, model_params)  # SOFTMAX SAMPLES
            # additional_pred_text = rouge_decoding(sampled_BxT, model_params)  # ADDITIONAL SAMPLES

            # Token-level ROUGE
            # ROUGE_token = tf.py_function(rouge_token,(outputs["targets"], random_dict["ids"], 0, 0), tf.float32)

            # CALCULATE ROUGE LOSS: ROUGE score -> ROUGE loss = -ROUGE score
            # NOTE: for ROUGE variant, change value (0: precision, 1: recall, 2: f1)
            # rouge_loss_argmax = -tf.py_function(evaluate_rl, (target_text, argmax_pred_text, 2), tf.float32)
            # rouge_loss_soft = -tf.py_function(evaluate_rl, (target_text, soft_pred_text, 2), tf.float32)
            # rouge_loss_extra = -tf.py_function(evaluate_rl, (target_text, additional_pred_text, 2), tf.float32)

            ##### REINFORCE LOSS ##########################################################################################
            """ Calculate standard REINFORCE loss. Can be document-level (score using RISK trick), or token-level [BxT]. """
            # FIND CORRESPONDING LOG_PROBS OF THE I.I.D SAMPLED TOKENS
            # ARGMAX -> logp(argmax(y))
            # argmax_logp = iid_log_probs(argmax_logp_index, batch_index, sequence_index, logp)
            # SOFTMAX -> logp(sample_y)
            # softmax_logp = iid_log_probs(soft_logp_index, batch_index, sequence_index, logp)
            # ADDITIONAL
            # additional_logp = iid_log_probs(sampled_BxT, batch_index, sequence_index, logp)

            # CHANGE BELOW IF USING DECODER SAMPLED TOKENS/SCORES
            # weight the logp by ROUGE score (neg ROUGE_loss), sum values
            # reinforce_loss = tf.reduce_sum(tf.multiply(rouge_loss_argmax, argmax_logp))

            ##### REINFORCE w/ BASELINE ###################################################################################
            """ Calculate RwB using Socher's loss function (2017). Optional: use a Q_func as baseline. """
            # improve the probs of the SOFT labels (soft - hard)*soft_logp
            # improve the probs of the HARD labels (hard - soft)*hard_logp

            # BASELINE: CONTROL VARIATE
            # ffn_output = control_variate(source, targets)
            # with tf.variable_scope("Q_func"):
            #   cv = rwb_Q_func(tf.reshape(softmax_logp, [1, 32]), tf.reshape(additional_logp, [1, 32]))

            # cv_loss = tf.reduce_mean(tf.square(tf.subtract(rouge_loss_argmax, cv)))

            # loss_difference = tf.subtract(rouge_loss_soft, rouge_loss_argmax)
            # reinforce_baseline = tf.reduce_sum(tf.multiply(loss_difference, softmax_logp))

            # BASELINE: HINGE LOSS
            # rouge_soft = -rouge_loss_soft
            # rouge_hard = -rouge_loss_argmax
            # hinge = -tf.maximum((rouge_soft - rouge_hard), 0)
            # hinge_baseline = tf.reduce_sum(tf.multiply(hinge, softmax_logp))

            ##### REINFORCE w/ THRESHOLD ##################################################################################
            """ Calculate REINFORCE with a constant threshold as the baseline. """
            # we take output of ROUGE score as ROUGE_loss = -ROUGE score
            # intermediate_loss = tf.reduce_sum(tf.multiply(tf.subtract(0.3, -rouge_loss_argmax), argmax_logp))

            ##### EXPECTED RISK MINIMISATION ##############################################################################
            """ Calculate the RISK loss using n sequences from sampling process. """
            # L_risk = risk_loss(model_params.batch_size, max_seq_len,
            #                    rouge_losses=[rouge_loss_argmax, rouge_loss_soft, rouge_loss_extra],
            #                    logps=[topk_dict["logp1"], topk_dict["logp2"], topk_dict["logp3"]], n=3)
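            # A minimal sketch of the expected-risk objective assumed here (the
            # risk_loss helper is not shown): renormalise the candidates'
            # sequence log-probs and weight each candidate by its ROUGE cost.
            # seq_logps = tf.stack([topk_dict["logp1"], topk_dict["logp2"], topk_dict["logp3"]])
            # costs = tf.stack([rouge_loss_argmax, rouge_loss_soft, rouge_loss_extra])
            # L_risk = tf.reduce_sum(tf.nn.softmax(seq_logps) * costs)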

            ##### MIXED LOSS ##############################################################################################
            """ Implement a mixed loss function that is weighted by an alpha term. """
            # combined_loss = tf.math.add(tf.multiply(tf.constant(0.3, dtype=tf.float32), XENT_loss),
            #                             tf.multiply(tf.constant(0.7, dtype=tf.float32), L_risk))

            # OR conditional loss switch
            # constraint = tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32)
            # combined_loss = tf.cond(constraint > 0.8, lambda: hard_reinforce_loss, lambda: XENT_loss)

            ##### RELAX CONTROL VARIATE ###################################################################################
            """ Prepare the target sequence for use in the control variate. """
            # z = random_dict["logp_BxTxV"]
            # z_target, zt_target = create_cv_target(outputs, batch_index, sequence_index, z, z_tilde)

            ##### RELAX LOSS ##############################################################################################
            """ Manipulate z and z_tilde using the Q_func to mimic ROUGE loss. """
            # with tf.variable_scope("Q_func"):
            #     c_z = Q_func(z, z_target)

            # with tf.variable_scope("Q_func", reuse=True):
            #     c_z_tilde = Q_func(z_tilde, zt_target)

            # Formulate RELAX as a loss function
            # f_y = rouge_loss_soft  # negative for loss (defined above)
            # c_z_tilde1 = tf.stop_gradient(tf.identity(c_z_tilde))  # clone, detach, stop grad
            # L_relax = tf.reduce_sum(((f_y - c_z_tilde1)*logp_b) - c_z_tilde + c_z)

            # OR construct gradient estimator
            # theta = [tv for tv in tf.trainable_variables() if "Q_func" not in tv.name]
            # d_logp_d_theta = tf.gradients(logp_b, theta)[0]  # logp
            # d_c_z_tilde_d_theta = tf.gradients(c_z_tilde, theta)[0]
            # d_c_z_d_theta = tf.gradients(c_z, theta)[0]
            # relax = tf.reduce_sum(f_y - c_z_tilde)*d_logp_d_theta - d_c_z_tilde_d_theta + d_c_z_d_theta

            # relax = tf.gradients(L_relax, theta)[0]

            # Calculate the first optimization step with loss
            # list_of_gradient_variable_pairs = optimizer.compute_gradients(L_relax)
            # train_op = optimizer.apply_gradients(list_of_gradient_variable_pairs, global_step=global_step)

            # Variance reduction objective
            # variance_loss = tf.reduce_mean(tf.square(relax), name="variance_loss")

            # initialise adafactor again for variance optimiser
            # var_opt = adafactor.AdafactorOptimizer(
            #           learning_rate=lr,
            #           decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
            #           beta1=0.0)

            # est_params = [eta, log_temperature]  # TODO: REBAR implementation

            # Adds the parameters of the FFNN
            # nn_params = [tv for tv in tf.trainable_variables() if "Q_func" in tv.name]
            # est_params = nn_params
            # est_params = est_params + nn_params  # TODO: REBAR implementation

            # Additional optimization step
            # var_gradvars = var_opt.compute_gradients(variance_loss, var_list=est_params)
            # var_train_op = var_opt.apply_gradients(var_gradvars)

            # This may allow for both train ops to be passed in the return statement below?
            # with tf.control_dependencies([train_op, var_train_op]):
            #     train_op = tf.no_op()

            ###############################################################################################################
            # Calculate gradients
            # If freezing layers, only optimise wrt certain layers (find names) - speeds up, worsens performance
            # last_params = [tv for tv in tf.trainable_variables() if "decoder/LayerNorm/" in tv.name]
            # list_of_gradient_variable_pairs = optimizer.compute_gradients(combined_loss, var_list=last_params)

            list_of_gradient_variable_pairs = optimizer.compute_gradients(
                XENT_loss)
            train_op = optimizer.apply_gradients(
                list_of_gradient_variable_pairs, global_step=global_step)

            tf.logging.set_verbosity(tf.logging.INFO)
            # Debugging steps - add into logging hook directly if needed
            # tf.debugging.check_numerics(sum_logp, "DEBUG: sum_logp has a NaN")

            logging_hook = tf.train.LoggingTensorHook(
                {
                    "loss": XENT_loss,
                    # "variance_loss": variance_loss,
                    # "cv_loss": cv_loss,
                    "learning_rate": lr,
                    "global_step": global_step,
                },
                every_n_iter=5)

            # This is the configured estimator function that is returned to train the model
            return tpu_estimator.TPUEstimatorSpec(
                mode=mode,
                loss=XENT_loss,
                train_op=train_op,
                training_hooks=[logging_hook],
                scaffold_fn=_load_vars_from_checkpoint(use_tpu,
                                                       train_init_checkpoint),
                host_call=add_scalars_to_summary(
                    model_dir,
                    {
                        "learning_rate": lr,
                        # "rouge_loss_hard": rouge_loss_argmax,
                        # "rouge_loss_soft": rouge_loss_soft,
                        # "rouge_loss_extra": rouge_loss_extra,
                        # "reinforce_loss": reinforce_loss,
                        # "risk_loss": L_risk,
                        # "XENT_loss": XENT_loss,
                    }))

        # EVALUATION (evaluating the performance)
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics = model_params.estimator_eval_metrics_fn(
                features, outputs)
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  loss=XENT_loss,
                                                  eval_metrics=eval_metrics)
Esempio n. 29
0
def inception_model_fn(features, labels, mode, params):
    """Inception v3 model using Estimator API."""
    num_classes = FLAGS.num_classes
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    is_eval = (mode == tf.estimator.ModeKeys.EVAL)
    features = tensor_transform_fn(features, params['input_perm'])

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        if FLAGS.precision == 'bfloat16':
            with bfloat16.bfloat16_scope():
                logits, end_points = inception.inception_v3(
                    features, num_classes, is_training=is_training)
            logits = tf.cast(logits, tf.float32)
        elif FLAGS.precision == 'float32':
            logits, end_points = inception.inception_v3(
                features, num_classes, is_training=is_training)
        return logits, end_points

    if FLAGS.clear_update_collections:
        # updates_collections must be set to None in order to use fused batchnorm
        with arg_scope(
                inception.inception_v3_arg_scope(
                    weight_decay=0.0,
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON,
                    updates_collections=None)):
            logits, end_points = build_network()
    else:
        with arg_scope(
                inception.inception_v3_arg_scope(
                    batch_norm_decay=BATCH_NORM_DECAY,
                    batch_norm_epsilon=BATCH_NORM_EPSILON)):
            logits, end_points = build_network()

    predictions = end_points
    predictions.update({
        'classes':
        tf.argmax(input=logits, axis=1),
        'probabilities':
        tf.nn.softmax(logits, name='softmax_tensor')
    })

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    if mode == tf.estimator.ModeKeys.EVAL and FLAGS.display_tensors and (
            not FLAGS.use_tpu):
        with tf.control_dependencies([
                tf.Print(predictions['classes'], [predictions['classes']],
                         summarize=FLAGS.eval_batch_size,
                         message='prediction: ')
        ]):
            labels = tf.Print(labels, [labels],
                              summarize=FLAGS.eval_batch_size,
                              message='label: ')

    one_hot_labels = tf.one_hot(labels, FLAGS.num_classes, dtype=tf.int32)

    if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                        logits=tf.cast(end_points['AuxLogits'],
                                                       tf.float32),
                                        weights=0.4,
                                        label_smoothing=0.1,
                                        scope='aux_loss')

    tf.losses.softmax_cross_entropy(onehot_labels=one_hot_labels,
                                    logits=logits,
                                    weights=1.0,
                                    label_smoothing=0.1)

    losses = tf.add_n(tf.losses.get_losses())
    l2_loss = []
    for v in tf.trainable_variables():
        if 'BatchNorm' not in v.name and 'weights' in v.name:
            l2_loss.append(tf.nn.l2_loss(v))
    loss = losses + WEIGHT_DECAY * tf.add_n(l2_loss)

    initial_learning_rate = FLAGS.learning_rate * FLAGS.train_batch_size / 256
    if FLAGS.use_learning_rate_warmup:
        # Adjust initial learning rate to match final warmup rate
        warmup_decay = FLAGS.learning_rate_decay**(
            (FLAGS.warmup_epochs + FLAGS.cold_epochs) /
            FLAGS.learning_rate_decay_epochs)
        adj_initial_learning_rate = initial_learning_rate * warmup_decay

    final_learning_rate = 0.0001 * initial_learning_rate

    host_call = None
    train_op = None
    if is_training:
        batches_per_epoch = _NUM_TRAIN_IMAGES / FLAGS.train_batch_size
        global_step = tf.train.get_or_create_global_step()
        current_epoch = tf.cast(
            (tf.cast(global_step, tf.float32) / batches_per_epoch), tf.int32)

        learning_rate = tf.train.exponential_decay(
            learning_rate=initial_learning_rate,
            global_step=global_step,
            decay_steps=int(FLAGS.learning_rate_decay_epochs *
                            batches_per_epoch),
            decay_rate=FLAGS.learning_rate_decay,
            staircase=True)

        if FLAGS.use_learning_rate_warmup:
            wlr = 0.1 * adj_initial_learning_rate
            wlr_height = tf.cast(
                0.9 * adj_initial_learning_rate /
                (FLAGS.warmup_epochs + FLAGS.learning_rate_decay_epochs - 1),
                tf.float32)
            epoch_offset = tf.cast(FLAGS.cold_epochs - 1, tf.int32)
            exp_decay_start = (FLAGS.warmup_epochs + FLAGS.cold_epochs +
                               FLAGS.learning_rate_decay_epochs)
            lin_inc_lr = tf.add(
                wlr,
                tf.multiply(
                    tf.cast(tf.subtract(current_epoch, epoch_offset),
                            tf.float32), wlr_height))
            learning_rate = tf.where(
                tf.greater_equal(current_epoch, FLAGS.cold_epochs),
                (tf.where(tf.greater_equal(current_epoch, exp_decay_start),
                          learning_rate, lin_inc_lr)), wlr)
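            # Net effect: a three-phase schedule. For current_epoch < cold_epochs the
            # rate is held at wlr; from cold_epochs until exp_decay_start it grows
            # linearly by wlr_height per epoch; after that the exponential decay
            # computed above takes over.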

        # Set a minimum boundary for the learning rate.
        learning_rate = tf.maximum(learning_rate,
                                   final_learning_rate,
                                   name='learning_rate')

        if FLAGS.optimizer == 'sgd':
            tf.logging.info('Using SGD optimizer')
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        elif FLAGS.optimizer == 'momentum':
            tf.logging.info('Using Momentum optimizer')
            optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                                   momentum=0.9)
        elif FLAGS.optimizer == 'RMS':
            tf.logging.info('Using RMS optimizer')
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  RMSPROP_DECAY,
                                                  momentum=RMSPROP_MOMENTUM,
                                                  epsilon=RMSPROP_EPSILON)
        else:
            tf.logging.fatal('Unknown optimizer: %s', FLAGS.optimizer)

        if FLAGS.use_tpu:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step=global_step)
        if FLAGS.moving_average:
            ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY,
                                                    num_updates=global_step)
            variables_to_average = (tf.trainable_variables() +
                                    tf.moving_average_variables())
            with tf.control_dependencies([train_op
                                          ]), tf.name_scope('moving_average'):
                train_op = ema.apply(variables_to_average)

        # To log the loss, current learning rate, and epoch for Tensorboard, the
        # summary op needs to be run on the host CPU via host_call. host_call
        # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
        # dimension. These Tensors are implicitly concatenated to
        # [params['batch_size']].
        gs_t = tf.reshape(global_step, [1])
        loss_t = tf.reshape(loss, [1])
        lr_t = tf.reshape(learning_rate, [1])
        ce_t = tf.reshape(current_epoch, [1])

        if not FLAGS.skip_host_call:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide them as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step.
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
                gs = gs[0]
                with summary.create_file_writer(FLAGS.model_dir).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', tf.reduce_mean(loss), step=gs)
                        summary.scalar('learning_rate',
                                       tf.reduce_mean(lr),
                                       step=gs)
                        summary.scalar('current_epoch',
                                       tf.reduce_mean(ce),
                                       step=gs)

                        return summary.all_summary_ops()

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    eval_metrics = None
    if is_eval:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'accuracy': top_1_accuracy,
                'accuracy@5': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          host_call=host_call,
                                          eval_metrics=eval_metrics)
Esempio n. 30
0
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      labels = unstack_batch(labels, unpad_groundtruth_tensors=False)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        asg_map = detection_model.restore_map(
            from_detection_checkpoint=train_config.from_detection_checkpoint,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map, train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()
          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      losses = [loss_tensor for loss_tensor in losses_dict.values()]
      total_loss = tf.add_n(losses, name='total_loss')

    if mode == tf.estimator.ModeKeys.TRAIN:
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

      if use_tpu:
        training_optimizer = tpu_optimizer.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      if train_config.freeze_variables:
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            exclude_patterns=train_config.freeze_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      summaries = [] if use_tpu else None
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(detections)
      }

    eval_metric_ops = None
    if mode == tf.estimator.ModeKeys.EVAL:
      # Detection summaries during eval.
      class_agnostic = (fields.DetectionResultFields.detection_classes
                        not in detections)
      groundtruth = _get_groundtruth_data(detection_model, class_agnostic)
      eval_dict = eval_util.result_dict_for_single_example(
          tf.expand_dims(features[fields.InputDataFields.original_image][0], 0),
          features[inputs.HASH_KEY][0],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=False)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      detection_and_groundtruth = vis_utils.draw_side_by_side_evaluation_image(
          eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2)
      if not use_tpu:
        tf.summary.image('Detections_Left_Groundtruth_Right',
                         detection_and_groundtruth)

      # Eval metrics on a single image.
      detection_fields = fields.DetectionResultFields()
      input_data_fields = fields.InputDataFields()
      coco_evaluator = coco_evaluation.CocoDetectionEvaluator(
          category_index.values())
      eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops(
          image_id=eval_dict[input_data_fields.key],
          groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes],
          groundtruth_classes=eval_dict[input_data_fields.groundtruth_classes],
          detection_boxes=eval_dict[detection_fields.detection_boxes],
          detection_scores=eval_dict[detection_fields.detection_scores],
          detection_classes=eval_dict[detection_fields.detection_classes])

    if use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs)