Example #1
0
def define_ppo_step(data_points, hparams, action_space, lr):
    """Build the three PPO loss terms and attach one optimizer step to them.

    Args:
      data_points: sequence unpacked as (observation, action,
        discounted_reward, norm_advantage, old_pdf).
      hparams: hyperparameters; this function reads clipping_coef,
        value_loss_coef, entropy_loss_coef and policy_network.
      action_space: object whose `n` attribute sizes the last logits axis.
      lr: learning rate handed to optimize.optimize.

    Returns:
      [policy_loss, value_loss, entropy_loss], each wrapped in tf.identity
      under a control dependency on the training op.
    """
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    # The policy is evaluated with the two leading observation axes merged;
    # its outputs are reshaped back onto those two axes afterwards.
    full_shape = common_layers.shape_list(observation)
    leading_dims = full_shape[:2]
    merged_observation = tf.reshape(
        observation, [full_shape[0] * full_shape[1]] + full_shape[2:])
    logits, new_value = get_policy(merged_observation, hparams, action_space)
    logits = tf.reshape(logits, leading_dims + [action_space.n])
    new_value = tf.reshape(new_value, leading_dims)
    policy_dist = tf.distributions.Categorical(logits=logits)

    # Clipped surrogate objective on the new/old action-probability ratio.
    ratio = policy_dist.prob(action) / old_pdf
    clip = hparams.clipping_coef
    clipped_ratio = tf.clip_by_value(ratio, 1 - clip, 1 + clip)
    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    # Mean squared error between the predicted value and discounted reward.
    squared_error = (new_value - discounted_reward)**2
    value_loss = hparams.value_loss_coef * tf.reduce_mean(squared_error)

    # Negative sign: a higher policy entropy lowers the total loss.
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
        policy_dist.entropy())

    losses = [policy_loss, value_loss, entropy_loss]
    total_loss = sum(losses)
    # Only variables whose names match "<policy_network>/.*" are optimized.
    policy_variables = tf.global_variables(hparams.policy_network + "/.*")
    train_op = optimize.optimize(
        total_loss, lr, hparams, variables=policy_variables)

    # The returned tensors are created under a control dependency on train_op.
    with tf.control_dependencies([train_op]):
        outputs = []
        for term in losses:
            outputs.append(tf.identity(term))
        return outputs
 def optimize(self, loss, num_async_replicas=1):
   """Build the training op that minimizes `loss`.

   The learning rate is hparams.learning_rate times the decay factor from
   optimize.learning_rate_decay, divided by sqrt(num_async_replicas).

   Args:
     loss: loss tensor to minimize.
     num_async_replicas: replica count used to scale the learning rate.

   Returns:
     The op returned by optimize.optimize.
   """
   hparams = self.hparams
   on_tpu = hparams.use_tpu
   decayed_lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
   scaled_lr = decayed_lr / math.sqrt(float(num_async_replicas))
   return optimize.optimize(loss, scaled_lr, hparams, use_tpu=on_tpu)
Example #3
0
def define_ppo_step(data_points, hparams, action_space, lr):
    """Build the three PPO loss terms and attach one optimizer step to them.

    Args:
      data_points: sequence unpacked as (observation, action,
        discounted_reward, norm_advantage, old_pdf).
      hparams: hyperparameters; this function reads clipping_coef,
        value_loss_coef and entropy_loss_coef.
      action_space: forwarded unchanged to get_policy.
      lr: learning rate handed to optimize.optimize.

    Returns:
      [policy_loss, value_loss, entropy_loss], each wrapped in tf.identity
      under a control dependency on the training op.
    """
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    logits, new_value = get_policy(observation, hparams, action_space)
    policy_dist = tfp.distributions.Categorical(logits=logits)

    # Clipped surrogate objective on the new/old action-probability ratio.
    ratio = policy_dist.prob(action) / old_pdf
    clip = hparams.clipping_coef
    clipped_ratio = tf.clip_by_value(ratio, 1 - clip, 1 + clip)
    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    # Mean squared error between the predicted value and discounted reward.
    squared_error = (new_value - discounted_reward)**2
    value_loss = hparams.value_loss_coef * tf.reduce_mean(squared_error)

    # Negative sign: a higher policy entropy lowers the total loss.
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
        policy_dist.entropy())

    losses = [policy_loss, value_loss, entropy_loss]
    train_op = optimize.optimize(sum(losses), lr, hparams)

    # The returned tensors are created under a control dependency on train_op.
    with tf.control_dependencies([train_op]):
        outputs = []
        for term in losses:
            outputs.append(tf.identity(term))
        return outputs
Example #4
0
def define_ppo_step(data_points,
                    hparams,
                    action_space,
                    lr,
                    distributional_size=1,
                    distributional_subscale=0.04):
    """Build the PPO loss terms, optionally with a distributional value loss.

    Args:
      data_points: sequence unpacked as (observation, action,
        discounted_reward, norm_advantage, old_pdf).
      hparams: hyperparameters; this function reads clipping_coef,
        value_loss_coef, entropy_loss_coef and policy_network.
      action_space: object whose `n` attribute sizes the last logits axis.
      lr: learning rate handed to optimize.optimize.
      distributional_size: when greater than 1, the value head is treated as
        log-probabilities over this many bins; otherwise as a scalar value.
      distributional_subscale: width of one bin when quantizing the
        discounted reward.

    Returns:
      [policy_loss, value_loss, entropy_loss], each wrapped in tf.identity
      under a control dependency on the training op.
    """
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    # The policy is evaluated with the two leading observation axes merged;
    # its outputs are reshaped back onto those two axes afterwards.
    full_shape = common_layers.shape_list(observation)
    leading_dims = full_shape[:2]
    merged_observation = tf.reshape(
        observation, [full_shape[0] * full_shape[1]] + full_shape[2:])
    logits, new_value = get_policy(
        merged_observation,
        hparams,
        action_space,
        distributional_size=distributional_size)
    logits = tf.reshape(logits, leading_dims + [action_space.n])
    policy_dist = tfp.distributions.Categorical(logits=logits)

    # Clipped surrogate objective on the new/old action-probability ratio.
    ratio = policy_dist.prob(action) / old_pdf
    clip = hparams.clipping_coef
    clipped_ratio = tf.clip_by_value(ratio, 1 - clip, 1 + clip)
    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    if distributional_size > 1:
        # Cross-entropy between the predicted bin distribution and the bin
        # that the discounted reward falls into.
        bin_log_probs = tf.nn.log_softmax(
            tf.reshape(new_value, leading_dims + [distributional_size]),
            axis=-1)
        # Rewards are assumed to lie in (-half_range, half_range); shifting
        # by half_range moves them to (0, 2 * half_range), and dividing by
        # the bin width followed by floor yields an integer bin index.
        half_range = (distributional_size // 2) * distributional_subscale
        bin_index = tf.cast(
            tf.floor(
                (discounted_reward + half_range) / distributional_subscale),
            tf.int32)
        target_one_hot = tf.one_hot(bin_index, distributional_size)
        unreduced_value_loss = -tf.reduce_sum(
            bin_log_probs * target_one_hot, axis=-1)
    else:
        # Scalar value head: squared error against the discounted reward.
        new_value = tf.reshape(new_value, leading_dims)
        unreduced_value_loss = (new_value - discounted_reward)**2
    value_loss = hparams.value_loss_coef * tf.reduce_mean(unreduced_value_loss)

    # Negative sign: a higher policy entropy lowers the total loss.
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(
        policy_dist.entropy())

    losses = [policy_loss, value_loss, entropy_loss]
    total_loss = sum(losses)
    # Only variables whose names match "<policy_network>/.*" are optimized.
    policy_variables = tf.global_variables(hparams.policy_network + "/.*")
    train_op = optimize.optimize(
        total_loss, lr, hparams, variables=policy_variables)

    # The returned tensors are created under a control dependency on train_op.
    with tf.control_dependencies([train_op]):
        outputs = []
        for term in losses:
            outputs.append(tf.identity(term))
        return outputs
 def optimize(self, loss, num_async_replicas=1, use_tpu=False, variables=None):
   """Build the training op that minimizes `loss`.

   The scheduled learning rate is divided by sqrt(num_async_replicas); a log
   line is emitted only when more than one replica is in use.

   Args:
     loss: loss tensor to minimize.
     num_async_replicas: replica count used to scale the learning rate.
     use_tpu: forwarded to optimize.optimize.
     variables: forwarded to optimize.optimize.

   Returns:
     The op returned by optimize.optimize.
   """
   hparams = self.hparams
   scheduled_lr = ops.learning_rate_schedule(hparams)
   multiple_replicas = num_async_replicas > 1
   if multiple_replicas:
     t2t_model.log_info("Dividing learning rate by num_async_replicas: %d",
                        num_async_replicas)
   scaled_lr = scheduled_lr / math.sqrt(float(num_async_replicas))
   return optimize.optimize(
       loss, scaled_lr, hparams, use_tpu=use_tpu, variables=variables)
Example #6
0
 def model_fn(features, labels, mode):
     """Return the tf.estimator.EstimatorSpec for `mode`.

     hparams, decode_length, FLAGS, compute_seq_accuracy,
     compute_additional_loss_fn and compute_additional_metric_fn are not
     defined in this function; they are resolved from an enclosing scope.

     Args:
       features: dict of input tensors; "input_refs" is read here and the
         whole dict is forwarded to the model graph.
       labels: ignored (deleted immediately).
       mode: a tf.estimator.ModeKeys value.

     Returns:
       An EstimatorSpec carrying predictions (PREDICT), loss and eval metric
       ops (EVAL), or loss and train op (TRAIN).
     """
     del labels
     mode_keys = tf.estimator.ModeKeys

     # Count the input_refs entries whose index-1 value on the last axis
     # exceeds the index-0 value, and expose that count as a summary.
     refs = features["input_refs"]
     is_counted = tf.greater(refs[:, :, 1], refs[:, :, 0])
     tf.summary.scalar("input_count", tf.reduce_sum(tf.to_int32(is_counted)))

     loss_dict, pred_dict, areas = seq2act_model.core_graph(
         features, hparams, mode, compute_additional_loss_fn)

     if mode == mode_keys.PREDICT:
         pred_dict["sequences"] = decode_sequence(
             features,
             areas,
             hparams,
             decode_length,
             post_processing=FLAGS.post_processing)
         return tf.estimator.EstimatorSpec(mode, predictions=pred_dict)

     if mode == mode_keys.EVAL:
         eval_metrics = {}
         _eval(eval_metrics,
               pred_dict,
               loss_dict,
               features,
               areas,
               compute_seq_accuracy,
               hparams,
               metric_types=FLAGS.metric_types.split(","),
               decode_length=decode_length)
         if compute_additional_metric_fn:
             compute_additional_metric_fn(eval_metrics, pred_dict, features)
         return tf.estimator.EstimatorSpec(
             mode, loss=loss_dict["total_loss"], eval_metric_ops=eval_metrics)

     assert mode == mode_keys.TRAIN
     total_loss = loss_dict["total_loss"]
     # Summarize every loss except the total and any key ending in "losses".
     for loss_name in loss_dict:
         skipped = loss_name == "total_loss" or loss_name.endswith("losses")
         if not skipped:
             tf.summary.scalar(loss_name, loss_dict[loss_name])

     # The learning rate is the product of the factors named in the
     # "*"-separated schedule string; blank names are dropped.
     global_step = tf.to_float(tf.train.get_global_step())
     stripped_names = [
         piece.strip() for piece in hparams.learning_rate_schedule.split("*")
     ]
     lr = tf.constant(1.0)
     for factor_name in stripped_names:
         if not factor_name:
             continue
         lr = lr * learning_rate.learning_rate_factor(
             factor_name, global_step, hparams)
     train_op = optimize.optimize(total_loss, lr, hparams)
     return tf.estimator.EstimatorSpec(
         mode, loss=total_loss, train_op=train_op)
Example #7
0
 def optimize(self, loss, num_async_replicas=1):
   """Build the training op that minimizes `loss`.

   Logs the base learning rate, then divides the scheduled learning rate by
   sqrt(num_async_replicas) before handing it to optimize.optimize.

   Args:
     loss: loss tensor to minimize.
     num_async_replicas: replica count used to scale the learning rate.

   Returns:
     The op returned by optimize.optimize.
   """
   hparams = self.hparams
   log_info("Base learning rate: %f", hparams.learning_rate)
   scheduled_lr = learning_rate.learning_rate_schedule(hparams)
   multiple_replicas = num_async_replicas > 1
   if multiple_replicas:
     log_info("Dividing learning rate by num_async_replicas: %d",
              num_async_replicas)
   scaled_lr = scheduled_lr / math.sqrt(float(num_async_replicas))
   return optimize.optimize(
       loss, scaled_lr, hparams, use_tpu=common_layers.is_on_tpu())
Example #8
0
 def optimize(self, loss, num_async_replicas=1):
   """Create the op that performs one minimization step on `loss`.

   The scheduled learning rate is scaled down by sqrt(num_async_replicas);
   the base rate is always logged, the scaling only when replicas > 1.

   Args:
     loss: loss tensor to minimize.
     num_async_replicas: replica count used to scale the learning rate.

   Returns:
     The op returned by optimize.optimize.
   """
   model_hparams = self.hparams
   log_info("Base learning rate: %f", model_hparams.learning_rate)
   base_lr = learning_rate.learning_rate_schedule(model_hparams)
   if not num_async_replicas <= 1:
     log_info("Dividing learning rate by num_async_replicas: %d",
              num_async_replicas)
   replica_divisor = math.sqrt(float(num_async_replicas))
   on_tpu = common_layers.is_on_tpu()
   return optimize.optimize(
       loss, base_lr / replica_divisor, model_hparams, use_tpu=on_tpu)
Example #9
0
  def estimator_spec_train(self, loss, use_tpu=False):
    """Build the TRAIN-mode EstimatorSpec for `loss`.

    Args:
      loss: loss tensor to minimize and report.
      use_tpu: when truthy, summaries are removed and a TPUEstimatorSpec is
        returned instead of a plain EstimatorSpec.

    Returns:
      The spec, carrying the loss and the train op from optimize.optimize.
    """
    hparams = self.hparams
    decayed_lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
    train_op = optimize.optimize(loss, decayed_lr, hparams, use_tpu=use_tpu)

    if use_tpu:
      _remove_summaries()  # summaries not currently working on TPU
      spec_cls = tf.contrib.tpu.TPUEstimatorSpec
    else:
      spec_cls = tf.estimator.EstimatorSpec
    return spec_cls(
        tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
Example #10
0
    def estimator_spec_train(self, loss, use_tpu=False):
        """Assemble the EstimatorSpec used in TRAIN mode.

        Args:
          loss: loss tensor to minimize and report.
          use_tpu: when truthy, summaries are removed and a TPUEstimatorSpec
            is returned; otherwise a plain EstimatorSpec.

        Returns:
          The spec, carrying the loss and the train op from
          optimize.optimize.
        """
        model_hparams = self.hparams
        decay = optimize.learning_rate_decay(model_hparams)
        train_op = optimize.optimize(loss,
                                     model_hparams.learning_rate * decay,
                                     model_hparams,
                                     use_tpu=use_tpu)
        train_mode = tf.estimator.ModeKeys.TRAIN

        if not use_tpu:
            return tf.estimator.EstimatorSpec(train_mode,
                                              loss=loss,
                                              train_op=train_op)

        _remove_summaries()  # summaries not currently working on TPU
        return tf.contrib.tpu.TPUEstimatorSpec(train_mode,
                                               loss=loss,
                                               train_op=train_op)
Example #11
0
 def _test_resnet(self, img_size, output_size):
     """Build a ShakeShake model on one random image and print its FLOP count.

     Prints the logits' static shape and the total float-operation count
     reported by tf.profiler for the default graph.

     Args:
       img_size: height and width of the random square RGB input image.
       output_size: not referenced anywhere in this function body.
     """
     vocab_size = 1
     batch_size = 1
     # Random pixel values in [0, 255] with shape (batch, img_size, img_size, 3).
     x = np.random.random_integers(0,
                                   high=255,
                                   size=(batch_size, img_size, img_size, 3))
     # Random labels in [1, vocab_size] with shape (batch, 1, 1, 1).
     y = np.random.random_integers(1,
                                   high=vocab_size,
                                   size=(batch_size, 1, 1, 1))
     #hparams = resnet_tiny_cpu()
     #hparams = resnet_50()
     hparams = resnet_32()
     p_hparams = problem_hparams.test_problem_hparams(
         vocab_size, vocab_size, hparams)
     # Override the test problem's modalities with image input / class-label target.
     p_hparams.input_modality["inputs"] = modalities.ImageModality(hparams)
     p_hparams.target_modality = modalities.ClassLabelModality(
         hparams, vocab_size)
     run_meta = tf.RunMetadata()
     with self.test_session() as session:
         features = {
             "inputs": tf.constant(x, dtype=tf.int32),
             "targets": tf.constant(y, dtype=tf.int32),
         }
         #model = resnet.Resnet(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)
         model = shake_shake.ShakeShake(hparams,
                                        tf.estimator.ModeKeys.TRAIN,
                                        p_hparams)
         logits, _ = model(features)
         print(logits.get_shape())
         #opts = tf.profiler.ProfileOptionBuilder.float_operation()
         #flops = tf.profiler.profile(tf.get_default_graph(), run_meta=run_meta, options=opts)
         #print(flops.total_float_ops)
         # NOTE(review): variables are initialized here, before the optimizer
         # is built below; any variables optimize.optimize creates would not
         # be covered by this initializer run -- confirm this is intended.
         session.run(tf.global_variables_initializer())
         #res = session.run(logits)
         tf.get_variable_scope().set_initializer(
             optimize.get_variable_initializer(hparams))
         loss = tf.losses.sparse_softmax_cross_entropy(labels=tf.constant(
             0, dtype=tf.int32, shape=[1, 1, 1, 1, 1]),
                                                       logits=logits)
         # train_op is only built, never run, in this function; building it
         # adds the optimizer's ops to the graph that is profiled below.
         train_op = optimize.optimize(loss, 0.1, hparams)
         session.run(loss)
         # Profile float operations of the whole default graph and print the total.
         opts = tf.profiler.ProfileOptionBuilder.float_operation()
         flops = tf.profiler.profile(tf.get_default_graph(),
                                     run_meta=run_meta,
                                     options=opts)
         print(flops.total_float_ops)
Example #12
0
 def optimize(self, loss, num_async_replicas=1):
     """Return a training op minimizing loss.

     The learning rate is hparams.learning_rate times the schedule factor,
     optionally floored at hparams.learning_rate_minimum, and finally divided
     by sqrt(num_async_replicas).

     Args:
       loss: loss tensor to minimize.
       num_async_replicas: replica count used to scale the learning rate.

     Returns:
       The op returned by optimize.optimize.
     """
     tf.logging.info("Base learning rate: %f", self.hparams.learning_rate)
     lr = self.hparams.learning_rate
     decay_rate = optimize.learning_rate_schedule(self.hparams)
     lr *= decay_rate
     if self.hparams.learning_rate_minimum:
         lr_min = float(self.hparams.learning_rate_minimum)
         tf.logging.info("Applying learning rate minimum: %f", lr_min)
         # TensorFlow has no `tf.max`; the element-wise maximum that floors
         # the learning rate at lr_min is `tf.maximum`.
         lr = tf.maximum(lr, tf.to_float(lr_min))
     if num_async_replicas > 1:
         tf.logging.info("Dividing learning rate by num_async_replicas: %d",
                         num_async_replicas)
     # The division is by the square root of the replica count.
     lr /= math.sqrt(float(num_async_replicas))
     train_op = optimize.optimize(loss,
                                  lr,
                                  self.hparams,
                                  use_tpu=common_layers.is_on_tpu())
     return train_op
Example #13
0
 def optimize(self, loss, use_tpu=False):
   """Build the training op that minimizes `loss`.

   Args:
     loss: loss tensor to minimize.
     use_tpu: forwarded to optimize.optimize.

   Returns:
     The op returned by optimize.optimize, using hparams.learning_rate
     multiplied by the factor from optimize.learning_rate_decay.
   """
   hparams = self.hparams
   decayed_lr = hparams.learning_rate * optimize.learning_rate_decay(hparams)
   return optimize.optimize(loss, decayed_lr, hparams, use_tpu=use_tpu)
Example #14
0
 def optimize(self, loss, use_tpu=False):
     """Create the op that performs one minimization step on `loss`.

     Args:
       loss: loss tensor to minimize.
       use_tpu: forwarded to optimize.optimize.

     Returns:
       The op returned by optimize.optimize, using hparams.learning_rate
       multiplied by the factor from optimize.learning_rate_decay.
     """
     model_hparams = self.hparams
     decay = optimize.learning_rate_decay(model_hparams)
     return optimize.optimize(loss,
                              model_hparams.learning_rate * decay,
                              model_hparams,
                              use_tpu=use_tpu)
Example #15
0
def define_ppo_step(data_points, hparams, action_space, lr, epoch=-1,
                    distributional_size=1, distributional_subscale=0.04):
  """Define ppo step.

  Args:
    data_points: sequence unpacked as (observation, action, discounted_reward,
      discounted_reward_probs, norm_advantage, old_pdf).
    hparams: hyperparameters; this function reads clipping_coef,
      value_loss_coef, entropy_loss_coef and policy_network.
    action_space: object whose `n` attribute sizes the last logits axis.
    lr: learning rate handed to optimize.optimize.
    epoch: forwarded to get_policy.
    distributional_size: when greater than 1, the value head is treated as
      log-probabilities over bins; otherwise as a scalar value.
    distributional_subscale: accepted but unused (deleted immediately).

  Returns:
    [policy_loss, value_loss, entropy_loss], each wrapped in tf.identity under
    a control dependency on the training op.
  """
  del distributional_subscale
  (observation, action, discounted_reward, discounted_reward_probs,
   norm_advantage, old_pdf) = data_points

  # Merge the two leading observation axes for the policy call; the policy
  # outputs are reshaped back onto those two axes below.
  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:]
  )
  (logits, new_value) = get_policy(observation, hparams, action_space,
                                   epoch=epoch,
                                   distributional_size=distributional_size)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  # Clipped surrogate objective on the new/old action-probability ratio.
  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  if distributional_size > 1:
    new_value = tf.reshape(new_value, obs_shape[:2] + [distributional_size])
    new_value = tf.nn.log_softmax(new_value, axis=-1)
    value_shape = common_layers.shape_list(new_value)
    # The above is the new value distribution. We are also given as discounted
    # reward the value distribution and the corresponding probabilities.
    # The given discounted reward is already rounded to integers but in range
    # increased by 2x for greater fidelity. Increase range of new_values here.
    # NOTE(review): the shift below slices and concatenates along axis 0 (the
    # first of the two leading axes), not along the last (bin) axis that is
    # doubled afterwards -- confirm this is the intended neighbour to average.
    new_value_shifted = tf.concat([new_value[1:], new_value[-1:]], axis=0)
    new_value_mean = (new_value + new_value_shifted) / 2
    # Interleave each log-prob with its averaged counterpart so the last axis
    # becomes twice as long.
    new_value = tf.concat([tf.expand_dims(new_value, axis=-1),
                           tf.expand_dims(new_value_mean, axis=-1)], -1)
    new_value = tf.reshape(new_value, value_shape[:-1] + [2 * value_shape[-1]])
    # Cast discounted reward to integers and gather the new log-probs for them.
    discounted_reward = tf.cast(discounted_reward, tf.int32)
    value_loss = tf.batch_gather(new_value, discounted_reward)
    # Weight the gathered (new) log-probs by the old probabilities.
    discounted_reward_probs = tf.expand_dims(discounted_reward_probs, axis=1)
    value_loss = - tf.reduce_sum(value_loss * discounted_reward_probs, axis=-1)
    # Take the mean over batch and time as final loss, multiply by coefficient.
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
  else:
    # Scalar value head: mean squared error against the discounted reward.
    new_value = tf.reshape(new_value, obs_shape[:2])
    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  # Negative sign: a higher policy entropy lowers the total loss.
  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)
  # Only variables whose names match "<policy_network>/.*" are optimized.
  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  # The returned tensors are created under a control dependency on train_op.
  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
    def build_model(self):
        """Build the query/reply matching graph and store its tensors on self.

        Creates the vocabulary lookup table, input placeholders (defaulting to
        the next batch of the data iterator), a shared embedding, a
        Transformer encoder applied to both queries and replies, a dense
        projection of the query side, pairwise query/reply scores, the
        sigmoid cross-entropy loss with its training op, and an accuracy
        tensor.
        """
        # build index table
        index_table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=self.config.vocab_list,
            num_oov_buckets=0,
            default_value=0)

        # get data iterator
        self.data_iterator = self.data.get_data_iterator(index_table,
                                                         mode=self.mode)

        # get inputs
        with tf.variable_scope("inputs"):
            # fall back to the iterator's next batch when no data is fed
            next_batch = self.data_iterator.get_next()
            self.input_queries = tf.placeholder_with_default(
                next_batch["input_queries"], [None, self.config.max_length],
                name="input_queries")
            self.input_replies = tf.placeholder_with_default(
                next_batch["input_replies"], [None, self.config.max_length],
                name="input_replies")
            self.query_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["query_lengths"]), [None],
                name="query_lengths")
            self.reply_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["reply_lengths"]), [None],
                name="reply_lengths")

            # get hyperparams
            # NOTE(review): this keep-prob is float64 while the other two are
            # float32; presumably it matches the dtype of the embedding
            # variable it is applied to -- confirm against get_embeddings.
            self.embed_dropout_keep_prob = tf.placeholder(
                tf.float64, name="embed_dropout_keep_prob")
            self.lstm_dropout_keep_prob = tf.placeholder(
                tf.float32, name="lstm_dropout_keep_prob")
            self.dense_dropout_keep_prob = tf.placeholder(
                tf.float32, name="dense_dropout_keep_prob")
            self.num_negative_samples = tf.placeholder(
                tf.int32, name="num_negative_samples")

        with tf.variable_scope("properties"):
            # length properties
            cur_batch_length = tf.shape(self.input_queries)[0]

            # get hparams from tensor2tensor.models.transformer
            hparams = transformer.transformer_small()
            hparams.batch_size = self.config.batch_size

            # learning rate
            # (computed from this hparams object; hparams is rebuilt in the
            # "transformer" scope below, and that later object is the one
            # passed to optimize.optimize)
            lr = learning_rate.learning_rate_schedule(hparams)

        # embedding layer
        with tf.variable_scope("embedding"):
            embeddings = tf.Variable(get_embeddings(
                self.config.vocab_list, self.config.pretrained_embed_dir,
                self.config.vocab_size, self.config.embed_dim),
                                     trainable=True,
                                     name="embeddings")
            # noise_shape [rows, 1] drops whole embedding rows at once
            embeddings = tf.nn.dropout(
                embeddings,
                keep_prob=self.embed_dropout_keep_prob,
                noise_shape=[tf.shape(embeddings)[0], 1])
            queries_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_queries,
                                       name="queries_embedded"))
            replies_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_replies,
                                       name="replies_embedded"))

            self.queries_embedded = queries_embedded
            self.replies_embedded = replies_embedded

        # transformer layer
        with tf.variable_scope("transformer"):
            # insert a singleton axis at position 2 before calling the encoder
            queries_expanded = tf.expand_dims(queries_embedded,
                                              axis=2,
                                              name="queries_expanded")
            replies_expanded = tf.expand_dims(replies_embedded,
                                              axis=2,
                                              name="replies_expanded")

            # fresh hparams with hidden_size tied to the embedding dimension
            hparams = transformer.transformer_small()
            hparams.set_hparam("batch_size", self.config.batch_size)
            hparams.set_hparam("hidden_size", self.config.embed_dim)
            encoder = transformer.TransformerEncoder(hparams, mode=self.mode)

            # the same encoder object is called on queries and on replies
            self.queries_encoded = encoder({
                "inputs": queries_expanded,
                "targets": queries_expanded
            })[0]
            self.replies_encoded = encoder({
                "inputs": replies_expanded,
                "targets": replies_expanded
            })[0]

            # max-pool with a window of max_length along axis 1
            self.queries_pooled = tf.nn.max_pool(
                self.queries_encoded,
                ksize=[1, self.config.max_length, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="queries_pooled")
            self.replies_pooled = tf.nn.max_pool(
                self.replies_encoded,
                ksize=[1, self.config.max_length, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="replies_pooled")

            # collapse everything but the batch axis
            self.queries_flattened = tf.reshape(self.queries_pooled,
                                                [cur_batch_length, -1])
            self.replies_flattened = tf.reshape(self.replies_pooled,
                                                [cur_batch_length, -1])

        # build dense layer
        with tf.variable_scope("dense_layer"):
            M = tf.get_variable(
                "M",
                shape=[self.config.embed_dim, self.config.embed_dim],
                initializer=tf.initializers.truncated_normal())
            M = tf.nn.dropout(M, self.dense_dropout_keep_prob)
            self.queries_transformed = tf.matmul(self.queries_flattened, M)

        with tf.variable_scope("sampling"):
            # score of every transformed query against every reply in the batch
            self.distances = tf.matmul(self.queries_transformed,
                                       self.replies_flattened,
                                       transpose_b=True)
            # the diagonal (query i with reply i) marks the positive pairs
            positive_mask = tf.reshape(tf.eye(cur_batch_length), [-1])
            negative_mask = tf.reshape(
                make_negative_mask(
                    self.distances,
                    method=self.config.negative_sampling,
                    num_negative_samples=self.num_negative_samples), [-1])

        with tf.variable_scope("prediction"):
            # pick the flattened scores selected by each mask
            distances_flattened = tf.reshape(self.distances, [-1])
            self.positive_logits = tf.gather(distances_flattened,
                                             tf.where(positive_mask), 1)
            self.negative_logits = tf.gather(distances_flattened,
                                             tf.where(negative_mask), 1)

            # positives first, then negatives; labels are 1 and 0 respectively
            self.logits = tf.concat(
                [self.positive_logits, self.negative_logits], axis=0)
            self.labels = tf.concat([
                tf.ones_like(self.positive_logits),
                tf.zeros_like(self.negative_logits)
            ],
                                    axis=0)

            self.positive_probs = tf.sigmoid(self.positive_logits)

            self.probs = tf.sigmoid(self.logits)
            # predict class 1 when the sigmoid probability exceeds 0.5
            self.predictions = tf.cast(self.probs > 0.5, dtype=tf.int32)

        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels,
                                                        logits=self.logits))
            self.train_step = optimize.optimize(self.loss,
                                                lr,
                                                hparams,
                                                use_tpu=False)

        with tf.variable_scope("score"):
            # fraction of predictions that equal the 0/1 labels
            correct_predictions = tf.equal(self.predictions,
                                           tf.to_int32(self.labels))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")
Example #17
0
    def __init__(self, gpu, checkpoints, config=None):
        """Build the training and inference graphs and open a TF session.

        Args:
          gpu: GPU identifier, or None to place the graph on the CPU. When
            not None it is also written to the session's visible device list.
          checkpoints: object providing `restorer()` and `hparams`.
          config: optional object whose `settings` attribute is used; when
            None a default TransformerDecoder.Settings() is created.
        """
        self._logger = logging.getLogger('TransformerDecoder')
        self._settings = config.settings if config is not None else TransformerDecoder.Settings(
        )
        self._checkpoints = checkpoints
        self._checkpoint = None
        self._nn_needs_reset = True

        # The graph is always pinned to GPU:0 when a gpu is given; the actual
        # physical device is selected through visible_device_list below.
        with tf.device('/device:GPU:0' if gpu is not None else '/cpu:0'):
            self._restorer = checkpoints.restorer()

            # Prepare features for feeding into the model.
            self._ph_decode_length = tf.placeholder(dtype=tf.int32)
            self._ph_infer_inputs = tf.placeholder(dtype=tf.int32)
            # NOTE(review): these reshapes use two -1 dimensions; presumably
            # the reshaped tensors are fed directly rather than computed from
            # the inner placeholders -- confirm against the training code.
            self._ph_train_inputs = tf.reshape(tf.placeholder(dtype=tf.int32),
                                               shape=[-1, -1, 1, 1])
            self._ph_train_targets = tf.reshape(tf.placeholder(dtype=tf.int32),
                                                shape=[-1, -1, 1, 1])
            self._ph_learning_rate = tf.placeholder(tf.float32, [],
                                                    name='learning_rate')

            # Prepare the model for training
            self._model = registry.model('transformer')(
                self._checkpoints.hparams, tf.estimator.ModeKeys.TRAIN)

            _, losses = self._model({
                "inputs": self._ph_train_inputs,
                "targets": self._ph_train_targets
            })

            self._loss = losses['training']
            # The learning rate is a placeholder, so it is supplied per run.
            self._train_op = optimize.optimize(
                self._loss,
                self._ph_learning_rate,
                self._model.hparams,
                use_tpu=common_layers.is_on_tpu())

            # Variables created above are reused by the inference graph below.
            tf.get_variable_scope().reuse_variables()

            # Prepare the model for infer
            # One encoder-decoder attention tensor per hidden layer.
            self._attention_mats_op = [
                self._model.attention_weights[
                    'transformer/body/decoder/layer_%i/encdec_attention/multihead_attention/dot_product_attention'
                    % i] for i in xrange(self._model.hparams.num_hidden_layers)
            ]

            # NOTE(review): `_predictions_ops` (plural) is set to an empty
            # list here and not touched again in this method; the populated
            # attribute below is `_predictions_op` (singular).
            self._predictions_ops = []
            infer_inputs = tf.reshape(self._ph_infer_inputs,
                                      [1, -1, 1, 1])  # Make it 4D.
            infer_out = self._model.infer({"inputs": infer_inputs},
                                          beam_size=4,
                                          top_beams=1,
                                          alpha=0.6,
                                          decode_length=self._ph_decode_length)

            self._predictions_op = {
                "outputs": infer_out["outputs"],
                "inputs": infer_inputs,
            }

        # Soft placement lets ops fall back to another device when needed;
        # GPU memory is allocated on demand.
        session_config = tf.ConfigProto(allow_soft_placement=True)
        session_config.gpu_options.allow_growth = True
        if gpu is not None:
            session_config.gpu_options.force_gpu_compatible = True
            session_config.gpu_options.visible_device_list = str(gpu)

        self._session = tf.Session(config=session_config)

        # Init model
        self._warmup()
Example #18
0
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

    * TRAIN: Constructs loss and train_op
    * EVAL: Constructs the loss and eval metrics
    * PREDICT: Constructs the predictions

    In TRAIN mode the loss is additionally augmented with optional weight
    decay, optional weight noise is applied to "body/" variables, and the
    loss is scaled down for batches with few non-padding target tokens.
    When `hparams.update_delib_only` is set, only trainable variables whose
    name contains "delib" or "softmax" take part in decay/noise and are
    handed to `optimize.optimize`.

    Args:
      model: str, name of model.
      features: dict<feature name, Tensor>. Expected to have keys
        {inputs, targets, problem_choice}.
      mode: tf.estimator.ModeKeys.
      hparams: model HParams.
      problem_names: list of str, names of the problems. Only its length is
        checked against `hparams.problem_instances`.
      train_steps: int, total number of training steps. Used to compute
        learning rate decay.
      worker_id: int, id of this worker.
      worker_replicas: int, number of workers.
      eval_run_autoregressive: bool, whether to run evaluation
        autoregressively.
      decode_hparams: HParams for decode settings. Used when
        mode == PREDICT.

    Returns:
      tf.estimator.EstimatorSpec

    Raises:
      AssertionError: if `problem_names` and `hparams.problem_instances`
        differ in length, or if `mode` is none of PREDICT, EVAL or TRAIN.
    """
    assert len(problem_names) == len(hparams.problem_instances)
    decode_hp = decode_hparams
    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism()

    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    # Only tf.Tensor features of rank > 1 are summarized; id 0 is counted as
    # padding.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(nonpadding)
                if k == "targets":
                    # NOTE(review): this name is only bound when `features`
                    # holds a rank>1 tf.Tensor under "targets"; the TRAIN
                    # path below reads it unconditionally.
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k,
                                  nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    # Filled in by nth_model() with the names of the per-loss moving-average
    # variables so the training summaries below can look them up again.
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables."""
        model_class = registry.model(model)(
            hparams,
            mode,
            hparams.problems[n],
            n,
            dp,
            devices.ps_devices(all_workers=True),
            decode_hparams=decode_hparams)
        if mode == tf.estimator.ModeKeys.PREDICT:
            # Prediction returns the raw infer() result; none of the loss
            # bookkeeping below is built in this mode.
            return model_class.infer(features,
                                     beam_size=decode_hp.beam_size,
                                     top_beams=(decode_hp.beam_size if
                                                decode_hp.return_beams else 1),
                                     alpha=decode_hp.alpha,
                                     decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
            sharded_logits, losses_dict = model_class.eval_autoregressive(
                features)
        else:
            sharded_logits, losses_dict = model_class.model_fn(
                features, skip=(skipping_is_on and skip_this_one))
        with tf.variable_scope("losses_avg"):
            # Keep an exponential moving average (0.9 old / 0.1 new, starting
            # at 100.0) per loss component and sum the components.
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                loss_moving_avg = tf.get_variable(loss_name,
                                                  initializer=100.0,
                                                  trainable=False)
                loss_variable_names.append(loss_name)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n)
            except ValueError:
                # Reuse failed, so the variable is created here instead.
                loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                                  initializer=100.0,
                                                  trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                       total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable("problem_%d_steps" % n,
                                            initializer=0,
                                            trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [],
                                    name="total_loss_control_id")
        # The per-shard logits are concatenated along axis 0 before returning.
        return [total_loss, tf.concat(sharded_logits, 0)]

    # Select the per-problem graph with the runtime value of
    # features["problem_choice"].
    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs" and
        # "scores".
        if isinstance(model_output, dict):
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        # Repeat the problem choice once per row of features["inputs"].
        batched_problem_choice = (features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "firstP": features.get("firstP", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        # Entries left as None above (e.g. missing features) are removed.
        _del_dict_nones(predictions)

        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            hparams.problem_instances, hparams)

        # Each metric function is applied to the logits and the feature dict.
        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(mode,
                                          predictions={"predictions": logits},
                                          eval_metric_ops=eval_metrics,
                                          loss=total_loss)

    # PREDICT and EVAL returned above; anything else must be TRAIN.
    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    # Base rate times the decay schedule, divided by sqrt(worker_replicas).
    learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
        hparams,
        num_worker_replicas=worker_replicas,
        num_train_steps=train_steps)
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    # The moving-average and step-count variables created in nth_model() are
    # fetched again with reuse=True and exported as scalar summaries.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        # Drop the "problem_<n>/" prefix for the summary tag.
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n,
                                            dtype=tf.int32)
            # Steps taken on problem n divided by (global_step + 1).
            tf.summary.scalar(
                "problem_%d_frequency" % n,
                tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    # NOTE(review): total_size is accumulated below but never read in this
    # function.
    total_size, weight_decay_loss = 0, 0.0
    delib_params = None
    if hparams.update_delib_only:
        # Restrict decay/noise (and the optimizer call below) to variables
        # whose name contains "delib" or "softmax".
        delib_params = [
            v for v in tf.trainable_variables()
            if "delib" in v.name or "softmax" in v.name
        ]
        all_weights = {v.name: v for v in delib_params}
        print("Delib parameters")
        for v in delib_params:
            print("\t\t>>\t\t{}".format(v.name))
    else:
        all_weights = {v.name: v for v in tf.trainable_variables()}
    # Iterate in sorted name order so ops are built in a fixed order.
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(
                    v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            # Tie the noise update to the loss so it runs whenever the loss
            # is evaluated.
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable("max_nonpadding",
                                         shape=[],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    # Store the running maximum before computing the multiplier.
    with tf.control_dependencies(
        [tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    # delib_params (None unless update_delib_only) goes in as the fourth
    # positional argument -- presumably the variable list to train; confirm
    # against this fork's optimize.optimize signature.
    train_op = optimize.optimize(total_loss, learning_rate, hparams,
                                 delib_params)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    # Walk backwards so deleting an entry does not shift the indices still
    # to be visited.
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)
Example #19
0
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
  """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  In TRAIN mode the loss is additionally augmented with optional weight
  decay, optional weight noise is applied to "body/" variables, and the loss
  is scaled down for batches with few non-padding target tokens.

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems. Only its length is
      checked against `hparams.problem_instances`.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec

  Raises:
    AssertionError: if `problem_names` and `hparams.problem_instances` differ
      in length, or if `mode` is none of PREDICT, EVAL or TRAIN.
  """
  assert len(problem_names) == len(hparams.problem_instances)
  decode_hp = decode_hparams

  # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
  dp = devices.data_parallelism(hparams)

  tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  # Add input statistics for incoming features.
  # Only tf.Tensor features of rank > 1 are summarized; id 0 is counted as
  # padding.
  with tf.name_scope("input_stats"):
    for (k, v) in six.iteritems(features):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
        tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
        tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
        nonpadding = tf.to_float(tf.not_equal(v, 0))
        nonpadding_tokens = tf.reduce_sum(nonpadding)
        if k == "targets":
          # NOTE(review): this name is only bound when `features` holds a
          # rank>1 tf.Tensor under "targets"; the TRAIN path below reads it
          # unconditionally.
          targets_nonpadding_tokens = nonpadding_tokens
        tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
        tf.summary.scalar("%s_nonpadding_fraction" % k,
                          tf.reduce_mean(nonpadding))

  # Get multi-problem logits and loss based on features["problem_choice"].
  # Filled in by nth_model() with the names of the per-loss moving-average
  # variables so the training summaries below can look them up again.
  loss_variable_names = []

  def nth_model(n):
    """Build the model for the n-th problem, plus some added variables."""
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)
    if mode == tf.estimator.ModeKeys.PREDICT:
      # Prediction returns the raw infer() result; none of the loss
      # bookkeeping below is built in this mode.
      return model_class.infer(
          features,
          beam_size=decode_hp.beam_size,
          top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
          alpha=decode_hp.alpha,
          decode_length=decode_hp.extra_length)
    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
      logits, losses_dict = model_class.eval_autoregressive(features)
    else:
      logits, losses_dict = model_class(
          features, skip=(skipping_is_on and skip_this_one))
    with tf.variable_scope("losses_avg"):
      # Keep an exponential moving average (0.9 old / 0.1 new, starting at
      # 100.0) per loss component and sum the components.
      total_loss, ops = 0.0, []
      for loss_key, loss_value in six.iteritems(losses_dict):
        loss_name = "problem_%d/%s_loss" % (n, loss_key)
        loss_moving_avg = tf.get_variable(
            loss_name, initializer=100.0, trainable=False)
        loss_variable_names.append(loss_name)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
        total_loss += loss_value
      try:  # Total loss avg might be reused or not, we try both.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          # Total loss was already constructed on input.
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
      except ValueError:
        # Reuse failed, so the variable is created here instead.
        loss_moving_avg = tf.get_variable(
            "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
      ops.append(
          loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
    with tf.variable_scope("train_stats"):  # Count steps for this problem.
      problem_steps = tf.get_variable(
          "problem_%d_steps" % n, initializer=0, trainable=False)
      ops.append(problem_steps.assign_add(1))
    with tf.control_dependencies(ops):  # Make sure the ops run.
      # Ensure the loss is a scalar here.
      total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss, logits]

  # Select the per-problem graph with the runtime value of
  # features["problem_choice"].
  model_output = input_fn_builder.cond_on_index(
      nth_model,
      index_tensor=features["problem_choice"],
      max_idx=len(hparams.problems) - 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # If beam searching, model_output will be a dict with keys "outputs" and
    # "scores".
    if isinstance(model_output, dict):
      outputs = model_output["outputs"]
      scores = model_output["scores"]
    else:
      outputs = model_output
      scores = None

    # Repeat the problem choice once per row of features["inputs"].
    batched_problem_choice = (
        features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0],), dtype=tf.int32))
    predictions = {
        "outputs": outputs,
        "scores": scores,
        "inputs": features.get("inputs", None),
        "targets": features.get("infer_targets", None),
        "problem_choice": batched_problem_choice,
    }
    # Entries left as None above (e.g. missing features) are removed.
    _del_dict_nones(predictions)

    export_out = {"outputs": predictions["outputs"]}
    if "scores" in predictions:
      export_out["scores"] = predictions["scores"]

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs={
            "output": tf.estimator.export.PredictOutput(export_out)
        })

  total_loss, logits = model_output

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics_fns = metrics.create_evaluation_metrics(
        hparams.problem_instances, hparams)

    # Each metric function is applied to the logits and the feature dict.
    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=total_loss)

  # PREDICT and EVAL returned above; anything else must be TRAIN.
  assert mode == tf.estimator.ModeKeys.TRAIN

  # Set learning rate
  # Base rate times the decay schedule, divided by sqrt(worker_replicas).
  learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
      hparams, num_worker_replicas=worker_replicas, num_train_steps=train_steps)
  learning_rate /= math.sqrt(float(worker_replicas))

  # Get global step
  global_step = tf.train.get_or_create_global_step()

  # Some training statistics.
  # The moving-average and step-count variables created in nth_model() are
  # fetched again with reuse=True and exported as scalar summaries.
  with tf.name_scope("training_stats"):
    tf.summary.scalar("learning_rate", learning_rate)
    for n in xrange(len(hparams.problems)):
      names_and_vars = []
      with tf.variable_scope("losses_avg", reuse=True):
        total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
        names_and_vars.append(("total_loss", total_loss_var))
      with tf.variable_scope("losses_avg", reuse=True):
        for loss_name in loss_variable_names:
          if loss_name.startswith("problem_%d/" % n):
            loss_var = tf.get_variable(loss_name)
            # Drop the "problem_<n>/" prefix for the summary tag.
            loss_suffix = loss_name[loss_name.index("/") + 1:]
            names_and_vars.append((loss_suffix, loss_var))
      for (loss_name, loss_var) in names_and_vars:
        tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
      with tf.variable_scope("train_stats", reuse=True):
        nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
      # Steps taken on problem n divided by (global_step + 1).
      tf.summary.scalar("problem_%d_frequency" % n,
                        tf.to_float(nth_steps) /
                        (tf.to_float(global_step) + 1.0))

  # Add weight decay and noise.
  # NOTE(review): total_size is accumulated below but never read in this
  # function.
  total_size, weight_decay_loss = 0, 0.0
  all_weights = {v.name: v for v in tf.trainable_variables()}
  # Iterate in sorted name order so ops are built in a fixed order.
  for v_name in sorted(list(all_weights)):
    v = all_weights[v_name]
    v_size = int(np.prod(np.array(v.shape.as_list())))
    total_size += v_size
    if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
      # Add weight regularization if set and the weight is not a bias (dim>1).
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        v_loss = tf.nn.l2_loss(v) / v_size
      weight_decay_loss += v_loss
    is_body = len(v_name) > 5 and v_name[:5] == "body/"
    if hparams.weight_noise > 0.0 and is_body:
      # Add weight noise if set in hparams.
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        scale = learning_rate * 0.001
        noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale
        noise_op = v.assign_add(noise)
      # Tie the noise update to the loss so it runs whenever the loss is
      # evaluated.
      with tf.control_dependencies([noise_op]):
        total_loss = tf.identity(total_loss)
  if hparams.weight_decay > 0.0:
    total_loss += weight_decay_loss * hparams.weight_decay

  # The new data reader occasionally emits very small batches, which
  # cause the examples in those batches to be grossly overweighted.
  # We decrease the loss proportionally to the ratio of the size of this
  # batch to the size of the largest training batch ever.
  # TODO(noam): to be more sophisticated, we could keep separate
  # maxima based on problem choice.
  max_nonpadding_var = tf.get_variable(
      "max_nonpadding",
      shape=[],
      initializer=tf.ones_initializer(),
      trainable=False)
  max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
  # Store the running maximum before computing the multiplier.
  with tf.control_dependencies([tf.assign(max_nonpadding_var, max_nonpadding)]):
    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
  total_loss *= small_batch_multiplier

  # Log variable sizes
  _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
  diet_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  _log_variable_sizes(diet_vars, "Diet Variables")

  # Optimize
  train_op = optimize.optimize(total_loss, learning_rate, hparams)

  # Remove summaries that will fail to run because they are in conditionals.
  # TODO(cwhipkey): Test with this code removed, later in 2017.
  summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
  # Walk backwards so deleting an entry does not shift the indices still to
  # be visited.
  for i in reversed(range(len(summaries))):
    if summaries[i].name.startswith("cond_"):
      del summaries[i]

  tf.logging.info("Global model_fn finished.")
  return tf.estimator.EstimatorSpec(
      mode,
      predictions={"problem_choice": features["problem_choice"]},
      loss=total_loss,
      train_op=train_op)
    def build_model(self):
        """Build the full graph for the query/reply matching model.

        Stages, in order: vocabulary lookup table and data iterator, input
        placeholders, learning-rate schedule, embedding lookup with dropout,
        a shared transformer encoder for queries and replies, positive and
        negative pair construction, a two-layer dense classifier, the
        training op and an accuracy metric. Results are stored on `self`
        (e.g. `logits`, `probs`, `predictions`, `loss`, `train_step`,
        `accuracy`); nothing is returned.
        """
        # build index table
        index_table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=self.config.vocab_list,
            num_oov_buckets=0,
            default_value=0)

        # get data iterator
        self.data_iterator = self.data.get_data_iterator(index_table,
                                                         mode=self.mode)

        # get inputs
        with tf.variable_scope("inputs"):
            # Each input defaults to the iterator's next batch when nothing
            # is fed for it.
            next_batch = self.data_iterator.get_next()
            self.input_queries = tf.placeholder_with_default(
                next_batch["input_queries"], [None, self.config.max_length],
                name="input_queries")
            self.input_replies = tf.placeholder_with_default(
                next_batch["input_replies"], [None, self.config.max_length],
                name="input_replies")
            self.query_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["query_lengths"]), [None],
                name="query_lengths")
            self.reply_lengths = tf.placeholder_with_default(
                tf.squeeze(next_batch["reply_lengths"]), [None],
                name="reply_lengths")

            # Hyperparameters that are fed at run time.
            # NOTE(review): the embedding keep-prob is float64 while the
            # other two are float32 -- presumably to match the dtype of the
            # embedding matrix; confirm against get_embeddings.
            self.embed_dropout_keep_prob = tf.placeholder(
                tf.float64, name="embed_dropout_keep_prob")
            self.lstm_dropout_keep_prob = tf.placeholder(
                tf.float32, name="lstm_dropout_keep_prob")
            self.dense_dropout_keep_prob = tf.placeholder(
                tf.float32, name="dense_dropout_keep_prob")
            self.num_negative_samples = tf.placeholder(
                tf.int32, name="num_negative_samples")

        with tf.variable_scope("properties"):
            # length properties
            cur_batch_length = tf.shape(self.input_queries)[0]

            # get hparams from tensor2tensor.models.transformer
            # This hparams object is only used to build the learning-rate
            # schedule; it is replaced by a fresh one in the "transformer"
            # scope below.
            hparams = transformer.transformer_small()
            hparams.batch_size = self.config.batch_size
            hparams.learning_rate_decay_steps = 10000
            hparams.learning_rate_minimum = 3e-5

            # learning rate
            lr = learning_rate.learning_rate_schedule(hparams)
            self.learning_rate = lr

        # embedding layer
        with tf.variable_scope("embedding"):
            embeddings = tf.Variable(get_embeddings(
                self.config.vocab_list, self.config.pretrained_embed_dir,
                self.config.vocab_size, self.config.embed_dim),
                                     trainable=True,
                                     name="embeddings")
            # noise_shape [rows, 1]: one dropout decision per row of the
            # embedding matrix, shared across its columns.
            embeddings = tf.nn.dropout(
                embeddings,
                keep_prob=self.embed_dropout_keep_prob,
                noise_shape=[tf.shape(embeddings)[0], 1])
            queries_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_queries,
                                       name="queries_embedded"))
            replies_embedded = tf.to_float(
                tf.nn.embedding_lookup(embeddings,
                                       self.input_replies,
                                       name="replies_embedded"))

            self.queries_embedded = queries_embedded
            self.replies_embedded = replies_embedded

        # transformer layer
        with tf.variable_scope("transformer"):
            # Insert a singleton axis at position 2 before feeding the
            # encoder.
            queries_expanded = tf.expand_dims(queries_embedded,
                                              axis=2,
                                              name="queries_expanded")
            replies_expanded = tf.expand_dims(replies_embedded,
                                              axis=2,
                                              name="replies_expanded")

            # Fresh hparams: the decay settings made in "properties" are not
            # carried over. This object is also the one later passed to
            # optimize.optimize.
            hparams = transformer.transformer_small()
            hparams.set_hparam("batch_size", self.config.batch_size)
            hparams.set_hparam("hidden_size", self.config.embed_dim)
            encoder = transformer.TransformerEncoder(hparams, mode=self.mode)

            # The same encoder object encodes both queries and replies; the
            # input is also passed as "targets".
            self.queries_encoded = encoder({
                "inputs": queries_expanded,
                "targets": queries_expanded
            })[0]
            self.replies_encoded = encoder({
                "inputs": replies_expanded,
                "targets": replies_expanded
            })[0]

            # Sum over axis 1, then drop every size-1 dimension.
            # NOTE(review): tf.squeeze without an axis would also drop a
            # batch dimension of size 1 -- confirm batches are never size 1.
            self.queries_encoded = tf.squeeze(
                tf.reduce_sum(self.queries_encoded, axis=1, keep_dims=True))
            self.replies_encoded = tf.squeeze(
                tf.reduce_sum(self.replies_encoded, axis=1, keep_dims=True))

        with tf.variable_scope("sampling"):
            # Positive pairs are the diagonal (query i with reply i);
            # negative pairs are chosen by make_negative_mask.
            positive_mask = tf.eye(cur_batch_length)
            negative_mask = make_negative_mask(
                tf.zeros([cur_batch_length, cur_batch_length]),
                method=self.config.negative_sampling,
                num_negative_samples=self.num_negative_samples)
            # Split the (row, col) coordinates of the nonzero mask entries
            # into query indices and reply indices.
            negative_queries_indices, negative_replies_indices = tf.split(
                tf.where(tf.not_equal(negative_mask, 0)), [1, 1], 1)

            # Dot product between every encoded query and every encoded
            # reply.
            self.distances = tf.matmul(self.queries_encoded,
                                       self.replies_encoded,
                                       transpose_b=True)
            self.distances_flattened = tf.reshape(self.distances, [-1])

            self.positive_distances = tf.gather(
                self.distances_flattened,
                tf.where(tf.reshape(positive_mask, [-1])))
            self.negative_distances = tf.gather(
                self.distances_flattened,
                tf.where(tf.reshape(negative_mask, [-1])))

            self.negative_queries_indices = tf.squeeze(
                negative_queries_indices)
            self.negative_replies_indices = tf.squeeze(
                negative_replies_indices)

            # Classifier input per pair: [query encoding, score, reply
            # encoding] concatenated along axis 1.
            self.positive_inputs = tf.concat([
                self.queries_encoded, self.positive_distances,
                self.replies_encoded
            ], 1)
            # Same layout for the negative pairs, reshaped to
            # [number of negative pairs, 2 * embed_dim + 1].
            self.negative_inputs = tf.reshape(
                tf.concat([
                    tf.nn.embedding_lookup(self.queries_encoded,
                                           self.negative_queries_indices),
                    self.negative_distances,
                    tf.nn.embedding_lookup(self.replies_encoded,
                                           self.negative_replies_indices)
                ], 1), [
                    tf.shape(negative_queries_indices)[0],
                    self.config.embed_dim * 2 + 1
                ])

        with tf.variable_scope("prediction"):
            # Positive rows come first, followed by negative rows.
            self.hidden_outputs = tf.layers.dense(tf.concat(
                [self.positive_inputs, self.negative_inputs], 0),
                                                  256,
                                                  tf.nn.relu,
                                                  name="hidden_layer")
            # NOTE(review): the output layer also applies ReLU, so the
            # logits are non-negative -- confirm this is intended.
            self.logits = tf.layers.dense(self.hidden_outputs,
                                          2,
                                          tf.nn.relu,
                                          name="output_layer")
            # Label 1 for positive rows, 0 for negative rows, in the same
            # order as the concatenated inputs above.
            labels = tf.concat([
                tf.ones([tf.shape(self.positive_inputs)[0]], tf.float64),
                tf.zeros([tf.shape(self.negative_inputs)[0]], tf.float64)
            ], 0)

            self.labels = tf.one_hot(tf.to_int32(labels), 2)

            # NOTE(review): probabilities use an element-wise sigmoid while
            # the loss below uses softmax cross-entropy on the same logits.
            self.probs = tf.sigmoid(self.logits)
            self.predictions = tf.argmax(self.probs, 1)

        with tf.variable_scope("loss"):
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.labels,
                                                           logits=self.logits))
            # `lr` comes from the "properties" hparams; `hparams` here is
            # the object rebuilt in the "transformer" scope.
            self.train_step = optimize.optimize(self.loss,
                                                lr,
                                                hparams,
                                                use_tpu=False)

        with tf.variable_scope("score"):
            # Fraction of rows whose predicted class equals the label class.
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.labels, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                   "float"),
                                           name="accuracy")
Example #21
0
 def optimize(self, loss, num_async_replicas=1, use_tpu=False):
   """Build the training op that minimizes `loss`.

   The learning rate is `hparams.learning_rate` multiplied by the decay
   schedule and divided by sqrt(num_async_replicas).

   Args:
     loss: the loss to minimize.
     num_async_replicas: number of async replicas used to scale the rate.
     use_tpu: forwarded unchanged to `optimize.optimize`.

   Returns:
     The op returned by `optimize.optimize`.
   """
   hp = self.hparams
   base_rate = hp.learning_rate
   decayed_rate = base_rate * optimize.learning_rate_decay(hp)
   replica_scale = math.sqrt(float(num_async_replicas))
   return optimize.optimize(
       loss, decayed_rate / replica_scale, hp, use_tpu=use_tpu)
Example #22
0
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

    * TRAIN: Constructs loss and train_op
    * EVAL: Constructs the loss and eval metrics
    * PREDICT: Constructs the predictions

    The per-problem graph is built by the nested `nth_model` and selected at
    run time with `input_fn_builder.cond_on_index` on
    `features["problem_choice"]`.

    When `hparams.rl` is set, samples are decoded from the model, scored with
    the APPROX_BLEU_TRAIN metric function, and stored back into `features`
    under the keys "samples" and "values" before the model's own `model_fn`
    is called. On that path `features["inputs"]` / `features["targets"]` may
    also be overwritten in place.

    In TRAIN mode the total loss is additionally augmented with optional
    weight decay, made to depend on optional weight-noise ops, and scaled by
    a "small batch multiplier" before being handed to `optimize.optimize`.

    Args:
      model: str, name of model.
      features: dict<feature name, Tensor>. Expected to have keys
        {inputs, targets, problem_choice}. May be mutated (see above).
      mode: tf.estimator.ModeKeys.
      hparams: model HParams.
      problem_names: list of str, names of the problems. Must have the same
        length as `hparams.problem_instances`.
      train_steps: int, total number of training steps. Used to compute
        learning rate decay.
      worker_id: int, id of this worker.
      worker_replicas: int, number of workers. The learning rate is divided
        by sqrt(worker_replicas).
      eval_run_autoregressive: bool, whether to run evaluation
        autoregressively.
      decode_hparams: HParams for decode settings. Used when
        mode == PREDICT.

    Returns:
      tf.estimator.EstimatorSpec
    """
    assert len(problem_names) == len(hparams.problem_instances)
    decode_hp = decode_hparams

    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism()

    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    # set the initializer functions
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    # NOTE(review): `targets_nonpadding_tokens` is only bound here when
    # `features` holds a "targets" tf.Tensor of rank > 1; the TRAIN path below
    # reads it unconditionally -- confirm callers always supply such a tensor.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                # Entries equal to 0 are counted as padding.
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(
                    nonpadding)  # number of non-zero tokens
                if k == "targets":
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k,
                                  nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    # Names of the per-loss moving-average variables created in nth_model;
    # read again below to emit training summaries.
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables.

        In PREDICT mode returns the result of `model_class.infer`; otherwise
        returns `[total_loss, logits]` where `total_loss` is a scalar and
        `logits` is the concatenation of the sharded logits along axis 0.
        """
        model_class = registry.model(model)(
            hparams,
            mode,
            hparams.problems[n],
            n,
            dp,
            devices.ps_devices(all_workers=True),
            decode_hparams=decode_hparams
        )  # instantiate the registered model class for problem n
        if mode == tf.estimator.ModeKeys.PREDICT:
            return model_class.infer(features,
                                     beam_size=decode_hp.beam_size,
                                     top_beams=(decode_hp.beam_size if
                                                decode_hp.return_beams else 1),
                                     alpha=decode_hp.alpha,
                                     decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        # Optional hparam; None when the hparams object does not define it.
        mrt_samples = getattr(hparams, 'mrt_samples', None)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:  # autoregressive evaluation
            sharded_logits, losses_dict = model_class.eval_autoregressive(
                features)
        else:  # TRAIN, or EVAL without autoregressive decoding
            if hparams.rl:
                # Generate sample data; presumably sharded automatically by
                # the model, samples shape [batch, time, 1, 1] -- TODO confirm.
                if model_class._num_datashards == 1:  # single data shard: fast sampling
                    print("###Work on Single GPU card, Use Fast Decode.###")
                    train_beam = getattr(hparams, 'train_beam', None)
                    if mrt_samples:
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=mrt_samples,
                            top_beams=mrt_samples)
                        # Drop the two trailing axes so inputs/targets are 2-D.
                        inputs = tf.squeeze(tf.squeeze(features["inputs"],
                                                       axis=-1),
                                            axis=-1)
                        targets = tf.squeeze(tf.squeeze(features["targets"],
                                                        axis=-1),
                                             axis=-1)
                        batch_size = tf.shape(inputs)[0]
                        inputs_len = tf.shape(inputs)[1]
                        targets_len = tf.shape(targets)[1]
                        # Tiling along axis 1 and reshaping to
                        # [batch * mrt_samples, length] repeats every example
                        # mrt_samples times in consecutive rows.
                        inputs_tile = tf.tile(inputs, [1, mrt_samples])
                        targets_tile = tf.tile(targets, [1, mrt_samples])
                        inputs_reshape = tf.reshape(
                            inputs_tile,
                            [batch_size * mrt_samples, inputs_len])
                        targets_reshape = tf.reshape(
                            targets_tile,
                            [batch_size * mrt_samples, targets_len])
                        # Restore the two trailing singleton axes.
                        inputs_feed = tf.expand_dims(tf.expand_dims(
                            inputs_reshape, axis=-1),
                                                     axis=-1)
                        targets_feed = tf.expand_dims(tf.expand_dims(
                            targets_reshape, axis=-1),
                                                      axis=-1)
                        # Overwrites the caller's features dict in place.
                        features["inputs"] = inputs_feed
                        features["targets"] = targets_feed
                    elif train_beam and train_beam != 1:  # beam search with hparams.train_beam size and return the top 1 sample
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=hparams.train_beam)
                    else:
                        targets_beam = getattr(hparams, 'targets_beam', None)
                        if targets_beam:
                            # Replace the reference targets with an argmax
                            # decode (beam_size=4) from the model itself.
                            targets_samples, _ = model_class._fast_decode(
                                features,
                                decode_length=50,
                                beam_size=4,
                                sampling_method='argmax')
                            targets_samples = tf.reshape(
                                targets_samples, [
                                    tf.shape(targets_samples)[0],
                                    tf.shape(targets_samples)[1], 1, 1
                                ])
                            features["targets"] = targets_samples
                        samples, _ = model_class._fast_decode(features,
                                                              decode_length=50)
                    samples = tf.expand_dims(samples, axis=-1)
                    samples = tf.expand_dims(
                        samples, axis=-1
                    )  # add two additional dimensions to make it compatible.
                else:  # more than one data shard: only slow sampling is supported
                    print("###Work on Multi GPU cards, Use Slow Decode.###")
                    samples, _, _ = model_class._slow_greedy_infer(
                        features,
                        decode_length=50)  # default decode_length = 50
                # Samples are treated as constants; no gradient flows through
                # the decoding step.
                samples = tf.stop_gradient(samples)
                # Calculate the BLEU score using the training metric function.
                # train_metric_fn = "approx_bleu_train_score"
                train_metric_fn = metrics.METRICS_FNS[
                    metrics.Metrics.APPROX_BLEU_TRAIN]
                labels = features.get("targets", None)
                samples.set_shape([None, None, 1, 1])
                # hparams.delta_reward = True for delta reward; False for total reward
                # NOTE(review): the keyword is spelled `delat_reward`;
                # presumably this matches the metric function's parameter
                # name -- confirm before renaming.
                metric_value = train_metric_fn(
                    samples, labels, delat_reward=hparams.delta_reward)
                metric_value = tf.stop_gradient(
                    metric_value)  # be strict: no gradient through the reward
                metric_value.set_shape([None, None, 1, 1])
                """Accodring to the metrics.py: The tf.metrics.mean function assures correct aggregation."""
                # metric_value is the reward tensor passed to the model.
                features["samples"] = samples
                features["values"] = metric_value
                # del samples
                # del labels
            sharded_logits, losses_dict = model_class.model_fn(
                features,
                skip=(skipping_is_on and skip_this_one),
                mrt=mrt_samples)
            # if hparams.rl:
            #     training_loss = losses_dict["training"] * metric_value  # losses_dict["training"]: [batch, timesteps]
            #     training_loss_sum = tf.reduce_sum(training_loss)  # sum the training_loss
            #     losses_dict["training"] = training_loss_sum  # log_prob * r (current r is total_reward)
        with tf.variable_scope("losses_avg"):
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                if hparams.rl:
                    # Per-loss weights; defaults apply when the hparam is
                    # not defined.
                    baseline_loss_weight = getattr(hparams,
                                                   'baseline_loss_weight', 1.0)
                    training_loss_weight = getattr(hparams,
                                                   'training_loss_weight', 1.0)
                    mle_training_loss_weight = getattr(
                        hparams, 'mle_training_loss_weight', 0.3)
                    if loss_key == "training":
                        loss_value = loss_value * training_loss_weight
                    elif loss_key == "training_baseline":
                        loss_value = loss_value * baseline_loss_weight
                    elif loss_key == "mle_training":
                        loss_value = loss_value * mle_training_loss_weight
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                loss_moving_avg = tf.get_variable(loss_name,
                                                  initializer=100.0,
                                                  trainable=False)
                loss_variable_names.append(loss_name)
                # Exponential moving average: 0.9 * old + 0.1 * new.
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n)
            except ValueError:
                # Variable does not exist yet: create it.
                loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                                  initializer=100.0,
                                                  trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                       total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable("problem_%d_steps" % n,
                                            initializer=0,
                                            trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [],
                                    name="total_loss_control_id")
        return [total_loss, tf.concat(sharded_logits, 0)]

    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)  # total_loss and sharded logits (or infer output in PREDICT)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs" and
        # "scores".
        if isinstance(model_output, dict):  # beam search
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        # Broadcast the problem choice to one entry per example in the batch.
        batched_problem_choice = (features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        _del_dict_nones(predictions)  # drop entries whose value is None

        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            hparams.problem_instances, hparams)

        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(mode,
                                          predictions={"predictions": logits},
                                          eval_metric_ops=eval_metrics,
                                          loss=total_loss)

    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
        hparams,
        num_worker_replicas=worker_replicas,
        num_train_steps=train_steps)
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            # Re-open the scopes used in nth_model with reuse=True to fetch
            # the moving-average variables created there.
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        # Strip the "problem_<n>/" prefix for the summary tag.
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n,
                                            dtype=tf.int32)
            # Fraction of global steps spent on problem n (+1 avoids 0/0).
            tf.summary.scalar(
                "problem_%d_frequency" % n,
                tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    # Iterate in sorted name order so graph construction is deterministic.
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(
                    v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            # Tie the noise op to the loss so it runs whenever the loss is
            # evaluated.
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable("max_nonpadding",
                                         shape=[],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    # Update the running maximum before computing the multiplier.
    with tf.control_dependencies(
        [tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    train_op = optimize.optimize(total_loss, learning_rate, hparams)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    # Walk backwards so deleting in place does not shift pending indices.
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)