Example 1
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps,
                     use_tpu):
    # AdafactorOptimizer.beta1 = 0.0
    # AdafactorOptimizer.clipping_threshold = 1.0
    # AdafactorOptimizer.decay_rate = None
    # AdafactorOptimizer.epsilon1 = 1e-30
    # AdafactorOptimizer.epsilon2 = 0.001
    # AdafactorOptimizer.factored = True
    # AdafactorOptimizer.min_dim_size_to_factor = 128
    # AdafactorOptimizer.multiply_by_parameter_scale = True

    global_step = tf.train.get_or_create_global_step()

    optimizer = adafactor.AdafactorOptimizer(
        multiply_by_parameter_scale=True,
        learning_rate=init_lr,
        decay_rate=None,
        beta1=0.0,
        clipping_threshold=1.0,
        factored=True,
        epsilon1=1e-30,
        epsilon2=0.001,
    )

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)

    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    return train_op
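Below is a minimal usage sketch, not part of the original code, assuming TF 1.x graph mode and that `adafactor` refers to `tensor2tensor.utils.adafactor`. Note that `num_train_steps` and `num_warmup_steps` are accepted but never used by this helper; the toy loss and hyperparameter values are placeholders.

# Illustrative only: assumes TF 1.x graph mode and tensor2tensor installed.
import tensorflow as tf

weights = tf.get_variable("w", initializer=[1.0, 2.0])
toy_loss = tf.reduce_sum(tf.square(weights))

train_op = create_optimizer(toy_loss, init_lr=0.01, num_train_steps=100000,
                            num_warmup_steps=10000, use_tpu=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)  # one Adafactor update on the toy loss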
Example 2
def adafactor(learning_rate, hparams):
    try:
        from tensor2tensor.utils import adafactor as af
    except ImportError:
        print(("Adafactor requires the tensor2tensor library."
               "Please run 'pip install tensor2tensor' to get Adafactor."))
        raise ImportError

    del learning_rate
    del hparams
    return af.AdafactorOptimizer()
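Since both arguments are deliberately discarded, any values can be passed; a one-line sketch assuming tensor2tensor is installed:

opt = adafactor(learning_rate=None, hparams=None)  # af.AdafactorOptimizer() with library defaults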
Example 3
    def testCallableLearningRate(self):
        def lr():
            return 0.01

        opt = adafactor.AdafactorOptimizer(learning_rate=lr)
        v1 = tf.Variable([1., 2.])
        v2 = tf.Variable([3., 4.])
        with tf.GradientTape() as tape:
            tape.watch([v1, v2])
            loss = v1 * v2
        v1_grad, v2_grad = tape.gradient(loss, [v1, v2])
        opt.apply_gradients(((v1_grad, v1), (v2_grad, v2)))
Example 4
def get_optimizer(params, learning_rate):
    """Gets the optimzer based on the hparams and current mode (TPU vs. CPU/GPU).

  Args:
      params: A dictionary containing training hyperparameters.
      learning_rate: A float32 scalar.

  Returns:
    A string or an optimizer instance.
  """
    optimizer = None

    if params['optimizer'] == 'Adafactor':
        try:
            from tensor2tensor.utils import adafactor  # pylint: disable=g-import-not-at-top

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=learning_rate)
        except ImportError:
            logging.error('tensor2tensor not installed. Cannot use Adafactor. '
                          'Defaulting to Adam.')
            params['optimizer'] = 'Adam'

    if params['optimizer'] == 'Adam':
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate,
            beta1=params['optimizer_beta1'],
            beta2=params['optimizer_beta2'],
            epsilon=params['optimizer_epsilon'],
        )

    if params['optimizer'] == 'AdamWeightDecay':
        optimizer = AdamWeightDecayOptimizer(
            learning_rate,
            weight_decay_rate=params['weight_decay_rate'],
            beta_1=params['optimizer_beta1'],
            beta_2=params['optimizer_beta2'],
            epsilon=params['optimizer_epsilon'],
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
        )

    if params['optimizer'] == 'SGD':
        optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)

    if optimizer is None:
        raise ValueError('Unknown optimizer: {}.'.format(params['optimizer']))

    if params['use_tpu']:
        # Average the gradients across TPU cores.
        optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

    return optimizer
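A minimal call-site sketch (the values below are illustrative placeholders, not taken from the original configuration), using the hyperparameter keys the function reads:

params = {
    'optimizer': 'Adafactor',   # falls back to 'Adam' if tensor2tensor is missing
    'optimizer_beta1': 0.9,
    'optimizer_beta2': 0.999,
    'optimizer_epsilon': 1e-6,
    'weight_decay_rate': 0.01,
    'use_tpu': False,
}
optimizer = get_optimizer(params, learning_rate=1e-3)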
Example 5
    def __init__(self, params, learning_rate):
        """Gets the optimzer based on the hparams and current mode (TPU vs. CPU/GPU).

    Args:
        params: A dictionary containing training hyperparameters.
        learning_rate: A float32 scalar.

    Returns:
      A string or an optimizer instance.
    """
        optimizer = None

        if params["optimizer"] == "Adafactor":
            try:
                from tensor2tensor.utils import adafactor  # pylint: disable=g-import-not-at-top
                optimizer = adafactor.AdafactorOptimizer(
                    learning_rate=learning_rate)
            except ImportError:
                logging.error(
                    "tensor2tensor not installed. Cannot use Adafactor."
                    "Defaulting to Adam.")
                params["optimizer"] = "Adam"

        if params["optimizer"] == "Adam":
            optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate,
                beta1=params["optimizer_beta1"],
                beta2=params["optimizer_beta2"],
                epsilon=params["optimizer_epsilon"])

        if params["optimizer"] == "AdamWeightDecay":  # AdamWeightDecay 사용
            optimizer = AdamWeightDecayOptimizer(
                learning_rate,
                weight_decay_rate=params["weight_decay_rate"],  # 0.01
                beta_1=params["optimizer_beta1"],  # 0.9
                beta_2=params["optimizer_beta2"],  # 0.999
                epsilon=params["optimizer_epsilon"],  # 1e-06
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        if params["optimizer"] == "SGD":
            optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate)

        if optimizer is None:
            raise ValueError("Unknown optimizer: {}.".format(
                params["optimizer"]))

        if params["use_tpu"]:
            # Average the gradients across TPU cores.
            optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

        self.optimizer = optimizer
Example 6
    def model_fn(features, labels, mode, config, params):
        """Estimator model function."""

        # labels, config and params are unused in this model_fn; delete them to make that explicit.
        del labels
        del config
        del params

        tf.get_variable_scope().set_initializer(
            tf.variance_scaling_initializer(1.0,
                                            mode="fan_avg",
                                            distribution="uniform"))

        # PREDICTION (e.g. decoding outputs for evaluation)
        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions, _, _ = model_params.estimator_prediction_fn(features)

            if include_features_in_predictions:
                predictions.update(features)

            if decode_keys:
                # Decode the raw ids into strings in prediction.
                def decode_host_call(tensor_dict):
                    for key in decode_keys:
                        predictions[key] = public_parsing_ops.decode(
                            tensor_dict[key], model_params.vocab_filename,
                            model_params.encoder_type)
                    return tensor_dict

                contrib_tpu.outside_compilation(decode_host_call, predictions)
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  predictions=predictions)

        # TRAINING
        training = mode == tf.estimator.ModeKeys.TRAIN
        # use_tpu defaults to False, so the bfloat16 branch is normally skipped.
        if use_tpu and model_params.use_bfloat16:
            with contrib_tpu.bfloat16_scope():
                XENT_loss, outputs = model_params.model()(features, training)
        else:
            XENT_loss, outputs = model_params.model()(features, training)
            # XENT_loss, outputs = model_params.model().double_sampling(features, training, model_params.batch_size,
            #                                                           features["targets"].get_shape().as_list()[1],
            #                                                           mixed=True)

        # TPU requires that all outputs have a batch dimension and can't handle scalars.
        # Tile all scalars into 1-dimensional vectors.
        outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size)

        # Create optimizer and define learning rate
        if mode == tf.estimator.ModeKeys.TRAIN:
            init_lr = model_params.learning_rate
            global_step = tf.train.get_global_step()
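            # Inverse square-root schedule: rsqrt(10000) = 0.01, so lr equals init_lr
            # until step 10000 and then decays as init_lr * 100 / sqrt(step).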
            lr = init_lr / 0.01 * tf.rsqrt(
                tf.maximum(tf.to_float(global_step), 10000))
            if train_init_checkpoint:
                lr = tf.minimum(
                    tf.to_float(global_step + 1) / train_warmup_steps *
                    init_lr, lr)

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0)
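            # adafactor_decay_rate_pow(0.8) above yields the second-moment decay
            # schedule 1 - (step + 1)^-0.8 from the Adafactor paper.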
            if use_tpu:
                optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

            ###############################################################################################################
            ##### VARIABLES ###############################################################################################
            # Create index tensors to stack and get corresponding probabilities from logp
            # max_seq_len = outputs["targets"].get_shape().as_list()[1]
            # sequence_index = tf.constant(np.arange(0, max_seq_len))
            # batch_index = tf.constant(np.zeros(sequence_index.get_shape().as_list()[0]), dtype=tf.int64)

            ##### I.I.D SAMPLING ##########################################################################################
            """ Here we sample the tokens that are produced by teacher forcing. """
            # Normalise logits to log-prob, and compute Gumbel samples with location
            # logit_probs = tf.math.softmax(outputs["logits"], axis=2)  # should not be x <= 0
            # clipped_logit_probs = tf.clip_by_value(logit_probs, 1e-8, 1.0)
            # logp = tf.log(clipped_logit_probs)

            # RETURNS TEACHER FORCING SAMPLED TOKEN VARIATIONS
            # argmax_logp_index, soft_logp_index, topk_out, z = iid_sampling(logp, max_seq_len, greedy=True, soft=False,
            #                                                                topk=False, k=2)
            # topk_probs, topk_indices = topk_out
            # TEST SAMPLING METHODS PROVIDED BY PEGASUS
            # sampled_BxT = iid_process_logits(outputs["logits"], max_seq_len, model_params.batch_size,
            #                                  outputs["logits"].get_shape().as_list()[-1],
            #                                  top_k=0, top_p=0.9, temperature=1.0)

            ##### DECODER SAMPLING ########################################################################################
            """ Here we sample the tokens using the decoder. Beam size == 1. 
            PREDS: IDs
            LOGP: transformed logits
            SCORE: scalar score using RISK trick
            LOGP: [BxTxV] beam logp
            LOGITS: [BxTxV] beam logits
            the dictionary contains the following keys: {ids, logp_BxT, sent_score, logp_BxTxV}
      # Note: the logp_BxTxV are analogous to z -> should be used for RELAX, preds are the BxT of these -> b=H(z), and
      # logp are the corresponding values (score is normalised to sentence score).
      """
            # greedy_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 0.0}
            # random_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.0, "temperature": 1.0}
            # topk_beam_params = {"_beam": 3, "top_k": 10000, "top_p": 0.0, "temperature": 1.0}
            # topp_beam_params = {"_beam": 3, "top_k": 0, "top_p": 0.9, "temperature": 1.0}

            # greedy_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                                 beam_params=greedy_beam_params, sentence_score=False)
            # random_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                                 beam_params=random_beam_params, sentence_score=False)
            # topk_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                               beam_params=topk_beam_params, sentence_score=False)
            # topp_dict = non_beam_sampling(model_params, features, max_seq_len,
            #                               beam_params=topp_beam_params, sentence_score=False)

            # BEAM SEARCH
            # greedy_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                             beam_params=greedy_beam_params)
            # random_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                             beam_params=random_beam_params)
            # topk_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                           beam_params=topk_beam_params)
            # topp_dict = beam_sampling(model_params, features, max_seq_len, batch_index, sequence_index,
            #                           beam_params=topp_beam_params)

            ##### RELAX VARIABLES #########################################################################################
            """ Here we create the variables for RELAX. Pass in the logp, logits, and z that has already been 
      sampled/created from manipulation. Will return z_tilde [BxTxV] and logp(b) [BxT]. """
            # TEACHER FORCING SAMPLING
            # z_tilde, logp_b = create_variables(z, logp, batch_index, sequence_index, clipped_logit_probs)

            # DECODER SAMPLING -> sample_b is already argmaxed in decode loop
            # z_tilde, logp_b = create_variables_from_samples(random_dict["logits_BxTxV"], random_dict["logp_BxTxV"],
            #                                                 random_dict["ids"], batch_index, sequence_index)

            ##### TEXT AND ROUGE ##########################################################################################
            """ Here we first convert sequences to text, and calculate corresponding rouge scores/losses. """
            # target_text = rouge_decoding(outputs["targets"], model_params)  # TARGET SAMPLES
            # argmax_pred_text = rouge_decoding(argmax_logp_index, model_params)  # ARGMAX SAMPLES
            # soft_pred_text = rouge_decoding(soft_logp_index, model_params)  # SOFTMAX SAMPLES
            # additional_pred_text = rouge_decoding(sampled_BxT, model_params)  # ADDITIONAL SAMPLES

            # Token-level ROUGE
            # ROUGE_token = tf.py_function(rouge_token,(outputs["targets"], random_dict["ids"], 0, 0), tf.float32)

            # CALCULATE ROUGE LOSS: ROUGE score -> ROUGE loss = -ROUGE score
            # NOTE: for ROUGE variant, change value (0: precision, 1: recall, 2: f1)
            # rouge_loss_argmax = -tf.py_function(evaluate_rl, (target_text, argmax_pred_text, 2), tf.float32)
            # rouge_loss_soft = -tf.py_function(evaluate_rl, (target_text, soft_pred_text, 2), tf.float32)
            # rouge_loss_extra = -tf.py_function(evaluate_rl, (target_text, additional_pred_text, 2), tf.float32)

            ##### REINFORCE LOSS ##########################################################################################
            """ Calculate standard REINFORCE loss. Can be document-level (score using RISK trick), or token-level [BxT]. """
            # FIND CORRESPONDING LOG_PROBS OF THE I.I.D SAMPLED TOKENS
            # ARGMAX -> logp(argmax(y))
            # argmax_logp = iid_log_probs(argmax_logp_index, batch_index, sequence_index, logp)
            # SOFTMAX -> logp(sample_y)
            # softmax_logp = iid_log_probs(soft_logp_index, batch_index, sequence_index, logp)
            # ADDITIONAL
            # additional_logp = iid_log_probs(sampled_BxT, batch_index, sequence_index, logp)

            # CHANGE BELOW IF USING DECODER SAMPLED TOKENS/SCORES
            # weight the logp by ROUGE score (neg ROUGE_loss), sum values
            # reinforce_loss = tf.reduce_sum(tf.multiply(rouge_loss_argmax, argmax_logp))

            ##### REINFORCE w/ BASELINE ###################################################################################
            """ Calculate RwB using Socher's loss function (2017). Optional: use a Q_func as baseline. """
            # improve the probs of the SOFT labels (soft - hard)*soft_logp
            # improve the probs of the HARD labels (hard - soft)*hard_logp

            # BASELINE: CONTROL VARIATE
            # ffn_output = control_variate(source, targets)
            # with tf.variable_scope("Q_func"):
            #   cv = rwb_Q_func(tf.reshape(softmax_logp, [1, 32]), tf.reshape(additional_logp, [1, 32]))

            # cv_loss = tf.reduce_mean(tf.square(tf.subtract(rouge_loss_argmax, cv)))

            # loss_difference = tf.subtract(rouge_loss_soft, rouge_loss_argmax)
            # reinforce_baseline = tf.reduce_sum(tf.multiply(loss_difference, softmax_logp))

            # BASELINE: HINGE LOSS
            # rouge_soft = -rouge_loss_soft
            # rouge_hard = -rouge_loss_argmax
            # hinge = -tf.maximum((rouge_soft - rouge_hard), 0)
            # hinge_baseline = tf.reduce_sum(tf.multiply(hinge, softmax_logp))

            ##### REINFORCE w/ THRESHOLD ##################################################################################
            """ Calculate REINFORCE with a constant threshold as the baseline. """
            # we take output of ROUGE score as ROUGE_loss = -ROUGE score
            # intermediate_loss = tf.reduce_sum(tf.multiply(tf.subtract(0.3, -rouge_loss_argmax), argmax_logp))

            ##### EXPECTED RISK MINIMISATION ##############################################################################
            """ Calculate the RISK loss using n sequences from sampling process. """
            # L_risk = risk_loss(model_params.batch_size, max_seq_len,
            #                    rouge_losses=[rouge_loss_argmax, rouge_loss_soft, rouge_loss_extra],
            #                    logps=[topk_dict["logp1"], topk_dict["logp2"], topk_dict["logp3"]], n=3)

            ##### MIXED LOSS ##############################################################################################
            """ Implement a mixed loss function that is weighted by an alpha term. """
            # combined_loss = tf.math.add(tf.multiply(tf.constant(0.3, dtype=tf.float32), XENT_loss),
            #                             tf.multiply(tf.constant(0.7, dtype=tf.float32), L_risk))

            # OR conditional loss switch
            # constraint = tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32)
            # combined_loss = tf.cond(constraint > 0.8, lambda: hard_reinforce_loss, lambda: XENT_loss)

            ##### RELAX CONTROL VARIATE ###################################################################################
            """ Prepare the target sequence for use in the control variate. """
            # z = random_dict["logp_BxTxV"]
            # z_target, zt_target = create_cv_target(outputs, batch_index, sequence_index, z, z_tilde)

            ##### RELAX LOSS ##############################################################################################
            """ Manipulate z and z_tilde using the Q_func to mimic ROUGE loss. """
            # with tf.variable_scope("Q_func"):
            #     c_z = Q_func(z, z_target)

            # with tf.variable_scope("Q_func", reuse=True):
            #     c_z_tilde = Q_func(z_tilde, zt_target)

            # Formulate RELAX as a loss function
            # f_y = rouge_loss_soft  # negative for loss (defined above)
            # c_z_tilde1 = tf.stop_gradient(tf.identity(c_z_tilde))  # clone, detach, stop grad
            # L_relax = tf.reduce_sum(((f_y - c_z_tilde1)*logp_b) - c_z_tilde + c_z)

            # OR construct gradient estimator
            # theta = [tv for tv in tf.trainable_variables() if "Q_func" not in tv.name]
            # d_logp_d_theta = tf.gradients(logp_b, theta)[0]  # logp
            # d_c_z_tilde_d_theta = tf.gradients(c_z_tilde, theta)[0]
            # d_c_z_d_theta = tf.gradients(c_z, theta)[0]
            # relax = tf.reduce_sum(f_y - c_z_tilde)*d_logp_d_theta - d_c_z_tilde_d_theta + d_c_z_d_theta

            # relax = tf.gradients(L_relax, theta)[0]

            # Calculate the first optimization step with loss
            # list_of_gradient_variable_pairs = optimizer.compute_gradients(L_relax)
            # train_op = optimizer.apply_gradients(list_of_gradient_variable_pairs, global_step=global_step)

            # Variance reduction objective
            # variance_loss = tf.reduce_mean(tf.square(relax), name="variance_loss")

            # initialise adafactor again for variance optimiser
            # var_opt = adafactor.AdafactorOptimizer(
            #           learning_rate=lr,
            #           decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
            #           beta1=0.0)

            # est_params = [eta, log_temperature]  # TODO: REBAR implementation

            # Adds the parameters of the FFNN
            # nn_params = [tv for tv in tf.trainable_variables() if "Q_func" in tv.name]
            # est_params = nn_params
            # est_params = est_params + nn_params  # TODO: REBAR implementation

            # Additional optimization step
            # var_gradvars = var_opt.compute_gradients(variance_loss, var_list=est_params)
            # var_train_op = var_opt.apply_gradients(var_gradvars)

            # This may allow for both train ops to be passed in the return statement below?
            # with tf.control_dependencies([train_op, var_train_op]):
            #     train_op = tf.no_op()

            ###############################################################################################################
            # Calculate gradients
            # If freezing layers, only optimise wrt certain layers (find names) - speeds up, worsens performance
            # last_params = [tv for tv in tf.trainable_variables() if "decoder/LayerNorm/" in tv.name]
            # list_of_gradient_variable_pairs = optimizer.compute_gradients(combined_loss, var_list=last_params)

            list_of_gradient_variable_pairs = optimizer.compute_gradients(
                XENT_loss)
            train_op = optimizer.apply_gradients(
                list_of_gradient_variable_pairs, global_step=global_step)

            tf.logging.set_verbosity(tf.logging.INFO)
            # Debugging steps - add into logging hook directly if needed
            # tf.debugging.check_numerics(sum_logp, "DEBUG: sum_logp has a NaN")

            logging_hook = tf.train.LoggingTensorHook(
                {
                    "loss": XENT_loss,
                    # "variance_loss": variance_loss,
                    # "cv_loss": cv_loss,
                    "learning_rate": lr,
                    "global_step": global_step,
                },
                every_n_iter=5)

            # Return the configured TPUEstimatorSpec used to train the model
            return tpu_estimator.TPUEstimatorSpec(
                mode=mode,
                loss=XENT_loss,
                train_op=train_op,
                training_hooks=[logging_hook],
                scaffold_fn=_load_vars_from_checkpoint(use_tpu,
                                                       train_init_checkpoint),
                host_call=add_scalars_to_summary(
                    model_dir,
                    {
                        "learning_rate": lr,
                        # "rouge_loss_hard": rouge_loss_argmax,
                        # "rouge_loss_soft": rouge_loss_soft,
                        # "rouge_loss_extra": rouge_loss_extra,
                        # "reinforce_loss": reinforce_loss,
                        # "risk_loss": L_risk,
                        # "XENT_loss": XENT_loss,
                    }))

        # EVALUATION (evaluating the performance)
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics = model_params.estimator_eval_metrics_fn(
                features, outputs)
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  loss=XENT_loss,
                                                  eval_metrics=eval_metrics)
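Example 7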
  def model_fn(features, labels, mode, config, params):
    """Estimator model function."""

    del labels
    del config
    del params

    tf.compat.v1.get_variable_scope().set_initializer(
        tf.compat.v1.variance_scaling_initializer(
            1.0, mode="fan_avg", distribution="uniform"))

    if mode == tf.estimator.ModeKeys.PREDICT:
      predictions = model_params.estimator_prediction_fn(features)

      if include_features_in_predictions:
        predictions.update(features)

      if decode_keys:
        # Decode the raw ids into strings in prediction.
        def decode_host_call(tensor_dict):
          for key in decode_keys:
            predictions[key] = public_parsing_ops.decode(
                tensor_dict[key], model_params.vocab_filename,
                model_params.encoder_type)
          return tensor_dict

        contrib_tpu.outside_compilation(decode_host_call, predictions)
      return contrib_tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)

    training = mode == tf.estimator.ModeKeys.TRAIN
    if use_tpu and model_params.use_bfloat16:
      with contrib_tpu.bfloat16_scope():
        loss, outputs = model_params.model()(features, training)
    else:
      loss, outputs = model_params.model()(features, training)

    # TPU requires that all outputs have a batch dimension and can't handle scalars.
    # Tile all scalars into 1-dimensional vectors.
    outputs = _tile_scalar_to_batch_size(outputs, model_params.batch_size)

    if mode == tf.estimator.ModeKeys.TRAIN:
      init_lr = model_params.learning_rate
      global_step = tf.compat.v1.train.get_global_step()
      lr = init_lr / 0.01 * tf.math.rsqrt(
          tf.maximum(tf.cast(global_step, dtype=tf.float32), 10000))
      if train_init_checkpoint:
        lr = tf.minimum(
            tf.cast(global_step + 1, dtype=tf.float32) / train_warmup_steps * init_lr, lr)

      optimizer = adafactor.AdafactorOptimizer(
          learning_rate=lr,
          decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
          beta1=0.0)
      if use_tpu:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
      train_op = optimizer.minimize(loss, global_step=global_step)

      return tpu_estimator.TPUEstimatorSpec(
          mode=mode,
          loss=loss,
          train_op=train_op,
          scaffold_fn=_load_vars_from_checkpoint(use_tpu,
                                                 train_init_checkpoint),
          host_call=add_scalars_to_summary(model_dir, {"learning_rate": lr}))
    if mode == tf.estimator.ModeKeys.EVAL:
      eval_metrics = model_params.estimator_eval_metrics_fn(features, outputs)
      return tpu_estimator.TPUEstimatorSpec(
          mode=mode, loss=loss, eval_metrics=eval_metrics)
Example 8
    def model_fn(features, labels, mode, params):
        tf.logging.info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info('  name = %s, shape = %s' %
                            (name, features[name].shape))

        input_ids = features['input_ids']
        target_ids = features['target_ids']
        masked_lm_positions = features['masked_lm_positions']
        masked_lm_ids = features['masked_lm_ids']
        masked_lm_weights = features['masked_lm_weights']

        is_training = mode == tf.estimator.ModeKeys.TRAIN

        model = transformer.TransformerEncoderDecoderModel(
            vocab_size,
            hidden_size,
            filter_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            label_smoothing,
            dropout,
        )

        loss, outputs = model(
            {'inputs': input_ids, 'targets': target_ids}, training=is_training)

        # (
        #     masked_lm_loss,
        #     masked_lm_example_loss,
        #     masked_lm_log_probs,
        # ) = get_masked_lm_output(
        #     model._context['memory'],
        #     model._embedding_layer.weights_VxD,
        #     masked_lm_positions,
        #     masked_lm_ids,
        #     masked_lm_weights,
        # )

        total_loss = loss

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (
                assignment_map,
                initialized_variable_names,
            ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info('**** Trainable Variables ****')
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info('  name = %s, shape = %s%s', var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            init_lr = learning_rate
            global_step = tf.train.get_global_step()
            lr = (init_lr / 0.01 *
                  tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0,
            )
            if use_tpu:
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
            train_op = optimizer.minimize(loss, global_step=global_step)

            # global_step = tf.train.get_global_step()
            # lr = learning_rate_schedule_noam(
            #     global_step,
            #     total_train_steps = num_train_steps,
            #     warmup_steps = num_warmup_steps,
            # )

            # optimizer = adafactor.AdafactorOptimizer(
            #     learning_rate = lr, beta1 = 0.0
            # )
            # if use_tpu:
            #     optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

            # train_op = optimizer.minimize(loss, global_step = global_step)
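            # Note: the create_optimizer call below overwrites the Adafactor train_op
            # built above, so that optimizer is the one actually used for training.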

            train_op = optimization.create_optimizer(
                total_loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                use_tpu,
            )

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn,
            )
        elif mode == tf.estimator.ModeKeys.EVAL:
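            # Note: these metrics rely on masked_lm_example_loss and masked_lm_log_probs,
            # which are only produced by the get_masked_lm_output block commented out above.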

            def metric_fn(
                masked_lm_example_loss,
                masked_lm_log_probs,
                masked_lm_ids,
                masked_lm_weights,
            ):
                """Computes the loss and accuracy of the model."""
                masked_lm_log_probs = tf.reshape(
                    masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights,
                )
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                return {
                    'masked_lm_accuracy': masked_lm_accuracy,
                    'masked_lm_loss': masked_lm_mean_loss,
                }

            eval_metrics = (
                metric_fn,
                [
                    masked_lm_example_loss,
                    masked_lm_log_probs,
                    masked_lm_ids,
                    masked_lm_weights,
                ],
            )
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn,
            )
        else:
            raise ValueError('Only TRAIN and EVAL modes are supported: %s' %
                             (mode))

        return output_spec
Example 9
def model_fn(features, labels, mode, params):
    X = features['inputs']
    Y = features['targets']

    is_training = mode == tf.estimator.ModeKeys.TRAIN

    model = Model(X, Y)
    student = StudentModel(X, Y)

    student_logits = student.logits[:, :, 0, 0]
    student_task_xent, weights = padded_cross_entropy_loss(
        student_logits, student.Y)

    teacher_targets = tf.nn.softmax(model.logits[:, :, 0, 0] /
                                    distill_temperature)
    student_distill_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=tf.stop_gradient(teacher_targets),
        logits=student_logits / distill_temperature,
    )
    student_distill_xent = tf.reduce_sum(student_distill_xent * weights)
    student_distill_xent *= distill_temperature**2

    phase_loss = task_balance * student_task_xent
    phase_loss += (1 - task_balance) * student_distill_xent

    loss = phase_loss / tf.reduce_sum(weights)
    task_loss = student_task_xent / tf.reduce_sum(weights)
    distill_loss = student_distill_xent / tf.reduce_sum(weights)

    tf.identity(loss, 'total_loss')
    tf.identity(task_loss, 'task_loss')
    tf.identity(distill_loss, 'distill_loss')

    tf.summary.scalar('total_loss', loss)
    tf.summary.scalar('task_loss', task_loss)
    tf.summary.scalar('distill_loss', distill_loss)

    tvars = [v for v in tf.trainable_variables() if 'student/' not in v.name]

    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (
            assignment_map,
            initialized_variable_names,
        ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_or_create_global_step()
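        # lr stays at 1/sqrt(num_warmup_steps) during warmup, then decays as 1/sqrt(step).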
        lr = tf.rsqrt(tf.maximum(tf.to_float(global_step), num_warmup_steps))
        optimizer = adafactor.AdafactorOptimizer(learning_rate=lr, beta1=0.0)
        train_op = optimizer.minimize(loss, global_step=global_step)
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    train_op=train_op)

    elif mode == tf.estimator.ModeKeys.EVAL:

        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss)

    return estimator_spec
Example 10
    def model_fn(features, labels, mode, params):
        tf.logging.info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info('  name = %s, shape = %s' %
                            (name, features[name].shape))

        inputs = features['inputs']
        targets = features['targets']

        is_training = mode == tf.estimator.ModeKeys.TRAIN

        model = modeling.TransformerModel(bert_config)
        (llh, logits, pred_ids), _ = model(inputs,
                                           target_ids=targets,
                                           training=is_training)

        total_loss = padded_cross_entropy_loss(
            logits,
            targets,
            bert_config['label_smoothing'],
            bert_config['vocab_size'],
        )

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (
                assignment_map,
                initialized_variable_names,
            ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info('**** Trainable Variables ****')
        print(initialized_variable_names)
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info('  name = %s, shape = %s%s', var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            init_lr = learning_rate
            global_step = tf.train.get_global_step()
            lr = (init_lr / 0.01 *
                  tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0,
            )
            if use_tpu:
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

            train_op = optimizer.minimize(total_loss, global_step=global_step)

            # if not bert_config['use_bias']:
            #     logging.info('Fixing position embedding, i.e. not trainable.')
            #     posemb = 'pegasus/embeddings/position_embeddings'
            #     tvars = list(
            #         filter(lambda v: v.name.split(':')[0] != posemb, tvars)
            #     )

            # gradients = optimizer.compute_gradients(total_loss, tvars)

            # train_op = optimization.create_optimizer(
            #     total_loss,
            #     learning_rate,
            #     num_train_steps,
            #     num_warmup_steps,
            #     use_tpu,
            # )

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn,
            )
        elif mode == tf.estimator.ModeKeys.EVAL:

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=None,
                scaffold_fn=scaffold_fn,
            )
        else:
            raise ValueError('Only TRAIN and EVAL modes are supported: %s' %
                             (mode))

        return output_spec
Example 11
    def model_fn(features, labels, mode, params):
        tf.logging.info('*** Features ***')
        for name in sorted(features.keys()):
            tf.logging.info('  name = %s, shape = %s' %
                            (name, features[name].shape))

        inputs = features['input_ids']
        targets = features['target_ids']

        is_training = mode == tf.estimator.ModeKeys.TRAIN

        model = transformer.TransformerEncoderDecoderModel(
            vocab_size,
            hidden_size,
            filter_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            label_smoothing,
            dropout,
        )

        loss, outputs = model(
            {'inputs': inputs, 'targets': targets}, training=is_training)

        total_loss = loss

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (
                assignment_map,
                initialized_variable_names,
            ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info('**** Trainable Variables ****')
        for var in tvars:
            init_string = ''
            if var.name in initialized_variable_names:
                init_string = ', *INIT_FROM_CKPT*'
            tf.logging.info('  name = %s, shape = %s%s', var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            init_lr = learning_rate
            global_step = tf.train.get_global_step()
            lr = (init_lr / 0.01 *
                  tf.rsqrt(tf.maximum(tf.to_float(global_step), 10000)))

            optimizer = adafactor.AdafactorOptimizer(
                learning_rate=lr,
                decay_rate=adafactor.adafactor_decay_rate_pow(0.8),
                beta1=0.0,
            )
            if use_tpu:
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

            train_op = optimizer.minimize(loss, global_step=global_step)

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold_fn=scaffold_fn,
            )
        elif mode == tf.estimator.ModeKeys.EVAL:

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, scaffold_fn=scaffold_fn)
        else:
            raise ValueError('Only TRAIN and EVAL modes are supported: %s' %
                             (mode))

        return output_spec