Example #1
def train_step(inputs):
  (input_ids, input_mask, input_segment_ids,
   target_ids_, target_mask, target_segment_ids,
   target_ids, draft_mask, refine_mask, grad_accum_flag) = inputs
  with tf.GradientTape() as tape:
    (draft_predictions, draft_attention_weights,
     refine_predictions, refine_attention_weights) = model(
                                                           input_ids, input_mask, input_segment_ids,
                                                           target_ids_, target_mask, target_segment_ids,
                                                           True  # training flag
                                                           )
    train_variables = model.trainable_variables
    draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, draft_mask)
    refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, refine_mask)
    loss = draft_summary_loss + refine_summary_loss
    loss = tf.reduce_mean(loss)
    #loss = optimizer.get_scaled_loss(loss)
  gradients = tape.gradient(loss, train_variables)
  #gradients = optimizer.get_unscaled_gradients(gradients)
  # Initialize the shadow variables with the same shape and type as the gradients
  if not accumulators:
    for tv in gradients:
      accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
  # Accumulate the gradients into the shadow variables
  for (accumulator, grad) in zip(accumulators, gradients):
    accumulator.assign_add(grad)
  # Apply the accumulated gradients and reset them to zero when the flag is set
  if grad_accum_flag:
    optimizer.apply_gradients(zip(accumulators, train_variables))
    for accumulator in accumulators:
      accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, :-1], refine_predictions)
  return (loss, target_ids_[:, :-1], refine_predictions)
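Example #1 accumulates raw gradient sums into a module-level accumulators list and applies them only when grad_accum_flag is set. Below is a minimal standalone sketch of the same shadow-variable pattern; the function name and the accumulation_steps argument are assumptions, and unlike Example #1 this version averages the sums before applying them, as the later examples do.

import tensorflow as tf

accumulators = []  # shadow variables, lazily created to mirror each gradient

def accumulate_and_maybe_apply(gradients, variables, optimizer,
                               apply_now, accumulation_steps=4):
  # Lazily create one zero-initialized shadow variable per gradient
  if not accumulators:
    for grad in gradients:
      accumulators.append(tf.Variable(tf.zeros_like(grad), trainable=False))
  # Add this micro-batch's gradients to the running sums
  for accumulator, grad in zip(accumulators, gradients):
    accumulator.assign_add(grad)
  if apply_now:
    # Average over the micro-batches, apply, then reset to zero
    for accumulator in accumulators:
      accumulator.assign(accumulator / accumulation_steps)
    optimizer.apply_gradients(zip(accumulators, variables))
    for accumulator in accumulators:
      accumulator.assign(tf.zeros_like(accumulator))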
Example #2
def train_step(inp, tar, inp_shape, tar_shape, batch):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, True, enc_padding_mask, combined_mask,
            dec_padding_mask)
        train_variables = transformer.trainable_variables
        tf.debugging.check_numerics(predictions,
                                    "NaNs in the transformer predictions")
        if config.copy_gen:
            predictions = pointer_generator(dec_output,
                                            predictions,
                                            attention_weights,
                                            inp,
                                            inp_shape,
                                            tar_shape,
                                            batch,
                                            training=True)
            tf.debugging.check_numerics(
                predictions, "NaNs in the pointer_generator predictions")
            # Include the pointer-generator's variables only when it is used
            train_variables = train_variables + pointer_generator.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
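create_masks is not defined in these snippets; in the standard TensorFlow transformer tutorial it returns a padding mask for the encoder, a combined look-ahead-plus-padding mask for the decoder's self-attention, and a padding mask for the decoder's cross-attention. A sketch under that assumption (padding id 0 assumed):

import tensorflow as tf

def create_padding_mask(seq):
  # 1.0 at positions holding the padding id (assumed to be 0)
  mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
  # Upper-triangular matrix of 1s that hides future positions
  return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_masks(inp, tar):
  enc_padding_mask = create_padding_mask(inp)   # masks encoder self-attention
  dec_padding_mask = create_padding_mask(inp)   # masks decoder cross-attention
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  return enc_padding_mask, combined_mask, dec_padding_mask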
Example #3
def train_step(inp, tar, grad_accum_flag):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  with tf.GradientTape() as tape:
    predictions, attention_weights, dec_output = model(
                                                       inp, 
                                                       tar_inp, 
                                                       enc_padding_mask, 
                                                       combined_mask, 
                                                       dec_padding_mask,
                                                       training=True
                                                       )
    train_variables = model.trainable_variables
    loss = loss_function(tar_real, predictions)
    scaled_loss = optimizer.get_scaled_loss(loss)
  scaled_gradients = tape.gradient(scaled_loss, train_variables)
  gradients = optimizer.get_unscaled_gradients(scaled_gradients)
  # Initialize the shadow variables with the same shape and type as the gradients
  if not accumulators:
    for tv in gradients:
      accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
  # Accumulate the gradients into the shadow variables
  for (accumulator, grad) in zip(accumulators, gradients):
    accumulator.assign_add(grad)
  # Average, apply, and reset the accumulated gradients when the flag is set
  if grad_accum_flag:
    for accumulator in accumulators:
      accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
    optimizer.apply_gradients(zip(accumulators, train_variables))
    for accumulator in accumulators:
      accumulator.assign(tf.zeros_like(accumulator))
  train_loss(loss)
  train_accuracy(tar_real, predictions)  
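get_scaled_loss and get_unscaled_gradients are methods of tf.keras.mixed_precision.LossScaleOptimizer, so the optimizer in Example #3 is presumably wrapped for float16 training. A minimal setup sketch, assuming TF 2.4+ (earlier releases used the mixed_precision.experimental namespace) and a hypothetical learning rate:

import tensorflow as tf

# Compute in float16 while keeping variables in float32
tf.keras.mixed_precision.set_global_policy('mixed_float16')

# The wrapper adds get_scaled_loss / get_unscaled_gradients, which apply
# dynamic loss scaling to keep small float16 gradients from underflowing
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=1e-4))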
Example #4
def train_step(inp, tar, grad_accum_flag):
    with tf.GradientTape() as tape:
        draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
            inp, tar, training=True)
        refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
            inp, tar, training=True)
        train_variables = draft_summary_model.trainable_variables + refine_summary_model.trainable_variables
        draft_summary_loss = loss_function(tar[0][:, 1:, :], draft_predictions)
        refine_summary_loss = loss_function(tar[0][:, :-1, :],
                                            refine_predictions)
        loss = draft_summary_loss + refine_summary_loss
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and type as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv),
                                            trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average, apply, and reset the accumulated gradients when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(
                tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar[0][:, 1:, :], draft_predictions)
    train_accuracy(tar[0][:, :-1, :], refine_predictions)
Example #5
def grad_accum(gradients, train_variables, optimizer, grad_accum_flag):
    # Initialize the shadow variables with the same shape and type as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv),
                                            trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average, apply, and reset the accumulated gradients when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(
                tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
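The original grad_accum read train_variables and grad_accum_flag from an enclosing scope where they were never defined; the version above takes them as parameters instead. A hypothetical call site (the dataset loop, transformer signature, and h_parms.accumulation_steps follow the surrounding examples):

for step, (inp, tar) in enumerate(dataset):
    tar_inp, tar_real = tar[:, :-1], tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    # Apply the accumulated gradients once every accumulation_steps batches
    apply_now = (step + 1) % h_parms.accumulation_steps == 0
    with tf.GradientTape() as tape:
        predictions, _, _ = transformer(inp, tar_inp, enc_padding_mask,
                                        combined_mask, dec_padding_mask,
                                        training=True)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    grad_accum(gradients, transformer.trainable_variables, optimizer, apply_now)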
Example #6
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  with tf.GradientTape() as tape:
    predictions, attention_weights, dec_output = transformer(
                                                             inp, 
                                                             tar_inp, 
                                                             enc_padding_mask, 
                                                             combined_mask, 
                                                             dec_padding_mask,
                                                             training=True
                                                             )
    train_variables = transformer.trainable_variables
    loss = loss_function(tar_real, predictions)
    scaled_loss = optimizer.get_scaled_loss(loss)
  scaled_gradients = tape.gradient(scaled_loss, train_variables)
  gradients = optimizer.get_unscaled_gradients(scaled_gradients)
  optimizer.apply_gradients(zip(gradients, train_variables))
  train_loss(loss)
  train_accuracy(tar_real, predictions)  
Example #7
def train_step(inp, tar, grad_accum_flag):
    target_ids_, target_mask, target_segment_ids = tar
    mask = tf.math.logical_not(tf.math.equal(target_ids_[:, 1:], 0))
    target_ids = label_smoothing(
        tf.one_hot(target_ids_, depth=config.input_vocab_size))
    with tf.GradientTape() as tape:
        draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
            inp, tar, training=True)
        # refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
        #                                                                                    inp,
        #                                                                                    tar,
        #                                                                                    training=True
        #                                                                                    )
        train_variables = draft_summary_model.trainable_variables  #+ refine_summary_model.trainable_variables
        draft_summary_loss = loss_function(target_ids[:, 1:, :],
                                           draft_predictions, mask)
        #refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions)
        loss = draft_summary_loss  #+ refine_summary_loss
        #scaled_loss = optimizer.get_scaled_loss(loss)
    gradients = tape.gradient(loss, train_variables)
    #gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and type as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv),
                                            trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average, apply, and reset the accumulated gradients when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(
                tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, 1:], draft_predictions)
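label_smoothing is referenced in Example #7 but not shown; a common formulation keeps 1 - epsilon of the probability mass on the true class and spreads epsilon uniformly over the vocabulary. A sketch under that assumption (the epsilon value is hypothetical):

import tensorflow as tf

def label_smoothing(one_hot_targets, epsilon=0.1):
    # Mix the one-hot distribution with a uniform distribution:
    # the true class keeps 1 - epsilon, every class shares epsilon / V
    vocab_size = tf.cast(tf.shape(one_hot_targets)[-1], one_hot_targets.dtype)
    return (1.0 - epsilon) * one_hot_targets + epsilon / vocab_size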