import tensorflow as tf

def train_step(inputs):
    (input_ids, input_mask, input_segment_ids,
     target_ids_, target_mask, target_segment_ids,
     target_ids, draft_mask, refine_mask, grad_accum_flag) = inputs
    with tf.GradientTape() as tape:
        (draft_predictions, draft_attention_weights,
         refine_predictions, refine_attention_weights) = model(
            input_ids,
            input_mask,
            input_segment_ids,
            target_ids_,
            target_mask,
            target_segment_ids,
            True
        )
        train_variables = model.trainable_variables
        draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, draft_mask)
        refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, refine_mask)
        loss = draft_summary_loss + refine_summary_loss
        loss = tf.reduce_mean(loss)
        #loss = optimizer.get_scaled_loss(loss)
    gradients = tape.gradient(loss, train_variables)
    #gradients = optimizer.get_unscaled_gradients(gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Apply the accumulated gradients and reset them to zero when the flag is set
    if grad_accum_flag:
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, :-1], refine_predictions)
    return (loss, target_ids_[:, :-1], refine_predictions)
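# The train_step variants in this file assume some module-level state: an empty
# `accumulators` list used as shadow variables for gradient accumulation, and
# Keras metric objects for logging. A minimal sketch of that setup follows; the
# exact metric classes are an assumption (sparse categorical accuracy matches
# the integer-id targets used here, while the one-hot variants further down
# would need CategoricalAccuracy instead).
accumulators = []  # gradient shadow variables, created lazily on the first step
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')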
def train_step(inp, tar, inp_shape, tar_shape, batch):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        train_variables = transformer.trainable_variables
        tf.debugging.check_numerics(predictions, "Nan's in the transformer predictions")
        if config.copy_gen:
            predictions = pointer_generator(
                dec_output,
                predictions,
                attention_weights,
                inp,
                inp_shape,
                tar_shape,
                batch,
                training=True
            )
            tf.debugging.check_numerics(predictions, "Nan's in the pointer_generator predictions")
            train_variables = train_variables + pointer_generator.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
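# `create_masks` is not defined in this file. A minimal sketch, assuming the
# helpers follow the standard TensorFlow transformer tutorial (padding id 0,
# look-ahead mask combined with the target padding mask), is given below.
def create_padding_mask(seq):
    # Mark padding positions (token id 0) with 1.0 so attention can mask them out
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
    # Upper-triangular mask that hides future target positions from the decoder
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask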
def train_step(inp, tar, grad_accum_flag):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = model(
            inp,
            tar_inp,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
            training=True
        )
        train_variables = model.trainable_variables
        loss = loss_function(tar_real, predictions)
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average the accumulated gradients, apply them, and reset them to zero when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
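# `optimizer.get_scaled_loss` / `get_unscaled_gradients` are methods of a Keras
# loss-scale optimizer, so the steps above assume the optimizer has been wrapped
# for mixed-precision training. A minimal setup sketch follows; the base
# optimizer and learning rate are assumptions, and newer TensorFlow versions
# expose tf.keras.mixed_precision.LossScaleOptimizer without `experimental`.
tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=1e-4),  # assumed base optimizer
    loss_scale='dynamic'
)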
def train_step(inp, tar, grad_accum_flag):
    with tf.GradientTape() as tape:
        draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
            inp, tar, training=True)
        refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
            inp, tar, training=True)
        train_variables = (draft_summary_model.trainable_variables +
                           refine_summary_model.trainable_variables)
        draft_summary_loss = loss_function(tar[0][:, 1:, :], draft_predictions)
        refine_summary_loss = loss_function(tar[0][:, :-1, :], refine_predictions)
        loss = draft_summary_loss + refine_summary_loss
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average the accumulated gradients, apply them, and reset them to zero when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar[0][:, 1:, :], draft_predictions)
    train_accuracy(tar[0][:, :-1, :], refine_predictions)
def grad_accum(gradients, train_variables, optimizer, grad_accum_flag):
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average the accumulated gradients, apply them, and reset them to zero when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
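# A sketch of how the helper above could replace the inlined accumulation block
# in the train_step variants in this file. The `transformer`, `create_masks`,
# `loss_function`, and metric names are reused from the surrounding code; the
# call site itself is an assumed refactor, not part of the original loop.
def train_step_with_helper(inp, tar, grad_accum_flag):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, enc_padding_mask, combined_mask, dec_padding_mask, training=True)
        train_variables = transformer.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    # Accumulate, average, and apply the gradients via the shared helper
    grad_accum(gradients, train_variables, optimizer, grad_accum_flag)
    train_loss(loss)
    train_accuracy(tar_real, predictions)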
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp,
            tar_inp,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
            training=True
        )
        train_variables = transformer.trainable_variables
        loss = loss_function(tar_real, predictions)
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
def train_step(inp, tar, grad_accum_flag):
    target_ids_, target_mask, target_segment_ids = tar
    mask = tf.math.logical_not(tf.math.equal(target_ids_[:, 1:], 0))
    target_ids = label_smoothing(
        tf.one_hot(target_ids_, depth=config.input_vocab_size))
    with tf.GradientTape() as tape:
        draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
            inp, tar, training=True)
        # refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
        #     inp,
        #     tar,
        #     training=True
        # )
        train_variables = draft_summary_model.trainable_variables  #+ refine_summary_model.trainable_variables
        draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, mask)
        #refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions)
        loss = draft_summary_loss  #+ refine_summary_loss
        #scaled_loss = optimizer.get_scaled_loss(loss)
    gradients = tape.gradient(loss, train_variables)
    #gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average the accumulated gradients, apply them, and reset them to zero when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, 1:], draft_predictions)
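# `label_smoothing` is not defined in this file. A minimal sketch, assuming the
# usual formulation applied to one-hot targets (the epsilon value is an
# assumption), is given below: each 1 becomes 1 - epsilon and the remaining
# probability mass is spread uniformly over the vocabulary.
def label_smoothing(inputs, epsilon=0.1):
    vocab_size = tf.cast(tf.shape(inputs)[-1], inputs.dtype)
    return (1.0 - epsilon) * inputs + (epsilon / vocab_size)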