import os

import tensorflow as tf

# Model, optimizer, config, log, create_masks, loss_function, train_loss,
# train_accuracy and gradient_accumulators are assumed to be module-level
# objects defined elsewhere in the repository.

def eval_step(input_ids, target_ids):
    # Decoder input: all target tokens except the last (teacher forcing).
    target_inp = target_ids[:, :-1]
    _, combined_mask, dec_padding_mask = create_masks(input_ids, target_inp)
    (draft_predictions, draft_attention_weights,
     refine_predictions, refine_attention_weights) = Model(
        input_ids,
        dec_padding_mask=dec_padding_mask,
        target_ids=target_inp,
        look_ahead_mask=combined_mask,
        training=False,
    )
    loss, target = loss_function(target_ids, draft_predictions,
                                 refine_predictions, Model)
    train_loss(loss)
    log.info(Model.summary())
    if config.save_initial_weights:
        initial_weights = os.path.join(config.initial_weights, 'initial_weights')
        Model.save_weights(initial_weights)
    return loss
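# A minimal sketch of how the eval_step above might be driven over a
# validation split. `val_dataset` is an assumption, not part of the original
# code: any tf.data.Dataset yielding (input_ids, target_ids) batches would do.
def evaluate(val_dataset):
    total, count = 0.0, 0
    for (input_ids, target_ids) in val_dataset:
        total += eval_step(input_ids, target_ids)
        count += 1
    # Average the per-batch losses over the validation split.
    return total / count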
def eval_step(input_ids, target_ids_, target_ids, draft_mask, refine_mask):
    (draft_predictions, draft_attention_weights,
     refine_predictions, refine_attention_weights) = Model(input_ids, target_ids_, False)
    # Draft loss compares against the targets shifted left by one position.
    draft_output_sequence_loss = loss_function(target_ids[:, 1:, :],
                                               draft_predictions, draft_mask)
    if config.use_refine_decoder:
        # Refine loss compares against the targets minus the final position.
        refine_output_sequence_loss = loss_function(target_ids[:, :-1, :],
                                                    refine_predictions, refine_mask)
    else:
        refine_output_sequence_loss = 0
    regularization_loss = tf.add_n(Model.losses)
    loss = draft_output_sequence_loss + refine_output_sequence_loss + regularization_loss
    log.info(Model.summary())
    if config.save_initial_weights:
        initial_weights = os.path.join(config.initial_weights, 'initial_weights')
        Model.save_weights(initial_weights)
    return loss
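# A minimal sketch of a loss_function compatible with the (real, pred, mask)
# signature used above. The [:, 1:, :] slicing implies 3-D (one-hot or soft)
# targets, so CategoricalCrossentropy is assumed here; the repo's actual loss
# may differ. The masking pattern (zero out padding positions, then average
# over real tokens) is the standard one.
loss_object = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

def loss_function(real, pred, mask):
    loss_ = loss_object(real, pred)          # per-position loss, (batch, seq)
    mask = tf.cast(mask, dtype=loss_.dtype)  # 1 for real tokens, 0 for padding
    loss_ *= mask
    # Average only over the unmasked (non-padding) positions.
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)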
def train_step(input_ids, target_ids_, target_ids, draft_mask, refine_mask, grad_accum_flag):
    with tf.GradientTape() as tape:
        (draft_predictions, draft_attention_weights,
         refine_predictions, refine_attention_weights) = Model(input_ids, target_ids_, True)
        train_variables = Model.trainable_variables
        draft_output_sequence_loss = loss_function(target_ids[:, 1:, :],
                                                   draft_predictions, draft_mask)
        if config.use_refine_decoder:
            refine_output_sequence_loss = loss_function(target_ids[:, :-1, :],
                                                        refine_predictions, refine_mask)
            predictions = refine_predictions
            target = target_ids_[:, :-1]
        else:
            refine_output_sequence_loss = 0
            predictions = draft_predictions
            target = target_ids_[:, 1:]
        regularization_loss = tf.add_n(Model.losses)
        loss = draft_output_sequence_loss + refine_output_sequence_loss + regularization_loss
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same type as the gradients.
    if not gradient_accumulators:
        for tv in gradients:
            gradient_accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables.
    for (accumulator, grad) in zip(gradient_accumulators, gradients):
        accumulator.assign_add(grad)
    # Apply the gradients and reset them to zero if the flag is set.
    if grad_accum_flag:
        optimizer.apply_gradients(zip(gradient_accumulators, train_variables))
        for accumulator in gradient_accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target, predictions)
    return predictions
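# A minimal sketch of driving the train_step above so that gradients are
# applied once every N batches. `train_dataset` and the config name
# `accumulation_steps` are assumptions; only grad_accum_flag's role
# (apply and reset when True) comes from the code above.
for (step, (input_ids, target_ids_, target_ids,
            draft_mask, refine_mask)) in enumerate(train_dataset):
    # The flag is True on every Nth batch, so the shadow variables hold the
    # sum of N batches' gradients when apply_gradients finally runs.
    grad_accum_flag = ((step + 1) % config.accumulation_steps == 0)
    predictions = train_step(input_ids, target_ids_, target_ids,
                             draft_mask, refine_mask, grad_accum_flag)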
def train_step(input_ids, target_ids, grad_accum_flag):
    _, combined_mask, dec_padding_mask = create_masks(input_ids, target_ids[:, :-1])
    with tf.GradientTape() as tape:
        (draft_logits, refine_logits,
         draft_attention_weights, refine_attention_weights,
         candidate_returns, sample_returns) = Model(
            input_ids,
            dec_padding_mask=dec_padding_mask,
            target_ids=target_ids,
            look_ahead_mask=combined_mask,
            training=True,
        )
        train_variables = Model.trainable_variables
        loss, bert_f1_score = loss_function(target_ids,
                                            draft_logits,
                                            refine_logits,
                                            candidate_returns,
                                            sample_returns)
        regularization_loss = tf.add_n(Model.losses)
        total_loss = tf.reduce_sum([loss, regularization_loss])
        # Scale the combined loss (not just `loss`) so the regularization
        # term actually contributes to the gradients.
        scaled_loss = optimizer.get_scaled_loss(total_loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    if config.accumulate_gradients:
        # Initialize the shadow variables with the same type as the gradients.
        if not gradient_accumulators:
            for tv in gradients:
                gradient_accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
        # Accumulate the gradients into the shadow variables.
        for (accumulator, grad) in zip(gradient_accumulators, gradients):
            accumulator.assign_add(grad)
        # Apply the gradients and reset them to zero if the flag is set.
        if grad_accum_flag:
            optimizer.apply_gradients(zip(gradient_accumulators, train_variables))
            for accumulator in gradient_accumulators:
                accumulator.assign(tf.zeros_like(accumulator))
        train_loss(loss)
    else:
        optimizer.apply_gradients(zip(gradients, train_variables))
        train_loss(loss)
    return refine_logits, bert_f1_score
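# get_scaled_loss / get_unscaled_gradients belong to Keras' LossScaleOptimizer,
# so the `optimizer` used above is presumably wrapped for mixed precision.
# A minimal sketch of that setup (TF 2.4+ API; the learning rate and the
# inner Adam optimizer are assumptions):
tf.keras.mixed_precision.set_global_policy('mixed_float16')
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(learning_rate=3e-4))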
def train_step(input_ids, target_ids, grad_accum_flag):
    # Decoder input: all target tokens except the last (teacher forcing).
    target_inp = target_ids[:, :-1]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_ids, target_inp)
    with tf.GradientTape() as tape:
        (draft_predictions, draft_attention_weights,
         refine_predictions, refine_attention_weights) = Model(
            input_ids,
            dec_padding_mask=dec_padding_mask,
            target_ids=target_inp,
            enc_padding_mask=enc_padding_mask,
            look_ahead_mask=combined_mask,
            training=True,
        )
        train_variables = Model.trainable_variables
        loss, target = loss_function(target_ids, draft_predictions,
                                     refine_predictions, Model)
        # Report accuracy on the refine decoder's output when it is enabled.
        predictions = refine_predictions if refine_predictions is not None else draft_predictions
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    if config.accumulate_gradients:
        # Initialize the shadow variables with the same type as the gradients.
        if not gradient_accumulators:
            for tv in gradients:
                gradient_accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
        # Accumulate the gradients into the shadow variables.
        for (accumulator, grad) in zip(gradient_accumulators, gradients):
            accumulator.assign_add(grad)
        # Apply the gradients and reset them to zero if the flag is set.
        if grad_accum_flag:
            optimizer.apply_gradients(zip(gradient_accumulators, train_variables))
            for accumulator in gradient_accumulators:
                accumulator.assign(tf.zeros_like(accumulator))
        train_loss(loss)
        train_accuracy(target, predictions)
    else:
        optimizer.apply_gradients(zip(gradients, train_variables))
        train_loss(loss)
        train_accuracy(target, predictions)
    return predictions
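# A minimal sketch of the create_masks helper assumed by the steps above,
# following the standard TensorFlow transformer-tutorial pattern and matching
# the (enc_padding_mask, combined_mask, dec_padding_mask) return signature
# used here (padding id 0 is an assumption):
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]   # (batch, 1, 1, seq_len)

def create_look_ahead_mask(size):
    # Upper-triangular matrix masks out future positions in self-attention.
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_masks(input_ids, target_ids):
    enc_padding_mask = create_padding_mask(input_ids)
    dec_padding_mask = create_padding_mask(input_ids)
    look_ahead_mask = create_look_ahead_mask(tf.shape(target_ids)[1])
    dec_target_padding_mask = create_padding_mask(target_ids)
    # Combine look-ahead and target-padding masks for decoder self-attention.
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask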