def train_step(inputs):
    (input_ids, input_mask, input_segment_ids,
     target_ids_, target_mask, target_segment_ids,
     target_ids, draft_mask, refine_mask, grad_accum_flag) = inputs
    with tf.GradientTape() as tape:
        (draft_predictions, draft_attention_weights,
         refine_predictions, refine_attention_weights) = model(
            input_ids,
            input_mask,
            input_segment_ids,
            target_ids_,
            target_mask,
            target_segment_ids,
            True
        )
        train_variables = model.trainable_variables
        draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, draft_mask)
        refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, refine_mask)
        loss = draft_summary_loss + refine_summary_loss
        loss = tf.reduce_mean(loss)
        # loss = optimizer.get_scaled_loss(loss)
    gradients = tape.gradient(loss, train_variables)
    # gradients = optimizer.get_unscaled_gradients(gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Apply the accumulated gradients and reset them to zero when the flag is set
    if grad_accum_flag:
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, :-1], refine_predictions)
    return (loss, target_ids_[:, :-1], refine_predictions)

def train_step(inp, tar, grad_accum_flag):
    with tf.GradientTape() as tape:
        draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
            inp, tar, training=True)
        refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
            inp, tar, training=True)
        train_variables = (draft_summary_model.trainable_variables +
                           refine_summary_model.trainable_variables)
        draft_summary_loss = loss_function(tar[0][:, 1:, :], draft_predictions)
        refine_summary_loss = loss_function(tar[0][:, :-1, :], refine_predictions)
        loss = draft_summary_loss + refine_summary_loss
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average and apply the gradients, then reset them to zero, when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar[0][:, 1:, :], draft_predictions)
    train_accuracy(tar[0][:, :-1, :], refine_predictions)

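# The train_step variants above share one gradient-accumulation pattern:
# per-batch gradients are summed into non-trainable "shadow" variables and
# applied once every few batches. A minimal, self-contained sketch of that
# pattern follows; the toy model, optimizer, and accumulation_steps value are
# illustrative assumptions, not part of the code above.
import tensorflow as tf

toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
toy_model.build(input_shape=(None, 4))
toy_optimizer = tf.keras.optimizers.Adam()
accumulation_steps = 4
shadow_vars = [tf.Variable(tf.zeros_like(v), trainable=False)
               for v in toy_model.trainable_variables]

def accumulating_train_step(x, y, apply_now):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(tf.square(toy_model(x) - y))
    grads = tape.gradient(loss, toy_model.trainable_variables)
    # Sum this batch's gradients into the shadow variables.
    for acc, grad in zip(shadow_vars, grads):
        acc.assign_add(grad)
    if apply_now:
        # Average over the accumulated batches, apply once, then reset.
        for acc in shadow_vars:
            acc.assign(acc / accumulation_steps)
        toy_optimizer.apply_gradients(zip(shadow_vars, toy_model.trainable_variables))
        for acc in shadow_vars:
            acc.assign(tf.zeros_like(acc))
    return loss
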
def val_step(input_ids, input_mask, input_segment_ids, target_ids_, target_mask,
             target_segment_ids, target_ids, draft_mask, refine_mask, step, create_summ):
    (draft_predictions, draft_attention_weights,
     refine_predictions, refine_attention_weights) = model(
        input_ids,
        input_mask,
        input_segment_ids,
        target_ids_,
        target_mask,
        target_segment_ids,
        False
    )
    draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, draft_mask)
    refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, refine_mask)
    loss = draft_summary_loss + refine_summary_loss
    loss = tf.reduce_mean(loss)
    validation_loss(loss)
    validation_accuracy(target_ids_[:, :-1], refine_predictions)
    # Write ROUGE/BERT summaries only when requested; otherwise return neutral scores
    if create_summ:
        rouge, bert = tf_write_summary(target_ids_[:, :-1], refine_predictions, step)
    else:
        rouge, bert = (1.0, 1.0)
    return (rouge, bert)

def val_step(inp, tar, epoch, create_summ):
    target_ids_, target_mask, target_segment_ids = tar
    mask = tf.math.logical_not(tf.math.equal(target_ids_[:, 1:], 0))
    target_ids = label_smoothing(
        tf.one_hot(target_ids_, depth=config.input_vocab_size))
    (draft_predictions, draft_attention_weights, draft_dec_output,
     refine_predictions, refine_attention_weights, refine_dec_output) = model(
        inp, tar, training=False)
    draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, mask)
    refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, mask)
    loss = draft_summary_loss + refine_summary_loss
    validation_loss(loss)
    validation_accuracy(target_ids_[:, :-1], refine_predictions)

def train_step(inp, tar, grad_accum_flag):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = model(
            inp,
            tar_inp,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
            training=True
        )
        train_variables = model.trainable_variables
        loss = loss_function(tar_real, predictions)
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average and apply the gradients, then reset them to zero, when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(tar_real, predictions)

def train_step(inp, tar, inp_shape, tar_shape, batch):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        train_variables = transformer.trainable_variables
        tf.debugging.check_numerics(predictions, "NaNs in the transformer predictions")
        # Blend in the pointer-generator's copy distribution when copy_gen is enabled
        if config.copy_gen:
            predictions = pointer_generator(
                dec_output,
                predictions,
                attention_weights,
                inp,
                inp_shape,
                tar_shape,
                batch,
                training=True
            )
            tf.debugging.check_numerics(predictions, "NaNs in the pointer_generator predictions")
            train_variables = train_variables + pointer_generator.trainable_variables
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, train_variables)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)

def val_step(inp, tar, inp_shape, tar_shape, batch):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    predictions, attention_weights, dec_output = transformer(
        inp, tar_inp, False, enc_padding_mask, combined_mask, dec_padding_mask
    )
    if config.copy_gen:
        predictions = pointer_generator(
            dec_output,
            predictions,
            attention_weights,
            inp,
            inp_shape,
            tar_shape,
            batch,
            training=False
        )
    loss = loss_function(tar_real, predictions)
    validation_loss(loss)
    validation_accuracy(tar_real, predictions)

def val_step(inp, tar, epoch, inp_shape, tar_shape, batch, create_summ):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    predictions, attention_weights, dec_output = transformer(
        inp, tar_inp, False, enc_padding_mask, combined_mask, dec_padding_mask)
    if config.copy_gen:
        predictions = pointer_generator(
            dec_output,
            predictions,
            attention_weights,
            inp,
            inp_shape,
            tar_shape,
            batch,
            training=False
        )
    loss = loss_function(tar_real, predictions)
    validation_loss(loss)
    validation_accuracy(tar_real, predictions)
    if create_summ:
        rouge, bert = tf_write_summary(tar_real, predictions, inp[:, 1:], epoch)
    else:
        rouge, bert = (1.0, 1.0)
    return (rouge, bert)

def val_step(inp, tar, epoch, create_summ):
    draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
        inp, tar, training=False)
    refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
        inp, tar, training=False)
    draft_summary_loss = loss_function(tar[0][:, 1:, :], draft_predictions)
    refine_summary_loss = loss_function(tar[0][:, :-1, :], refine_predictions)
    loss = draft_summary_loss + refine_summary_loss
    validation_loss(loss)
    validation_accuracy(tar[0][:, 1:, :], draft_predictions)
    validation_accuracy(tar[0][:, :-1, :], refine_predictions)
    if create_summ:
        rouge, bert = tf_write_summary(tar[0][:, :-1, :], refine_predictions, inp[:, 1:], epoch)
    else:
        rouge, bert = (1.0, 1.0)
    return (rouge, bert)

def train_step(inp, tar_inp, tar_real):
    enc_padding_mask, combined_mask, dec_padding_mask = utils.create_masks(inp, tar_inp)
    # shape(inp) = (batch_size, pad_size)
    # shape(predictions) = (batch_size, pad_size, tar_vocab_size)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask,
                                     combined_mask, dec_padding_mask)
        loss = metrics.loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)

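# create_masks is called throughout these snippets but not defined here. Its
# three-mask signature matches the TensorFlow transformer tutorial's version,
# sketched below under that assumption (padding token id 0 is assumed):
import tensorflow as tf

def create_padding_mask(seq):
    # 1.0 where seq is padding (id 0), broadcastable over attention logits
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    # Upper-triangular mask that hides future target positions
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)   # encoder self-attention
    dec_padding_mask = create_padding_mask(inp)   # decoder-encoder attention
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask
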
def train_step(input_ids, input_mask, input_segment_ids, target_ids, target_mask,
               target_segment_ids, grad_accum_flag):
    inp = input_ids, input_mask, input_segment_ids
    tar = target_ids, target_mask, target_segment_ids
    target_ids_ = target_ids
    mask = tf.math.logical_not(tf.math.equal(target_ids_[:, 1:], 0))
    target_ids = label_smoothing(
        tf.one_hot(target_ids_, depth=config.input_vocab_size))
    with tf.GradientTape() as tape:
        (draft_predictions, draft_attention_weights, draft_dec_output,
         refine_predictions, refine_attention_weights, refine_dec_output) = model(
            inp, tar, True)
        train_variables = model.trainable_variables
        draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, mask)
        refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions, mask)
        loss = draft_summary_loss + refine_summary_loss
        # scaled_loss = optimizer.get_scaled_loss(loss)
    gradients = tape.gradient(loss, train_variables)
    # gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    # Initialize the shadow variables with the same shape and dtype as the gradients
    if not accumulators:
        for tv in gradients:
            accumulators.append(tf.Variable(tf.zeros_like(tv), trainable=False))
    # Accumulate the gradients into the shadow variables
    for (accumulator, grad) in zip(accumulators, gradients):
        accumulator.assign_add(grad)
    # Average and apply the gradients, then reset them to zero, when the flag is set
    if grad_accum_flag:
        for accumulator in accumulators:
            accumulator.assign(tf.math.divide(accumulator, h_parms.accumulation_steps))
        optimizer.apply_gradients(zip(accumulators, train_variables))
        for accumulator in accumulators:
            accumulator.assign(tf.zeros_like(accumulator))
    train_loss(loss)
    train_accuracy(target_ids_[:, 1:], draft_predictions)
    train_accuracy(target_ids_[:, :-1], refine_predictions)

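# label_smoothing is applied to the one-hot targets above but never defined in
# these snippets. A common implementation follows the formula from "Attention
# Is All You Need"; the epsilon default here is an assumption:
import tensorflow as tf

def label_smoothing(one_hot_targets, epsilon=0.1):
    # Redistribute epsilon of the probability mass uniformly over the vocabulary
    vocab_size = tf.cast(tf.shape(one_hot_targets)[-1], tf.float32)
    return (1.0 - epsilon) * one_hot_targets + (epsilon / vocab_size)
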
def train_step(inp, tar, grad_accum_flag):
    accumulators = []
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    # Decode with beam search; take the top beam of the final step as the prediction
    translated_output_temp, tape = beam_search_eval(inp, tar_real, h_parms.train_beam_size)
    predictions = translated_output_temp[-1][:, 0, :]
    # print(tar_real.shape, predictions.shape)
    train_variables = model.trainable_variables
    loss = loss_function(tar_real, predictions)
    scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)

def train_step(images, word_target):
    # word_target shape: (batch_size, max_txt_length, vocab_size)
    loss = 0
    hidden = tf.zeros((BATCH_SIZE, decode_units))
    word_one_hot = word_target[:, 0, :]  # one-hot vector for the 'START' token
    with tf.GradientTape() as tape:
        # Teacher forcing: feed the ground-truth word as the next input
        for i in range(1, word_target.shape[1]):
            y_pred, hidden = model(word_one_hot, hidden, images)
            word_one_hot = word_target[:, i, :]
            loss += loss_function(word_target[:, i, :], y_pred)
    batch_loss = loss / int(word_target.shape[1])
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

def val_step(inp, tar, epoch, create_summ):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    predictions, attention_weights, dec_output = model(
        inp,
        tar_inp,
        enc_padding_mask,
        combined_mask,
        dec_padding_mask,
        training=False
    )
    loss = loss_function(tar_real, predictions)
    validation_loss(loss)
    validation_accuracy(tar_real, predictions)
    if create_summ:
        rouge, bert = tf_write_summary(tar_real, predictions, inp[:, 1:], epoch)
    else:
        rouge, bert = (1.0, 1.0)
    return (rouge, bert)

def train(X, Y, act_fun, act_fun_back, architecture, loss_metric, learning_rate,
          epochs, metrics_period):
    layers = len(architecture)
    params = init_params(architecture)
    iterations = 0
    for epoch in range(epochs):
        for example_idx in range(len(X)):
            x = algebra.Vector(X[example_idx])
            y = algebra.Vector(Y[example_idx])
            y_hat, layer_outputs = propagation.net_forward_prop(
                layers, x, params, act_fun)
            output_gradient = propagation.output_gradient(y, y_hat)
            param_gradients = propagation.net_back_prop(
                layers, layer_outputs, output_gradient, params, act_fun_back)
            update_params(layers, params, param_gradients, learning_rate)
            iterations += 1
            # Metrics
            if iterations % metrics_period == 0:
                m_y_hat_list = []
                for m_idx in range(len(X)):
                    m_x = algebra.Vector(X[m_idx])
                    m_y_hat, _ = propagation.net_forward_prop(
                        layers, m_x, params, act_fun)
                    m_y_hat_list.append(m_y_hat.vector)
                loss = metrics.loss_function(m_y_hat_list, Y, loss_metric)
                accuracy = metrics.accuracy(m_y_hat_list, Y)
                print('Epoch: {}\tIter: {}k\t\tLoss: {}\t\tAccuracy: {}'.format(
                    epoch + 1, iterations / 1000, loss, accuracy))
    memory['layers'] = layers
    memory['params'] = params
    memory['act_fun'] = act_fun

def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, attention_weights, dec_output = transformer(
            inp,
            tar_inp,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
            training=True
        )
        train_variables = transformer.trainable_variables
        loss = loss_function(tar_real, predictions)
        # Scale the loss so float16 gradients do not underflow
        scaled_loss = optimizer.get_scaled_loss(loss)
    scaled_gradients = tape.gradient(scaled_loss, train_variables)
    gradients = optimizer.get_unscaled_gradients(scaled_gradients)
    optimizer.apply_gradients(zip(gradients, train_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)

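# The get_scaled_loss / get_unscaled_gradients calls above belong to Keras's
# loss-scale optimizer for mixed-precision training. A minimal sketch of the
# setup those calls assume (TF 2.4+ API; the Adam base optimizer is an
# illustrative assumption):
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy('mixed_float16')
base_optimizer = tf.keras.optimizers.Adam()
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(base_optimizer)
# Inside a step: scale the loss before tape.gradient, then unscale the
# gradients before apply_gradients, exactly as train_step above does.
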
def val_step(inp, tar, epoch, create_summ):
    target_ids_, target_mask, target_segment_ids = tar
    mask = tf.math.logical_not(tf.math.equal(target_ids_[:, 1:], 0))
    # target_ids = tf.one_hot(target_ids, config.input_vocab_size)
    target_ids = label_smoothing(
        tf.one_hot(target_ids_, depth=config.input_vocab_size))
    draft_predictions, draft_attention_weights, draft_dec_output = draft_summary_model(
        inp, tar, training=False)
    # refine_predictions, refine_attention_weights, refine_dec_output = refine_summary_model(
    #     inp, tar, training=False)
    draft_summary_loss = loss_function(target_ids[:, 1:, :], draft_predictions, mask)
    # refine_summary_loss = loss_function(target_ids[:, :-1, :], refine_predictions)
    loss = draft_summary_loss  # + refine_summary_loss
    validation_loss(loss)
    validation_accuracy(target_ids_[:, 1:], draft_predictions)
    # Summary writing is disabled while only the draft model is evaluated,
    # so return the neutral default scores.
    # if create_summ:
    #     rouge, bert = tf_write_summary(tar_real, draft_predictions, inp[0][:, 1:], epoch)
    # else:
    rouge, bert = (1.0, 1.0)
    return (rouge, bert)

for n, data in enumerate(train_loader):
    im, gt_points, gt_normals = data
    if use_cuda:
        im = im.cuda()
        gt_points = gt_points.cuda()
        gt_normals = gt_normals.cuda()
    # Forward
    graph.reset()
    optimizer.zero_grad()
    pool = FeaturePooling(im)
    pred_points = model_gcn(graph, pool)
    # Loss
    loss = loss_function(pred_points, gt_points.squeeze(), gt_normals.squeeze(), graph)
    # Backward
    loss.backward()
    optimizer.step()
    # Accumulate a Python float so no loss tensors are kept alive between batches
    curr_loss += loss.item()
    # Log
    if (n + 1) % log_step == 0:
        print("Epoch", epoch)
        print("Batch", n + 1)
        print("  Loss:", curr_loss / log_step)
        curr_loss = 0
    # Save

if use_cuda:
    ims = ims.cuda()
    gt_points_list = gt_points_list.cuda()
    gt_normals_list = gt_normals_list.cuda()
# Forward
graph.reset()
optimizer.zero_grad()
pools = [FeaturePooling(ims[i]) for i in range(5)]
pred_points = model_gcn(graph, pools)
# Loss
loss = loss_function(pred_points, gt_points_list[0].squeeze(),
                     gt_normals_list[0].squeeze(), graph)
# Backward
loss.backward()
optimizer.step()
# Accumulate a Python float so no loss tensors are kept alive between batches
curr_loss += loss.item()
# Log
if (n + 1) % log_step == 0:
    print("Epoch", epoch, flush=True)
    print("Batch", n + 1, flush=True)
    print("  Loss:", curr_loss / log_step, flush=True)
    curr_loss = 0
# Save