def train(
        data_layer,
        data_layer_eval,
        model,
        ctc_loss,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        args,
        fn_lr_policy=None):
    """Trains model

    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model (encapsulates data processing, encoder, decoder)
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function (step -> lr); optional
    """
    # NOTE(review): this nested function shadows the builtin `eval`; it is only
    # called locally, so behavior is unaffected, but a rename would be clearer.
    def eval():
        """Evaluates model on evaluation dataset
        """
        # No gradients needed for evaluation — saves memory and time.
        with torch.no_grad():
            # Accumulators shared across all eval minibatches (and, later,
            # aggregated across workers by process_evaluation_epoch).
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                # Move only tensor fields to GPU; pass everything else through.
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors

                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

                # Collect per-batch results for later epoch-level aggregation.
                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)

            # final aggregation (across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)
            print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))

    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    # Resume the optimizer-step counter consistently with the starting epoch.
    step = epoch * args.step_per_epoch

    while True:
        if multi_gpu:
            # Re-seed the distributed sampler so each epoch gets a new shuffle.
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        last_epoch_start = time.time()
        batch_counter = 0   # minibatches accumulated toward the current optimizer step
        average_loss = 0    # loss summed over the accumulation window
        for data in train_dataloader:
            # Move only tensor fields to GPU; pass everything else through.
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)

            # Start of a new accumulation window: adjust lr, clear grads, start timer.
            if batch_counter == 0:
                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()

            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            t_log_probs_t, t_encoded_len_t = model(x=(t_audio_signal_t, t_a_sig_length_t))

            t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
            # Scale the loss so accumulated gradients average over the window.
            if args.gradient_accumulation_steps > 1:
                t_loss_t = t_loss_t / args.gradient_accumulation_steps

            if optim_level in AmpOptimizations:
                # apex AMP path: scale loss to avoid fp16 gradient underflow.
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            average_loss += t_loss_t.item()

            # Apply the optimizer only once per full accumulation window.
            if batch_counter % args.gradient_accumulation_steps == 0:
                optimizer.step()

                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)
                    e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                    # NOTE(review): train_wer is computed for its logging side
                    # effect inside monitor_asr_train_progress; the return value
                    # is otherwise unused here.
                    train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                    print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                    print_once("Step time: {0} seconds".format(time.time() - last_iter_start))

                if step > 0 and step % args.eval_frequency == 0:
                    print_once("Doing Evaluation ....................... ...... ... .. . .")
                    eval()
                step += 1
                batch_counter = 0
                average_loss = 0
                if args.num_steps is not None and step >= args.num_steps:
                    break

        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, optimizer, epoch, output_dir=args.output_dir)
        # Epoch-based termination only applies when no fixed step budget is set.
        if args.num_steps is None and epoch >= args.num_epochs:
            break

    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ...... ... .. . .")
    eval()
    save(model, optimizer, epoch, output_dir=args.output_dir)
def train(
        data_layer,
        model,
        loss_fn,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        data_transforms,
        args,
        evalutaion,
        logger,
        fn_lr_policy):
    """Trains model

    NOTE(review): this redefines the `train` defined earlier in this file;
    only this second definition is visible to callers at import time.

    Args:
        data_layer: training data layer
        model: model (encapsulates data processing, encoder, decoder)
        loss_fn: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        data_transforms: callable turning a raw batch into
            (audio, audio_len, transcript, transcript_len)
        args: script input argument list
        evalutaion: evaluation callback (sic — parameter name kept for
            backward compatibility); currently only referenced in
            commented-out code
        logger: scalar logger with a log_scalar(name, value, step) method
        fn_lr_policy: function returning lr in given step
    """
    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    # Resume the optimizer-step counter consistently with the starting epoch.
    step = epoch * args.step_per_epoch
    # Remember where we started so warmup steps can be excluded from timing.
    start_step = step

    if args.ipex:
        print("is ipex")
        if args.bf16:
            print("is bf16")
            print("running bfloat16 training step\n")
        elif args.fp32:
            print("running fp32 training step\n")

        total_time = 0  # accumulated post-warmup forward+backward time (seconds)
        while True:
            if multi_gpu:
                # Re-seed the distributed sampler so each epoch gets a new shuffle.
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0   # minibatches accumulated toward the current optimizer step
            average_loss = 0    # loss summed over the accumulation window
            for data in tqdm(train_dataloader):
                # Start of a new accumulation window: adjust lr, clear grads, start timer.
                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                if args.profiling and (step - start_step) >= args.warmup:
                    # Profiled path: identical math to the else-branch below,
                    # wrapped in a torch.profiler context that emits
                    # TensorBoard traces into ./log.
                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                        # Time only post-warmup iterations.
                        if (step - start_step) >= args.warmup:
                            t0 = time.perf_counter()
                        if args.bf16:
                            # Autocast runs the forward pass in bfloat16 on CPU.
                            with torch.cpu.amp.autocast():
                                t_log_probs_t, (x_len, y_len) = model(
                                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                                )
                        elif args.fp32:
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                        # Loss is computed in fp32 even when the forward ran in bf16.
                        if args.bf16:
                            t_log_probs_t = t_log_probs_t.to(torch.float32)
                        t_loss_t = loss_fn(
                            (t_log_probs_t, x_len), (t_transcript_t, y_len)
                        )
                        logger.log_scalar('loss', t_loss_t.item(), step)
                        # Free the (large) log-prob tensor before backward.
                        del t_log_probs_t

                        # Scale the loss so accumulated gradients average over the window.
                        if args.gradient_accumulation_steps > 1:
                            t_loss_t = t_loss_t / args.gradient_accumulation_steps
                        if args.cuda and optim_level in AmpOptimizations:
                            # NOTE(review): assert is stripped under -O; an
                            # explicit raise would be more robust.
                            assert False, "not supported in ipex"
                        else:
                            t_loss_t.backward()
                        t1 = time.perf_counter()
                        if (step - start_step) >= args.warmup:
                            total_time += (t1 - t0)
                else:
                    # Unprofiled path: same computation as the profiled branch.
                    if (step - start_step) >= args.warmup:
                        t0 = time.perf_counter()
                    if args.bf16:
                        with torch.cpu.amp.autocast():
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                    elif args.fp32:
                        t_log_probs_t, (x_len, y_len) = model(
                            ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                        )
                    if args.bf16:
                        t_log_probs_t = t_log_probs_t.to(torch.float32)
                    t_loss_t = loss_fn(
                        (t_log_probs_t, x_len), (t_transcript_t, y_len)
                    )
                    logger.log_scalar('loss', t_loss_t.item(), step)
                    del t_log_probs_t

                    if args.gradient_accumulation_steps > 1:
                        t_loss_t = t_loss_t / args.gradient_accumulation_steps
                    if args.cuda and optim_level in AmpOptimizations:
                        assert False, "not supported in ipex"
                    else:
                        t_loss_t.backward()
                    t1 = time.perf_counter()
                    if (step - start_step) >= args.warmup:
                        total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                # Apply the optimizer only once per full accumulation window.
                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        # t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)
                        # e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        # train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        # logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break
            # evalutaion(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            # Epoch-based termination only applies when no fixed step budget is set.
            if args.num_steps is None and epoch >= args.num_epochs:
                break

        if args.profiling:
            # NOTE(review): `prof` is only bound if the profiled branch ran at
            # least once (i.e. some step passed warmup); otherwise this raises
            # NameError — confirm warmup < num_steps in profiling runs.
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))

        print_once("Done in {0}".format(time.time() - start_time))
        # Throughput accounting: exclude warmup iterations from the sample count.
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ...... ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)
    else:
        # Non-IPEX path: same training loop without bf16 autocast, tqdm, or the
        # profiler, and with WER monitoring enabled.
        total_time = 0  # accumulated post-warmup forward+backward time (seconds)
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0
            for data in train_dataloader:
                # Start of a new accumulation window: adjust lr, clear grads, start timer.
                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                # Time only post-warmup iterations.
                if (step - start_step) >= args.warmup:
                    t0 = time.perf_counter()
                t_log_probs_t, (x_len, y_len) = model(
                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                )
                t_loss_t = loss_fn(
                    (t_log_probs_t, x_len), (t_transcript_t, y_len)
                )
                # NOTE(review): prints the raw loss tensor every iteration —
                # likely leftover debugging output.
                print(t_loss_t)
                logger.log_scalar('loss', t_loss_t.item(), step)
                # Free the (large) log-prob tensor before backward.
                del t_log_probs_t

                # Scale the loss so accumulated gradients average over the window.
                if args.gradient_accumulation_steps > 1:
                    t_loss_t = t_loss_t / args.gradient_accumulation_steps
                if args.cuda and optim_level in AmpOptimizations:
                    assert False, "not supported in ipex"
                else:
                    t_loss_t.backward()
                t1 = time.perf_counter()
                if (step - start_step) >= args.warmup:
                    total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()

                # Apply the optimizer only once per full accumulation window.
                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()

                    if (step + 1) % args.train_frequency == 0:
                        t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)
                        e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        logger.log_scalar('wer', train_wer, step)

                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break
            # evalutaion(epoch)

            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break

        print_once("Done in {0}".format(time.time() - start_time))
        # Throughput accounting: exclude warmup iterations from the sample count.
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ...... ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)