def save(model, ema_model, optimizer, epoch, output_dir, optim_level):
    """Write a Jasper training checkpoint to ``output_dir``.

    Only the rank-0 worker writes; all workers synchronise on a barrier
    first when distributed training is initialized.

    Args:
        model: model being trained (possibly DDP-wrapped)
        ema_model: model with exponential averages of weights, or None
        optimizer: optimizer whose state is checkpointed
        epoch: epoch of model training (embedded in the filename)
        output_dir: path to save model checkpoint
        optim_level: AMP optimization level; amp state is saved when > 0
    """
    ckpt_path = os.path.join(output_dir, f"Jasper_epoch{epoch}_checkpoint.pt")
    print_once(f"Saving {ckpt_path}...")

    distributed = torch.distributed.is_initialized()
    if distributed:
        # Make sure every worker reaches this point before rank 0 writes.
        torch.distributed.barrier()
    worker_rank = torch.distributed.get_rank() if distributed else 0

    if worker_rank == 0:
        # Unwrap a DistributedDataParallel model so keys are not 'module.'-prefixed.
        bare_model = getattr(model, 'module', model)
        state = {
            'epoch': epoch,
            'state_dict': bare_model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'amp': amp.state_dict() if optim_level > 0 else None,
        }
        if ema_model is not None:
            bare_ema = getattr(ema_model, 'module', ema_model)
            state['ema_state_dict'] = bare_ema.state_dict()
        torch.save(state, ckpt_path)
    print_once('Saved.')
def eval(model, name=''):
    """Evaluates model on evaluation dataset.

    Runs the preprocessor + model over every batch of the eval data layer,
    accumulates losses/predictions/transcripts, then aggregates and prints
    WER and loss.

    Args:
        model: acoustic model to evaluate (switched to eval mode per batch)
        name: optional tag appended to the printed "Evaluation" label

    NOTE(review): relies on enclosing/module-scope names (data_layer_eval,
    optim_level, amp, audio_preprocessor, jasper_encoder, ctc_loss,
    greedy_decoder, labels, process_evaluation_batch/epoch, print_once).
    """
    with torch.no_grad():
        # Accumulators consumed by process_evaluation_batch/epoch.
        _global_var_dict = {
            'EvalLoss': [],
            'predictions': [],
            'transcripts': [],
        }
        eval_dataloader = data_layer_eval.data_iterator
        for data in eval_dataloader:
            # Move tensor elements to GPU; pass anything else through unchanged.
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
            model.eval()
            if optim_level == 1:
                # Feature extraction is kept in full precision under AMP O1.
                with amp.disable_casts():
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
            else:
                t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                    t_audio_signal_e, t_a_sig_length_e)
            if jasper_encoder.use_conv_mask:
                # Masked path returns (log_probs, encoded lengths).
                t_log_probs_e, t_encoded_len_e = model.forward(
                    (t_processed_signal_e, t_processed_sig_length_e))
            else:
                # NOTE(review): this branch never sets t_encoded_len_e, yet the
                # ctc_loss call below reads it — if use_conv_mask is False on the
                # first batch this raises NameError; confirm intended behavior.
                t_log_probs_e = model.forward(t_processed_signal_e)
            t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
            values_dict = dict(loss=[t_loss_e], predictions=[t_predictions_e], transcript=[t_transcript_e], transcript_length=[t_transcript_len_e])
            process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
        # final aggregation across all workers and minibatches) and logging of results
        wer, eloss = process_evaluation_epoch(_global_var_dict)
        if name != '':
            name = '_' + name
        print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
        print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")
def save(model, optimizer, epoch, output_dir):
    """Persist a model checkpoint named after the model class and timestamp.

    Only rank 0 writes when distributed training is active; non-distributed
    runs always write.

    Args:
        model: model to checkpoint (possibly DDP-wrapped)
        optimizer: optimizer whose state is checkpointed
        epoch: epoch of model training (embedded in the filename)
        output_dir: path to save model checkpoint
    """
    cls_name = model.__class__.__name__
    stamp = time.time()
    ckpt_name = "{0}_{1}-epoch-{2}.pt".format(cls_name, stamp, epoch)
    ckpt_path = os.path.join(output_dir, ckpt_name)
    print_once("Saving module {0} in {1}".format(cls_name, ckpt_path))

    # Write on rank 0 only (or always, when not distributed).
    is_writer = (not torch.distributed.is_initialized()
                 or torch.distributed.get_rank() == 0)
    if is_writer:
        # Strip a DDP wrapper so only the bare model's weights are saved.
        bare_model = model.module if hasattr(model, 'module') else model
        payload = {
            'epoch': epoch,
            'state_dict': bare_model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(payload, ckpt_path)
    print_once('Saved.')
def eval():
    """Evaluate the current model over the whole evaluation data layer.

    Accumulates per-batch losses, predictions, and transcripts, then
    aggregates them into epoch-level WER/loss and prints both.

    Relies on enclosing-scope names: data_layer_eval, model, ctc_loss,
    greedy_decoder, labels, process_evaluation_batch/epoch, print_once.
    """
    with torch.no_grad():
        # Accumulators consumed by process_evaluation_batch/epoch.
        _global_var_dict = {
            'EvalLoss': [],
            'predictions': [],
            'transcripts': [],
        }
        for data in data_layer_eval.data_iterator:
            # Tensors go to GPU; non-tensor entries pass through untouched.
            batch = [d.cuda() if isinstance(d, torch.Tensor) else d for d in data]
            t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = batch

            model.eval()
            t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
            t_loss_e = ctc_loss(
                log_probs=t_log_probs_e,
                targets=t_transcript_e,
                input_length=t_encoded_len_e,
                target_length=t_transcript_len_e,
            )
            t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)

            process_evaluation_batch(
                dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e],
                ),
                _global_var_dict,
                labels=labels,
            )

        # final aggregation across all workers and minibatches) and logging of results
        wer, eloss = process_evaluation_epoch(_global_var_dict)
        print_once("==========>>>>>>Evaluation Loss: {0}\n".format(eloss))
        print_once("==========>>>>>>Evaluation WER: {0}\n".format(wer))
def train(
        data_layer,
        data_layer_eval,
        model,
        ctc_loss,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        args,
        fn_lr_policy=None):
    """Trains model.

    Main CTC training loop with periodic in-loop evaluation, gradient
    accumulation, optional AMP loss scaling, and epoch checkpointing.

    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model (encapsulates data processing, encoder, decoder)
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function (None = fixed lr)
    """
    def eval():
        """Evaluates model on evaluation dataset, printing WER, CER and loss."""
        with torch.no_grad():
            # Accumulators consumed by process_evaluation_batch/epoch2.
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                # Move tensor elements to GPU; pass anything else through.
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
                model.eval()
                t_log_probs_e, t_encoded_len_e = model(x=(t_audio_signal_e, t_a_sig_length_e))
                t_loss_e = ctc_loss(log_probs=t_log_probs_e, targets=t_transcript_e, input_length=t_encoded_len_e, target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                values_dict = dict(loss=[t_loss_e], predictions=[t_predictions_e], transcript=[t_transcript_e], transcript_length=[t_transcript_len_e])
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            # final aggregation across all workers and minibatches) and logging of results
            #lnw modified for cer: process_evaluation_epoch2 additionally returns CER
            #wer, eloss = process_evaluation_epoch(_global_var_dict)
            wer, eloss, cer = process_evaluation_epoch2(_global_var_dict)
            print_once("==========>>>>>>Evaluation Loss: {0}".format(eloss))
            print_once("==========>>>>>>Evaluation WER: {0}".format(wer))
            #lnw add for cer
            print_once("==========>>>>>>Evaluation CER: {0}".format(cer))
            #lnw add
            print("Evaluation end time : "+str(datetime.now()))

    print_once("Starting .....")
    start_time = time.time()
    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    # Global optimizer-step counter, resumed from the starting epoch.
    step = epoch * args.step_per_epoch
    while True:
        if multi_gpu:
            # Reshuffle the distributed sampler each epoch.
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        #lnw add
        lEpochStart_time = datetime.now()
        print("Epoch Start time : "+str(lEpochStart_time))
        last_epoch_start = time.time()
        batch_counter = 0    # micro-batches accumulated toward the next optimizer step
        average_loss = 0     # summed (scaled) loss over the accumulation window
        for data in train_dataloader:
            # Move tensor elements to GPU; pass anything else through.
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)
            if batch_counter == 0:
                # Start of a new accumulation window: adjust lr, clear grads.
                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()
            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            t_log_probs_t, t_encoded_len_t = model(x=(t_audio_signal_t, t_a_sig_length_t))
            t_loss_t = ctc_loss(log_probs=t_log_probs_t, targets=t_transcript_t, input_length=t_encoded_len_t, target_length=t_transcript_len_t)
            if args.gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient matches a full batch.
                t_loss_t = t_loss_t / args.gradient_accumulation_steps
            if optim_level in AmpOptimizations:
                # AMP path: scale the loss to avoid fp16 gradient underflow.
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            average_loss += t_loss_t.item()
            if batch_counter % args.gradient_accumulation_steps == 0:
                # Full accumulation window reached: take an optimizer step.
                optimizer.step()
                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)
                    e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                    #lnw modified for cer: progress monitor that also reports CER
                    #train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                    train_wer,train_cer = monitor_asr_train_progress2(e_tensors, labels=labels)
                    print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                    print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                    #lnw add for print wer cer
                    print_once("==========>>>>>>Train WER: {0}".format(train_wer))
                    print_once("==========>>>>>>Train CER: {0}".format(train_cer))
                if step > 0 and step % args.eval_frequency == 0:
                    print_once("Doing Evaluation ....................... ...... ... .. . .")
                    eval()
                step += 1
                batch_counter = 0
                average_loss = 0
                if args.num_steps is not None and step >= args.num_steps:
                    break
        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, optimizer, epoch, output_dir=args.output_dir)
        if args.num_steps is None and epoch >= args.num_epochs:
            break
        #lnw add
        # NOTE(review): the epoch-end timing print below is skipped on the
        # final epoch because the `break`s above run first — confirm intended.
        lEpochEnd_time = datetime.now()
        # NOTE(review): `lstart_time` is never defined anywhere in this scope
        # (only start_time / lEpochStart_time exist) — this line raises
        # NameError when reached; confirm which variable was intended.
        print("Epoch End time: "+str(lEpochEnd_time),"Duration:",str(lEpochEnd_time - lEpochStart_time),"SratTime-NowTime:",str(lEpochEnd_time - lstart_time))
    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ...... ... .. . .")
    eval()
    save(model, optimizer, epoch, output_dir=args.output_dir)
def main(args):
    """Entry point: build data layers, Jasper model, optimizer, then train.

    Args:
        args: parsed command-line arguments (seed, paths, batch size,
            fp16/AMP flags, optimizer choice, etc.); mutated in place to
            carry start_epoch and step_per_epoch into train().
    """
    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    assert(torch.cuda.is_available())
    torch.backends.cudnn.benchmark = args.cudnn

    # set up distributed training
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    multi_gpu = torch.distributed.is_initialized()
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    # define amp optimiation level
    if args.fp16:
        optim_level = Optimization.mxprO1
    else:
        optim_level = Optimization.mxprO0

    # Model/feature configuration comes from the TOML definition file.
    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    featurizer_config = jasper_model_definition['input']
    featurizer_config_eval = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = jasper_model_definition.get('perturb', None)
    if args.pad_to_max:
        # Fixed-size padding requires a known maximum duration.
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"

    print_once('model_config')
    print_dict(jasper_model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError('gradient accumulation step {} is not divisible by batch size {}'.format(args.gradient_accumulation_steps, args.batch_size))

    # Training data layer: per-step batch is batch_size / accumulation steps.
    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type)
    data_layer_eval = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config_eval,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max
    )

    model = Jasper(feature_config=featurizer_config, jasper_model_definition=jasper_model_definition, feat_in=1024, num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        # Resume: load weights now; optimizer state is restored after amp init.
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    ctc_loss = CTCLossNM( num_classes=len(ctc_vocab))
    greedy_decoder = GreedyCTCDecoder()

    print_once("Number of parameters in encoder: {0}".format(model.jasper_encoder.num_weights()))
    print_once("Number of parameters in decode: {0}".format(model.jasper_decoder.num_weights()))

    # Derive steps-per-epoch from dataset size and sampler type.
    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size )

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    fn_lr_policy = lambda s: lr_policy(args.lr, s, args.num_epochs * args.step_per_epoch)

    model.cuda()

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if optim_level in AmpOptimizations:
        model, optimizer = amp.initialize(
            #lnw block for error
            #min_loss_scale=1.0,
            models=model,
            optimizers=optimizer,
            opt_level=AmpOptimizations[optim_level])
    if args.ckpt is not None:
        # Restore optimizer state only after amp.initialize has wrapped it.
        optimizer.load_state_dict(checkpoint['optimizer'])

    model = model_multi_gpu(model, multi_gpu)

    train(
        data_layer=data_layer,
        data_layer_eval=data_layer_eval,
        model=model,
        ctc_loss=ctc_loss,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy if args.lr_decay else None,
        args=args)
def evalutaion(epoch=0):
    """Evaluate the model on every registered eval dataset due at this epoch.

    Args:
        epoch: current epoch number; a dataset is only evaluated when
            ``epoch % frequency == 0`` for its configured frequency.

    Relies on enclosing-scope names: model, args, eval_datasets,
    data_transforms, loss_fn, greedy_decoder, labels, logger, and the
    process_evaluation_* helpers.
    """
    model.eval()
    if args.ipex:
        if args.bf16:
            print("running bfloat16 evaluation step\n")
        else:
            print("running fp32 evaluation step\n")
    for dataset, frequency, name in eval_datasets:
        # Skip datasets not scheduled for this epoch.
        if epoch % frequency != 0:
            continue
        print_once(f"Doing {name} ....................... ...... ... .. . .")
        with torch.no_grad():
            # Accumulators consumed by process_evaluation_batch/epoch.
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            dataloader = dataset.data_iterator
            for data in dataloader:
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = data_transforms(data)
                if args.ipex:
                    if args.bf16:
                        # BUG FIX: this branch previously read the training-loop
                        # names (t_audio_signal_t, t_transcript_t, ...) which do
                        # not exist here, and bound t_log_probs_t instead of
                        # t_log_probs_e — raising NameError and leaving the loss
                        # input undefined. Use the eval-suffixed tensors.
                        with torch.cpu.amp.autocast():
                            t_log_probs_e, (x_len, y_len) = model(
                                ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                            )
                    elif args.fp32:
                        t_log_probs_e, (x_len, y_len) = model(
                            ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                        )
                else:
                    t_log_probs_e, (x_len, y_len) = model(
                        ((t_audio_signal_e, t_transcript_e), (t_a_sig_length_e, t_transcript_len_e)),
                    )
                t_loss_e = loss_fn(
                    (t_log_probs_e, x_len), (t_transcript_e, y_len)
                )
                print(t_loss_e)
                # Free the (large) log-prob tensor before decoding.
                del t_log_probs_e
                t_predictions_e = greedy_decoder.decode(t_audio_signal_e, t_a_sig_length_e)
                values_dict = dict(
                    loss=[t_loss_e],
                    predictions=[t_predictions_e],
                    transcript=[t_transcript_e],
                    transcript_length=[t_transcript_len_e]
                )
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            # final aggregation across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)
            logger.log_scalar('loss', eloss, epoch, name)
            logger.log_scalar('wer', wer, epoch, name)
            print_once(f"==========>>>>>>{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>{name} WER: {wer}\n")
def main(args):
    """Entry point: build data layers, RNN-T model, optimizer, then train.

    Supports CPU distributed training via the CCL backend (PMI_* env vars)
    and Intel IPEX bf16/fp32 optimization.

    Args:
        args: parsed command-line arguments; mutated in place to carry
            local_rank, rank, world_size, start_epoch and step_per_epoch.
    """
    # Seed all RNG sources for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    args.local_rank = os.environ.get('LOCAL_RANK', args.local_rank)
    # set up distributed training
    cpu_distributed_training = False
    if torch.distributed.is_available() and int(os.environ.get('PMI_SIZE', '0')) > 1:
        print('Distributed training with DDP')
        # Map MPI/PMI environment variables onto torch.distributed's.
        os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
        os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
        if not 'MASTER_ADDR' in os.environ:
            os.environ['MASTER_ADDR'] = args.master_addr
        if not 'MASTER_PORT' in os.environ:
            os.environ['MASTER_PORT'] = args.port

        # Initialize the process group with ccl backend
        if args.backend == 'ccl':
            import torch_ccl
        dist.init_process_group(
                backend=args.backend
        )
        cpu_distributed_training = True
        if torch.distributed.is_initialized():
            print("Torch distributed is initialized.")
            args.rank = torch.distributed.get_rank()
            args.world_size = torch.distributed.get_world_size()
        else:
            print("Torch distributed is not initialized.")
            args.rank = 0
            args.world_size = 1

    # NOTE(review): multi_gpu is hard-coded False here (CPU path), so the
    # DISTRIBUTED TRAINING branch below is currently dead code.
    multi_gpu = False
    if multi_gpu:
        print_once("DISTRIBUTED TRAINING with {} gpus".format(torch.distributed.get_world_size()))

    optim_level = Optimization.mxprO0

    # Model/feature configuration comes from the TOML definition file.
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    tst_manifest = args.tst_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)
    if args.pad_to_max:
        # Fixed-size padding requires a known maximum duration.
        assert(args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = "max"
        featurizer_config_eval['pad_to'] = "max"

    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError('Invalid gradient accumulation steps parameter {}'.format(args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError('gradient accumulation step {} is not divisible by batch size {}'.format(args.gradient_accumulation_steps, args.batch_size))

    preprocessor = preprocessing.AudioPreprocessing(**featurizer_config)
    if args.cuda:
        preprocessor.cuda()
    else:
        preprocessor.cpu()

    augmentations = preprocessing.SpectrogramAugmentation(**featurizer_config)
    if args.cuda:
        augmentations.cuda()
    else:
        augmentations.cpu()

    # Batch transforms applied outside the model: feature extraction,
    # (train-only) spectrogram augmentation, then a permute of the feature
    # tensor for the encoder.
    train_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [augmentations(xs[0]), *xs[1:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])
    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cpu() for x in xs],
        lambda xs: [*preprocessor(xs[0:2]), *xs[2:]],
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    # Training data layer: per-step batch is batch_size / accumulation steps.
    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type,
        cpu_distributed_training=cpu_distributed_training)

    # Eval datasets as (data_layer, eval frequency in epochs, display name).
    eval_datasets = [(
        AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config_eval,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.eval_batch_size,
            multi_gpu=multi_gpu,
            pad_to_max=args.pad_to_max
        ),
        args.eval_frequency,
        'Eval clean',
    )]
    if tst_manifest:
        eval_datasets.append((
            AudioToTextDataLayer(
                dataset_dir=args.dataset_dir,
                featurizer_config=featurizer_config_eval,
                manifest_filepath=tst_manifest,
                labels=dataset_vocab,
                batch_size=args.eval_batch_size,
                multi_gpu=multi_gpu,
                pad_to_max=args.pad_to_max
            ),
            args.test_frequency,
            'Test other',
        ))

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )

    if args.ckpt is not None:
        # Resume: load weights now; optimizer state is restored further below.
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=True)
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    # The blank symbol is the last index in the CTC vocabulary.
    loss_fn = RNNTLoss(blank=len(ctc_vocab) - 1)

    # Derive steps-per-epoch from dataset size and sampler type.
    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(N / (args.batch_size * (1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size )

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    # Compose the lr schedule: constant base, optionally wrapped by decay,
    # optionally wrapped again by warmup.
    constant_lr_policy = lambda _: args.lr
    fn_lr_policy = constant_lr_policy
    if args.lr_decay:
        pre_decay_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_decay(args.num_epochs * args.step_per_epoch, s, pre_decay_policy(s))
    if args.lr_warmup:
        pre_warmup_policy = fn_lr_policy
        fn_lr_policy = lambda s: lr_warmup(args.lr_warmup, s, pre_warmup_policy(s) )

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(args.optimizer_kind))

    if args.cuda and optim_level in AmpOptimizations:
        assert False, "not supported in ipex"
    if args.ckpt is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if args.ipex:
        # IPEX fuses/optimizes the model+optimizer pair and swaps in its LSTM.
        if args.bf16:
            model, optimizer = ipex.optimize(model, dtype=torch.bfloat16, optimizer=optimizer)
            ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)
        else:
            model, optimizer = ipex.optimize(model, dtype=torch.float32, optimizer=optimizer)
            ipex.nn.utils._model_convert.replace_lstm_with_ipex_lstm(model)

    if args.world_size > 1:
        device_ids = None
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=device_ids)

    print_once(model)
    print_once("# parameters: {}".format(sum(p.numel() for p in model.parameters())))

    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model.module if multi_gpu else model)

    if args.tb_path and args.local_rank == 0:
        logger = TensorBoardLogger(args.tb_path, model.module if multi_gpu else model, args.histogram)
    else:
        logger = DummyLogger()

    train(
        data_layer=data_layer,
        model=model,
        loss_fn=loss_fn,
        greedy_decoder=greedy_decoder,
        optimizer=optimizer,
        data_transforms=train_transforms,
        labels=ctc_vocab,
        optim_level=optim_level,
        multi_gpu=multi_gpu,
        fn_lr_policy=fn_lr_policy,
        evalutaion=evaluator(model, eval_transforms, loss_fn, greedy_decoder, ctc_vocab, eval_datasets, logger),
        logger=logger,
        args=args)
def train(
        data_layer,
        model,
        loss_fn,
        greedy_decoder,
        optimizer,
        optim_level,
        labels,
        multi_gpu,
        data_transforms,
        args,
        evalutaion,
        logger,
        fn_lr_policy):
    """Trains model.

    RNN-T training loop with two near-identical paths: an IPEX path
    (optionally under torch.profiler) and a plain CPU path. Both time the
    post-warmup steps to report throughput at the end.

    Args:
        data_layer: training data layer
        model: model (encapsulates data processing, encoder, decoder)
        loss_fn: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        data_transforms: callable applied to each raw batch
        args: script input argument list
        evalutaion: per-epoch evaluation callback (name sic)
        logger: scalar logger (TensorBoard or dummy)
        fn_lr_policy: function returning lr in given step
    """
    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    step = epoch * args.step_per_epoch
    # Remember the resume point so warmup is measured relative to it.
    start_step = step

    if args.ipex:
        print("is ipex")
        if args.bf16:
            print("is bf16")
            print("running bfloat16 training step\n")
        elif args.fp32:
            print("running fp32 training step\n")
        total_time = 0  # accumulated post-warmup compute time (seconds)
        while True:
            if multi_gpu:
                # Reshuffle the distributed sampler each epoch.
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0    # micro-batches toward the next optimizer step
            average_loss = 0     # summed (scaled) loss over the window
            for data in tqdm(train_dataloader):
                if batch_counter == 0:
                    # New accumulation window: adjust lr and clear grads.
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                if args.profiling and (step - start_step) >= args.warmup:
                    # Profiled variant of the step (post-warmup only).
                    with torch.profiler.profile(on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')) as prof:
                        if (step - start_step) >= args.warmup:
                            t0 = time.perf_counter()
                        if args.bf16:
                            with torch.cpu.amp.autocast():
                                t_log_probs_t, (x_len, y_len) = model(
                                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                                )
                        elif args.fp32:
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                        if args.bf16:
                            # Loss is computed in fp32 even under bf16 autocast.
                            t_log_probs_t = t_log_probs_t.to(torch.float32)
                        t_loss_t = loss_fn(
                            (t_log_probs_t, x_len), (t_transcript_t, y_len)
                        )
                        logger.log_scalar('loss', t_loss_t.item(), step)
                        # Free the (large) log-prob tensor before backward.
                        del t_log_probs_t
                        if args.gradient_accumulation_steps > 1:
                            # Scale so the accumulated grad matches a full batch.
                            t_loss_t = t_loss_t / args.gradient_accumulation_steps
                        if args.cuda and optim_level in AmpOptimizations:
                            assert False, "not supported in ipex"
                        else:
                            t_loss_t.backward()
                        t1 = time.perf_counter()
                        if (step - start_step) >= args.warmup:
                            total_time += (t1 - t0)
                else:
                    # Unprofiled variant — same computation as above.
                    if (step - start_step) >= args.warmup:
                        t0 = time.perf_counter()
                    if args.bf16:
                        with torch.cpu.amp.autocast():
                            t_log_probs_t, (x_len, y_len) = model(
                                ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                            )
                    elif args.fp32:
                        t_log_probs_t, (x_len, y_len) = model(
                            ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                        )
                    if args.bf16:
                        # Loss is computed in fp32 even under bf16 autocast.
                        t_log_probs_t = t_log_probs_t.to(torch.float32)
                    t_loss_t = loss_fn(
                        (t_log_probs_t, x_len), (t_transcript_t, y_len)
                    )
                    logger.log_scalar('loss', t_loss_t.item(), step)
                    del t_log_probs_t
                    if args.gradient_accumulation_steps > 1:
                        t_loss_t = t_loss_t / args.gradient_accumulation_steps
                    if args.cuda and optim_level in AmpOptimizations:
                        assert False, "not supported in ipex"
                    else:
                        t_loss_t.backward()
                    t1 = time.perf_counter()
                    if (step - start_step) >= args.warmup:
                        total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()
                if batch_counter % args.gradient_accumulation_steps == 0:
                    # Full accumulation window reached: take an optimizer step.
                    optimizer.step()
                    if (step + 1) % args.train_frequency == 0:
                        # WER monitoring disabled in this path (kept for reference).
                        # t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)
                        # e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        # train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        # logger.log_scalar('wer', train_wer, step)
                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break
            # evalutaion(epoch)
            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        if args.profiling:
            # NOTE(review): `prof` is only bound when the profiled branch ran
            # at least once (i.e. a post-warmup step executed); otherwise this
            # raises NameError — confirm warmup < num_steps is guaranteed.
            print(prof.key_averages().table(sort_by="self_cpu_time_total"))
        print_once("Done in {0}".format(time.time() - start_time))
        # Throughput over the timed (post-warmup) region.
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ...... ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)
    else:
        # Plain (non-IPEX) CPU training path — same loop without autocast
        # or profiling, and with live WER monitoring enabled.
        total_time = 0
        while True:
            if multi_gpu:
                data_layer.sampler.set_epoch(epoch)
            print_once("Starting epoch {0}, step {1}".format(epoch, step))
            last_epoch_start = time.time()
            batch_counter = 0
            average_loss = 0
            for data in train_dataloader:
                if batch_counter == 0:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                    optimizer.zero_grad()
                    last_iter_start = time.time()

                t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = data_transforms(data)
                model.train()

                if (step - start_step) >= args.warmup:
                    t0 = time.perf_counter()
                t_log_probs_t, (x_len, y_len) = model(
                    ((t_audio_signal_t, t_transcript_t), (t_a_sig_length_t, t_transcript_len_t)),
                )
                t_loss_t = loss_fn(
                    (t_log_probs_t, x_len), (t_transcript_t, y_len)
                )
                print(t_loss_t)
                logger.log_scalar('loss', t_loss_t.item(), step)
                del t_log_probs_t
                if args.gradient_accumulation_steps > 1:
                    t_loss_t = t_loss_t / args.gradient_accumulation_steps
                if args.cuda and optim_level in AmpOptimizations:
                    assert False, "not supported in ipex"
                else:
                    t_loss_t.backward()
                t1 = time.perf_counter()
                if (step - start_step) >= args.warmup:
                    total_time += (t1 - t0)

                batch_counter += 1
                average_loss += t_loss_t.item()
                if batch_counter % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    if (step + 1) % args.train_frequency == 0:
                        t_predictions_t = greedy_decoder.decode(t_audio_signal_t, t_a_sig_length_t)
                        e_tensors = [t_predictions_t, t_transcript_t, t_transcript_len_t]
                        train_wer = monitor_asr_train_progress(e_tensors, labels=labels)
                        print_once("Loss@Step: {0} ::::::: {1}".format(step, str(average_loss)))
                        print_once("Step time: {0} seconds".format(time.time() - last_iter_start))
                        logger.log_scalar('wer', train_wer, step)
                    step += 1
                    batch_counter = 0
                    average_loss = 0
                    if args.num_steps is not None and step >= args.num_steps:
                        break
            # evalutaion(epoch)
            if args.num_steps is not None and step >= args.num_steps:
                break
            print_once("Finished epoch {0} in {1}".format(epoch, time.time() - last_epoch_start))
            epoch += 1
            if epoch % args.save_frequency == 0 and epoch > 0:
                save(model, optimizer, epoch, output_dir=args.output_dir)
            if args.num_steps is None and epoch >= args.num_epochs:
                break
        print_once("Done in {0}".format(time.time() - start_time))
        # Throughput over the timed (post-warmup) region.
        if args.num_steps is not None:
            total_samples = (args.num_steps - args.warmup - start_step) * args.batch_size
        else:
            total_samples = len(data_layer) * (args.num_epochs - args.start_epoch) - args.warmup * args.batch_size
        print("total samples tested: ", total_samples)
        print("Model training time:", total_time, "s")
        perf = total_samples / total_time
        print("Throughput: {:.3f} fps".format(perf))
        # print_once("Final Evaluation ....................... ...... ... .. . .")
        # evalutaion()
        save(model, optimizer, epoch, output_dir=args.output_dir)
def train(data_layer, data_layer_eval, model, ema_model, ctc_loss, greedy_decoder, optimizer, optim_level, labels, multi_gpu, args, fn_lr_policy=None):
    """Trains model

    Runs the full training loop: per-step LR adjustment, AMP-aware
    forward/backward, gradient accumulation, periodic WER logging,
    periodic evaluation (including the EMA model when enabled), and
    periodic checkpointing via `save`.

    Args:
        data_layer: training data layer
        data_layer_eval: evaluation data layer
        model: model ( encapsulates data processing, encoder, decoder)
        ema_model: model with exponential averages of weights (evaluated when args.ema > 0)
        ctc_loss: loss function
        greedy_decoder: greedy ctc decoder
        optimizer: optimizer
        optim_level: AMP optimization level
        labels: list of output labels
        multi_gpu: true if multi gpu training
        args: script input argument list
        fn_lr_policy: learning rate adjustment function
    """
    # NOTE(review): this nested helper shadows the `eval` builtin; rename
    # (e.g. `evaluate`) in a future behavior-neutral refactor.
    def eval(model, name=''):
        """Evaluates model on evaluation dataset
        """
        # No gradients needed during evaluation.
        with torch.no_grad():
            # Accumulators consumed by process_evaluation_batch/epoch.
            _global_var_dict = {
                'EvalLoss': [],
                'predictions': [],
                'transcripts': [],
            }
            eval_dataloader = data_layer_eval.data_iterator
            for data in eval_dataloader:
                # Move tensor entries to GPU; pass non-tensors through unchanged.
                tensors = []
                for d in data:
                    if isinstance(d, torch.Tensor):
                        tensors.append(d.cuda())
                    else:
                        tensors.append(d)
                # assumes each eval batch unpacks as (audio, audio_len, transcript, transcript_len) — TODO confirm against data layer
                t_audio_signal_e, t_a_sig_length_e, t_transcript_e, t_transcript_len_e = tensors
                model.eval()
                if optim_level == 1:
                    # Under AMP O1, presumably keeps feature extraction in fp32 — verify against apex.amp docs.
                    with amp.disable_casts():
                        t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                            t_audio_signal_e, t_a_sig_length_e)
                else:
                    t_processed_signal_e, t_processed_sig_length_e = audio_preprocessor(
                        t_audio_signal_e, t_a_sig_length_e)
                if jasper_encoder.use_conv_mask:
                    # Masked path returns (log_probs, encoded_lengths).
                    t_log_probs_e, t_encoded_len_e = model.forward(
                        (t_processed_signal_e, t_processed_sig_length_e))
                else:
                    # NOTE(review): this branch never assigns t_encoded_len_e, yet it is
                    # used in ctc_loss below -> NameError when use_conv_mask is False.
                    # Likely only the masked path is exercised in practice — confirm.
                    t_log_probs_e = model.forward(t_processed_signal_e)
                t_loss_e = ctc_loss(log_probs=t_log_probs_e,
                                    targets=t_transcript_e,
                                    input_length=t_encoded_len_e,
                                    target_length=t_transcript_len_e)
                t_predictions_e = greedy_decoder(log_probs=t_log_probs_e)
                values_dict = dict(loss=[t_loss_e],
                                   predictions=[t_predictions_e],
                                   transcript=[t_transcript_e],
                                   transcript_length=[t_transcript_len_e])
                process_evaluation_batch(values_dict, _global_var_dict, labels=labels)
            # final aggregation across all workers and minibatches) and logging of results
            wer, eloss = process_evaluation_epoch(_global_var_dict)
            if name != '':
                name = '_' + name
            print_once(f"==========>>>>>>Evaluation{name} Loss: {eloss}\n")
            print_once(f"==========>>>>>>Evaluation{name} WER: {wer}\n")

    print_once("Starting .....")
    start_time = time.time()

    train_dataloader = data_layer.data_iterator
    epoch = args.start_epoch
    # Resume the global step counter from the starting epoch.
    step = epoch * args.step_per_epoch

    # Unwrap DistributedDataParallel (`.module`) if present so submodules are
    # reachable; these are captured by the nested eval() closure above.
    audio_preprocessor = model.module.audio_preprocessor if hasattr(
        model, 'module') else model.audio_preprocessor
    data_spectr_augmentation = model.module.data_spectr_augmentation if hasattr(
        model, 'module') else model.data_spectr_augmentation
    jasper_encoder = model.module.jasper_encoder if hasattr(
        model, 'module') else model.jasper_encoder

    while True:
        if multi_gpu:
            # Reshuffle the distributed sampler each epoch.
            data_layer.sampler.set_epoch(epoch)
        print_once("Starting epoch {0}, step {1}".format(epoch, step))
        last_epoch_start = time.time()
        batch_counter = 0
        average_loss = 0
        for data in train_dataloader:
            # Move tensor entries to GPU; pass non-tensors through unchanged.
            tensors = []
            for d in data:
                if isinstance(d, torch.Tensor):
                    tensors.append(d.cuda())
                else:
                    tensors.append(d)

            # First micro-batch of an accumulation window: set LR, clear grads.
            if batch_counter == 0:
                if fn_lr_policy is not None:
                    adjusted_lr = fn_lr_policy(step)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = adjusted_lr
                optimizer.zero_grad()
                last_iter_start = time.time()

            t_audio_signal_t, t_a_sig_length_t, t_transcript_t, t_transcript_len_t = tensors
            model.train()
            if optim_level == 1:
                # Under AMP O1, presumably keeps feature extraction in fp32 — verify against apex.amp docs.
                with amp.disable_casts():
                    t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(
                        t_audio_signal_t, t_a_sig_length_t)
            else:
                t_processed_signal_t, t_processed_sig_length_t = audio_preprocessor(
                    t_audio_signal_t, t_a_sig_length_t)
            # SpecAugment-style augmentation applied only during training.
            t_processed_signal_t = data_spectr_augmentation(
                t_processed_signal_t)
            if jasper_encoder.use_conv_mask:
                t_log_probs_t, t_encoded_len_t = model.forward(
                    (t_processed_signal_t, t_processed_sig_length_t))
            else:
                # NOTE(review): t_encoded_len_t is not assigned on this branch but is
                # used in ctc_loss below -> NameError when use_conv_mask is False.
                t_log_probs_t = model.forward(t_processed_signal_t)
            t_loss_t = ctc_loss(log_probs=t_log_probs_t,
                                targets=t_transcript_t,
                                input_length=t_encoded_len_t,
                                target_length=t_transcript_len_t)
            if args.gradient_accumulation_steps > 1:
                # Scale so the accumulated gradient matches one full batch.
                t_loss_t = t_loss_t / args.gradient_accumulation_steps
            if 0 < optim_level <= 3:
                # AMP loss scaling for mixed-precision backward.
                with amp.scale_loss(t_loss_t, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                t_loss_t.backward()
            batch_counter += 1
            # NOTE(review): despite the name, this is a running *sum* of the
            # (already accumulation-scaled) per-micro-batch losses; it is
            # printed without dividing by the number of micro-batches.
            average_loss += t_loss_t.item()

            # End of an accumulation window: apply the optimizer update.
            if batch_counter % args.gradient_accumulation_steps == 0:
                optimizer.step()

                if step % args.train_frequency == 0:
                    t_predictions_t = greedy_decoder(log_probs=t_log_probs_t)
                    e_tensors = [
                        t_predictions_t, t_transcript_t, t_transcript_len_t
                    ]
                    train_wer = monitor_asr_train_progress(e_tensors,
                                                           labels=labels)
                    print_once("Loss@Step: {0} ::::::: {1}".format(
                        step, str(average_loss)))
                    print_once(
                        "Step time: {0} seconds".format(time.time() -
                                                        last_iter_start))
                if step > 0 and step % args.eval_frequency == 0:
                    print_once(
                        "Doing Evaluation ....................... ...... ... .. . ."
                    )
                    eval(model)
                    if args.ema > 0:
                        eval(ema_model, 'EMA')
                step += 1
                batch_counter = 0
                average_loss = 0
            if args.num_steps is not None and step >= args.num_steps:
                break
        if args.num_steps is not None and step >= args.num_steps:
            break
        print_once("Finished epoch {0} in {1}".format(
            epoch, time.time() - last_epoch_start))
        epoch += 1
        if epoch % args.save_frequency == 0 and epoch > 0:
            save(model, ema_model, optimizer, epoch, args.output_dir,
                 optim_level)
        if args.num_steps is None and epoch >= args.num_epochs:
            break
    print_once("Done in {0}".format(time.time() - start_time))
    print_once("Final Evaluation ....................... ...... ... .. . .")
    eval(model)
    if args.ema > 0:
        eval(ema_model, 'EMA')
    save(model, ema_model, optimizer, epoch, args.output_dir, optim_level)