def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    multi_gpu = args.local_rank is not None
    if args.cpu:
        assert not multi_gpu
        device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark
        print("CUDNN BENCHMARK ", args.cudnn_benchmark)

    # set up distributed evaluation; the process group must be initialized
    # before the world size can be queried
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = 3 if args.amp else 0

    with open(args.model_cfg) as f:
        model_definition = yaml.load(f, Loader=yaml.FullLoader)

    dataset_vocab = []
    with open(os.path.join(args.dataset_dir, args.vocab), "r",
              encoding="utf-8") as f:
        for line in f:
            token = line.split(' ')[0]
            if token == '':
                dataset_vocab.append(' ')
            else:
                dataset_vocab.append(token)
    dataset_vocab = sorted(dataset_vocab)
    ctc_vocab = add_ctc_labels(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config["fp16"] = args.amp

    args.use_conv_mask = model_definition['encoder'].get('convmask', True)
    if args.use_conv_mask and args.export_model:
        print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
        model_definition['encoder']['convmask'] = False

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to
    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()

    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            task_path=args.task_path,
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == -1,
            shuffle=False,
            multi_gpu=multi_gpu)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    if model_definition["model"] == "Jasper":
        encoderdecoder = JasperEncoderDecoder(
            jasper_model_definition=model_definition,
            feat_in=1024,
            num_classes=len(ctc_vocab))
    else:
        raise ValueError(
            "Unsupported model type: {}".format(model_definition["model"]))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        if os.path.isdir(args.ckpt):
            exit(0)
        else:
            checkpoint = torch.load(args.ckpt, map_location="cpu")
            if args.ema and 'ema_state_dict' in checkpoint:
                print('Loading EMA state dict')
                sd = 'ema_state_dict'
            else:
                sd = 'state_dict'

            # preprocessor weights are stored with an "audio_preprocessor."
            # prefix in the checkpoint; strip it before loading
            for k in audio_preprocessor.state_dict().keys():
                checkpoint[sd][k] = checkpoint[sd].pop("audio_preprocessor." + k)
            audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
            encoderdecoder.load_state_dict(checkpoint[sd], strict=False)

    greedy_decoder = GreedyCTCDecoder()

    if args.wav is None:
        N = len(data_layer)
        world_size = (1 if not torch.distributed.is_initialized()
                      else torch.distributed.get_world_size())
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))

        print('-----------------')
        if args.steps is not None:
            print('Have {0} examples to eval on.'.format(
                args.steps * args.batch_size * world_size))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
        else:
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
        print('-----------------')

    print("audio_preprocessor.normalize: ",
          audio_preprocessor.featurizer.normalize)

    audio_preprocessor.to(device)
    encoderdecoder.to(device)
    if args.amp:
        encoderdecoder = amp.initialize(
            models=encoderdecoder, opt_level='O' + str(optim_level))
    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)

    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        device=device,
        multi_gpu=multi_gpu)
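# Illustrative sketch of what greedy CTC decoding does with the ctc_vocab built
# above: per-frame argmax, then collapse repeated labels and drop the CTC blank
# when converting to text. This is a minimal example, not necessarily this repo's
# GreedyCTCDecoder implementation; the blank is assumed to be the final entry of
# the CTC vocabulary (as appended by add_ctc_labels).
import torch


def greedy_ctc_decode_example(log_probs, labels, blank_index=None):
    """log_probs: (time, num_classes) log-probabilities for a single utterance."""
    if blank_index is None:
        blank_index = len(labels) - 1  # assumption: blank is the last label
    frame_ids = torch.argmax(log_probs, dim=-1).tolist()
    decoded, prev = [], None
    for idx in frame_ids:
        if idx != prev and idx != blank_index:  # collapse repeats, skip blanks
            decoded.append(labels[idx])
        prev = idx
    return ''.join(decoded)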
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = args.cudnn

    # set up distributed training
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
    if args.num_gpus > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()
        print_once("DISTRIBUTED TRAINING with {} gpus".format(
            torch.distributed.get_world_size()))
        multi_gpu = True
    else:
        multi_gpu = False
        print_once("TRAINING WITH 1 gpu")

    # define amp optimization level
    optim_level = 1 if args.amp else 0

    with open(args.model_cfg) as f:
        model_definition = yaml.load(f, Loader=yaml.FullLoader)

    dataset_vocab = []
    with open(os.path.join(args.dataset_dir, args.vocab), "r",
              encoding="utf-8") as f:
        for line in f:
            token = line.split(' ')[0]
            if token == '':
                dataset_vocab.append(' ')
            else:
                dataset_vocab.append(token)
    dataset_vocab = sorted(dataset_vocab)
    ctc_vocab = add_ctc_labels(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)

    if args.pad_to_max:
        assert args.max_duration > 0
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = -1
        featurizer_config_eval['pad_to'] = -1

    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            'Invalid gradient accumulation steps parameter {}'.format(
                args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            'gradient accumulation steps {} do not evenly divide batch size {}'.format(
                args.gradient_accumulation_steps, args.batch_size))

    data_layer = AudioToTextDataLayer(
        task_path=args.task_path,
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type,
        normalize_transcripts=model_definition["input"]["normalize_transcripts"])

    data_layer_eval = AudioToTextDataLayer(
        task_path=args.task_path,
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config_eval,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        normalize_transcripts=model_definition["input_eval"]["normalize_transcripts"])

    if model_definition["model"] == "Jasper":
        model = Jasper(feature_config=featurizer_config,
                       jasper_model_definition=model_definition,
                       feat_in=1024,
                       num_classes=len(ctc_vocab))
    else:
        raise ValueError(
            "Unsupported model type: {}".format(model_definition["model"]))

    ctc_loss = CTCLossNM(num_classes=len(ctc_vocab))
    greedy_decoder = GreedyCTCDecoder()

    print_once("Number of parameters in encoder: {0}".format(
        model.encoder.num_weights()))
    print_once("Number of parameters in decoder: {0}".format(
        model.decoder.num_weights()))

    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(
            N / (args.batch_size * (1 if not torch.distributed.is_initialized()
                                    else torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size)

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

    def fn_lr_policy(s):
        return lr_policy(args.lr, s, args.num_epochs * args.step_per_epoch)

    model.cuda()

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(
            args.optimizer_kind))

    if 0 < optim_level <= 3:
        model, optimizer = amp.initialize(
            min_loss_scale=1.0,
            models=model,
            optimizers=optimizer,
            opt_level='O' + str(optim_level))

    if args.ema > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    model = model_multi_gpu(model, multi_gpu)

    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        if hasattr(model, 'module'):
            model.module.load_state_dict(checkpoint['state_dict'], strict=True)
        else:
            model.load_state_dict(checkpoint['state_dict'], strict=True)

        if args.ema > 0:
            if 'ema_state_dict' in checkpoint:
                if hasattr(ema_model, 'module'):
                    ema_model.module.load_state_dict(checkpoint['ema_state_dict'],
                                                     strict=True)
                else:
                    ema_model.load_state_dict(checkpoint['ema_state_dict'],
                                              strict=True)
            else:
                print_once('WARNING: ema_state_dict not found in the checkpoint')
                print_once('WARNING: initializing EMA model with regular params')
                if hasattr(ema_model, 'module'):
                    ema_model.module.load_state_dict(checkpoint['state_dict'],
                                                     strict=True)
                else:
                    ema_model.load_state_dict(checkpoint['state_dict'],
                                              strict=True)

        optimizer.load_state_dict(checkpoint['optimizer'])
        if optim_level > 0:
            amp.load_state_dict(checkpoint['amp'])
        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    train(data_layer,
          data_layer_eval,
          model,
          ema_model,
          ctc_loss=ctc_loss,
          greedy_decoder=greedy_decoder,
          optimizer=optimizer,
          labels=ctc_vocab,
          optim_level=optim_level,
          multi_gpu=multi_gpu,
          fn_lr_policy=fn_lr_policy if args.lr_decay else None,
          args=args)
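# Illustrative only: a minimal sketch of the model_cfg structure that the training
# and evaluation entry points above read, inferred from the keys they access
# (model, input, input_eval, encoder.convmask, optional perturb/sampler). The
# concrete values, and any extra fields AudioPreprocessing expects, are assumptions
# rather than the repo's shipped configuration.
_EXAMPLE_MODEL_DEFINITION = {
    "model": "Jasper",
    "input": {                           # training-time featurizer settings
        "normalize_transcripts": True,
        "sampler": "default",            # or "bucket"
        "pad_to": 16,
        "max_duration": 16.7,
    },
    "input_eval": {                      # evaluation-time featurizer settings
        "normalize_transcripts": True,
        "pad_to": "max",                 # evaluation paths rewrite "max" to -1
    },
    "encoder": {
        "convmask": True,                # disabled when exporting to TorchScript
    },
    "perturb": None,                     # optional data-perturbation config
}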
if __name__ == "__main__":
    args = parse_args()
    print_dict(vars(args))
    main(args)
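# Example (assumed) multi-GPU launch for the training entry point above. The
# launcher sets the env:// rendezvous variables and passes --local_rank to each
# process; the script name and remaining flags depend on this repo's parse_args
# and are assumptions:
#
#   python -m torch.distributed.launch --nproc_per_node=8 train.py \
#       --num_gpus=8 --model_cfg=<model.yaml> --dataset_dir=<data> ...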
def load_model(self):
    if self.cpu:
        self.device = torch.device('cpu')
    else:
        assert torch.cuda.is_available()
        self.device = torch.device('cuda')

    with open(self.model_cfg) as f:
        model_definition = yaml.load(f, Loader=yaml.FullLoader)

    dataset_vocab = []
    with open(self.vocab, "r", encoding="utf-8") as f:
        for line in f:
            token = line.split(' ')[0]
            if token == '':
                dataset_vocab.append(' ')
            else:
                dataset_vocab.append(token)
    dataset_vocab = sorted(dataset_vocab)
    self.labels = add_ctc_labels(dataset_vocab)

    featurizer_config = model_definition['input_eval']
    featurizer_config["fp16"] = self.amp

    self.use_conv_mask = model_definition['encoder'].get('convmask', True)

    if self.max_duration is not None:
        featurizer_config['max_duration'] = self.max_duration
    if self.pad_to is not None:
        featurizer_config['pad_to'] = self.pad_to
    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()

    self.audio_preprocessor = AudioPreprocessing(**featurizer_config)
    self.encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=model_definition,
        feat_in=1024,
        num_classes=len(self.labels))

    if self.ckpt is not None:
        print("loading model from ", self.ckpt)
        if os.path.isdir(self.ckpt):
            exit(0)
        else:
            checkpoint = torch.load(self.ckpt, map_location="cpu")
            sd = 'state_dict'
            for k in self.audio_preprocessor.state_dict().keys():
                checkpoint[sd][k] = checkpoint[sd].pop("audio_preprocessor." + k)
            self.audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
            self.encoderdecoder.load_state_dict(checkpoint[sd], strict=False)

    print("audio_preprocessor.normalize: ",
          self.audio_preprocessor.featurizer.normalize)

    self.audio_preprocessor.to(self.device)
    self.encoderdecoder.to(self.device)
    self.ctc_decoder = CTCDecoder()

    self.audio_preprocessor.eval()
    self.encoderdecoder.eval()
    self.ctc_decoder.eval()
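# Hypothetical usage sketch for load_model() above. The wrapper class name and its
# constructor are assumptions; only the attributes that load_model() reads
# (cpu, model_cfg, vocab, amp, max_duration, pad_to, ckpt) are taken from the code:
#
#   engine = SpeechToTextEngine(model_cfg="jasper.yaml", vocab="vocab.txt",
#                               ckpt="jasper.pt", cpu=False, amp=False,
#                               max_duration=None, pad_to=None)
#   engine.load_model()   # builds audio_preprocessor, encoderdecoder, ctc_decoder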