def get_pytorch_components_and_onnx(args):
    """Assemble the PyTorch inference components and an optional ONNX export.

    The model description is read from ``args.model_toml``.  The CTC label
    list derived from it is stored in the module-level ``_global_ctc_labels``
    so that later vocabulary lookups can use it.

    Side effects on ``args``: when ``args.wav`` is given, ``args.batch_size``
    is forced to 1 and ``args.seq_len`` is filled in if it was ``None`` or 0.

    Args:
        args: parsed command-line namespace.  The attributes read here are
            ``model_toml``, ``pyt_fp16``, ``max_duration``, ``dataset_dir``,
            ``val_manifest``, ``batch_size``, ``wav``, ``seq_len``,
            ``transpose``, ``ckpt_path``, ``engine_path``, ``make_onnx`` and
            ``onnx_path``.

    Returns:
        A ``(components, onnx_path)`` tuple.  ``components`` is a dict with
        the keys ``'data_layer'``, ``'audio_preprocessor'``,
        ``'acoustic_model'`` and ``'input_wav'`` (a ``(wav, seq_len)`` pair);
        ``onnx_path`` is ``None`` unless ``args.make_onnx`` is set.

    Raises:
        Exception: if ``args.make_onnx`` is set while ``args.onnx_path`` or
            ``args.ckpt_path`` is missing.
    """
    global _global_ctc_labels

    config = toml.load(args.model_toml)
    vocab = config['labels']['labels']
    # Publish the CTC labels globally for future vocab calls.
    _global_ctc_labels = add_ctc_labels(vocab)

    feat_cfg = config['input_eval']
    amp_level = 3 if args.pyt_fp16 else 0
    feat_cfg["optimization_level"] = amp_level

    onnx_path = None

    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration

    # The data layer is only built when a dataset directory was supplied.
    data_layer = None
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=feat_cfg,
            manifest_filepath=args.val_manifest,
            labels=vocab,
            batch_size=args.batch_size,
            shuffle=False,
        )

    # Single-file input: a lone wav forces batch size 1 and, if no sequence
    # length was requested, derives one from the audio length.
    wav = seq_len = None
    if args.wav is not None:
        args.batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            args.seq_len = seq_len / (feat_cfg['sample_rate'] / 100)

    if args.transpose:
        feat_cfg["transpose_out"] = True
        config["transpose_in"] = True

    model = JasperEncoderDecoder(
        jasper_model_definition=config,
        feat_in=1024,
        num_classes=len(get_vocab()),
        transpose_in=args.transpose,
    ).cuda()
    model.eval()

    audio_preprocessor = AudioPreprocessing(**feat_cfg).cuda()
    audio_preprocessor.eval()

    ckpt_path = args.ckpt_path
    if ckpt_path is not None:
        if not os.path.isdir(ckpt_path):
            # Single-file checkpoint: weights live under 'state_dict'.
            full_state = torch.load(ckpt_path, map_location="cpu")
            model.load_state_dict(full_state['state_dict'], strict=False)
        else:
            # Directory checkpoint: separate decoder / encoder files.
            decoder_state = torch.load(ckpt_path + "/decoder.pt",
                                       map_location="cpu")
            encoder_state = torch.load(ckpt_path + "/encoder.pt",
                                       map_location="cpu")
            model.jasper_encoder.load_state_dict(encoder_state, strict=False)
            model.jasper_decoder.load_state_dict(decoder_state, strict=False)

    # If we are to produce an engine (not just run/create ONNX), AMP
    # initialization is postponed until after the ONNX export below
    # (ONNX parser cannot handle mixed FP16 ONNX yet).
    if args.engine_path is None and args.pyt_fp16:
        amp.initialize(models=model, opt_level=AmpOptimizations[amp_level])

    if args.make_onnx:
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(args.onnx_path, model, args)

    # Postponed AMP initialization for the engine-building case.
    if args.pyt_fp16 and args.engine_path is not None:
        amp.initialize(models=model, opt_level=AmpOptimizations[amp_level])

    components = {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': model,
        'input_wav': (wav, seq_len),
    }
    return components, onnx_path
def main(args):
    """Run evaluation of a Jasper encoder/decoder as configured by ``args``.

    Seeds the Python, NumPy and PyTorch RNGs, optionally initializes NCCL
    distributed mode, loads the model description from ``args.model_toml``,
    builds the data layer (skipped when ``args.wav`` is given), the audio
    preprocessor, the encoder/decoder and a greedy CTC decoder, restores
    weights from ``args.ckpt`` and finally hands everything to the
    module-level ``eval`` function.

    Side effect on ``args``: ``args.use_conv_mask`` is set from the model
    description.

    Args:
        args: parsed command-line namespace.  The attributes read here are
            ``seed``, ``cudnn_benchmark``, ``cpu_run``, ``local_rank``,
            ``fp16``, ``model_toml``, ``val_manifest``, ``masked_fill``,
            ``max_duration``, ``pad_to``, ``wav``, ``dataset_dir``,
            ``batch_size``, ``ckpt`` and ``steps``.

    Raises:
        RuntimeError: if a GPU run is requested but CUDA is unavailable.
        SystemExit: if ``args.ckpt`` points to a directory, which this
            function cannot load.
    """
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not args.cpu_run and not torch.cuda.is_available():
        raise RuntimeError(
            "CUDA is not available but cpu_run is not set")

    multi_gpu = args.local_rank is not None
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = 3 if args.fp16 else 0

    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config["fp16"] = args.fp16

    # NOTE(review): the flag is read from key 'convmask' but the override
    # below writes key 'conv_mask' -- confirm which key the encoder consumes.
    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask',
                                                                True)
    if args.masked_fill is not None:
        print("{} masked_fill".format(
            "Enabling" if args.masked_fill else "Disabling"))
        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to
    # NOTE(review): the original indentation was lost; this normalization is
    # applied regardless of args.pad_to so that a "max" value coming from the
    # TOML is converted as well -- confirm against the upstream file.
    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(jasper_model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()

    # No dataset is loaded when a single wav file is evaluated.
    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == -1,
            shuffle=False,
            multi_gpu=multi_gpu)

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=jasper_model_definition,
        feat_in=1024,
        num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        if os.path.isdir(args.ckpt):
            # Previously a bare `exit(0)`: the run ended silently with a
            # success status although nothing had been evaluated.
            raise SystemExit(
                "Checkpoint path is a directory, expected a checkpoint "
                "file: {}".format(args.ckpt))

        checkpoint = torch.load(args.ckpt, map_location="cpu")
        state_dict = checkpoint['state_dict']
        # Preprocessor weights are stored under an "audio_preprocessor."
        # prefix; re-key them so the preprocessor can load them directly.
        for k in audio_preprocessor.state_dict().keys():
            state_dict[k] = state_dict.pop("audio_preprocessor." + k)
        audio_preprocessor.load_state_dict(state_dict, strict=False)
        encoderdecoder.load_state_dict(state_dict, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    if args.wav is None:
        N = len(data_layer)
        # Number of participating processes (1 outside distributed mode).
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))

        print('-----------------')
        if args.steps is not None:
            print('Have {0} examples to eval on.'.format(
                args.steps * args.batch_size * world_size))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
        else:
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
        print('-----------------')

    print("audio_preprocessor.normalize: ",
          audio_preprocessor.featurizer.normalize)

    if not args.cpu_run:
        audio_preprocessor.cuda()
        encoderdecoder.cuda()
    # NOTE(review): assumed to be independent of cpu_run (indentation was
    # lost in the original) -- confirm.
    if args.fp16:
        encoderdecoder = amp.initialize(
            models=encoderdecoder,
            opt_level=AmpOptimizations[optim_level])

    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    # `eval` here must be a function from this module/project that shadows
    # the builtin: the builtin does not accept these keyword arguments.
    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)