import math
import random

import numpy as np
import toml
import torch
import torchvision
from apex import amp

# Project-local helpers (AudioToTextDataLayer, AudioPreprocessing, RNNT,
# RNNTGreedyDecoder, add_blank_label, print_dict, eval, Optimization,
# AmpOptimizations, model_multi_gpu) are assumed to be imported from this
# repo's own modules.


def main(args):
    """Single-process RNN-T greedy-decoding evaluation entry point."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    if args.cuda:
        assert torch.cuda.is_available()

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"

    data_layer = AudioToTextDataLayer(
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size,
        pad_to_max=featurizer_config['pad_to'] == "max",
        shuffle=False,
        sampler='bucket')  # sort by duration
    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab))
    if args.ckpt is not None and args.mode in [3]:
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    audio_preprocessor.featurizer.normalize = "per_feature"
    if args.cuda:
        audio_preprocessor.cuda()
    audio_preprocessor.eval()

    eval_transforms = []
    if args.cuda:
        eval_transforms.append(lambda xs: [xs[0].cuda(), xs[1].cuda(), *xs[2:]])
    eval_transforms.append(lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]])
    # Transpose the feature tensor: BxFxT -> TxBxF.
    eval_transforms.append(lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]])
    eval_transforms = torchvision.transforms.Compose(eval_transforms)

    if args.cuda:
        model.cuda()

    # Ideally this would be jitted as well, but this is just the constructor.
    greedy_decoder = RNNTGreedyDecoder(len(ctc_vocab) - 1, model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args)
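# --- Usage sketch (assumption, not part of the original script) ---
# The flag names below are inferred from the attribute accesses in main()
# above (args.seed, args.cuda, args.model_toml, ...); names, types, and
# defaults are illustrative only.
def build_arg_parser_sketch():
    import argparse
    parser = argparse.ArgumentParser(description="RNN-T greedy-decoding evaluation")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--cudnn_benchmark", action="store_true")
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--model_toml", type=str, required=True)
    parser.add_argument("--val_manifest", type=str, required=True)
    parser.add_argument("--dataset_dir", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--pad_to", type=int, default=None)
    parser.add_argument("--ckpt", type=str, default=None)
    parser.add_argument("--mode", type=int, default=3)
    return parser

# e.g.: main(build_arg_parser_sketch().parse_args())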
def main(args):
    """Evaluation entry point with optional distributed (NCCL) and
    mixed-precision (apex AMP) execution."""
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    assert torch.cuda.is_available()

    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    multi_gpu = args.local_rank is not None
    if multi_gpu:
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to if args.pad_to >= 0 else "max"
    print('model_config')
    print_dict(model_definition)
    print('feature_config')
    print_dict(featurizer_config)

    # The data layer is only needed for manifest-driven evaluation; a single
    # .wav input (args.wav) bypasses it.
    data_layer = None
    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == "max",
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**featurizer_config)

    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab))
    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        model.load_state_dict(checkpoint['state_dict'], strict=False)

    if args.wav is None:
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        N = len(data_layer)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))
        if args.steps is not None:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(
                args.steps * args.batch_size * world_size))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
            print('-----------------')
        else:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
            print('-----------------')
    else:
        audio_preprocessor.featurizer.normalize = "per_feature"
        print("audio_preprocessor.normalize: ",
              audio_preprocessor.featurizer.normalize)

    audio_preprocessor.cuda()
    audio_preprocessor.eval()

    eval_transforms = torchvision.transforms.Compose([
        lambda xs: [x.cuda() for x in xs],
        lambda xs: [*audio_preprocessor(xs[0:2]), *xs[2:]],
        # Transpose the feature tensor: BxFxT -> TxBxF.
        lambda xs: [xs[0].permute(2, 0, 1), *xs[1:]],
    ])

    model.cuda()
    if args.fp16:
        model = amp.initialize(
            models=model,
            opt_level=AmpOptimizations[optim_level])
    model = model_multi_gpu(model, multi_gpu)

    greedy_decoder = RNNTGreedyDecoder(
        len(ctc_vocab) - 1, model.module if multi_gpu else model)

    eval(
        data_layer=data_layer,
        audio_processor=eval_transforms,
        encoderdecoder=model,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)
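# --- Sketch of the assumed model_multi_gpu helper ---
# model_multi_gpu() above is project-local and not shown here. A plausible
# minimal version (an assumption, not the repo's actual implementation) wraps
# the model in DistributedDataParallel when --local_rank was set, i.e. when
# the script is launched via `python -m torch.distributed.launch`; this is
# also why the greedy decoder unwraps model.module in the multi-GPU case.
def model_multi_gpu_sketch(model, multi_gpu):
    # Hypothetical stand-in; only illustrates the expected wrap/unwrap contract.
    if multi_gpu:
        device = torch.cuda.current_device()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[device], output_device=device)
    return model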