def get_pytorch_components_and_onnx(args):
    '''Build the PyTorch components used for inference and optionally export ONNX.

    Loads the model definition from ``args.model_toml``, registers the CTC
    label set in the module-level ``_global_ctc_labels``, builds the optional
    data layer, the audio preprocessor and the Jasper acoustic model (both
    moved to CUDA and put in eval mode), restores weights from
    ``args.ckpt_path`` when given, and exports ONNX when ``args.make_onnx``
    is set.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``model_toml``, ``pyt_fp16``, ``max_duration``, ``dataset_dir``,
            ``val_manifest``, ``batch_size``, ``wav``, ``seq_len``,
            ``transpose``, ``ckpt_path``, ``engine_path``, ``make_onnx`` and
            ``onnx_path``. When ``args.wav`` is given, ``args.batch_size`` is
            overwritten with 1 and ``args.seq_len`` is filled in if it was
            ``None`` or 0.

    Returns:
        A ``(components, onnx_path)`` tuple. ``components`` is a dict with the
        keys ``'data_layer'`` (``None`` unless ``args.dataset_dir`` is set),
        ``'audio_preprocessor'``, ``'acoustic_model'`` and ``'input_wav'``
        (a ``(wav, seq_len)`` pair, both ``None`` unless ``args.wav`` is set).
        ``onnx_path`` is ``None`` unless ``args.make_onnx`` is set.

    Raises:
        Exception: if ``args.make_onnx`` is set without both
            ``args.ckpt_path`` and ``args.onnx_path``.
    '''
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    # Set up global labels for future vocab calls
    global _global_ctc_labels
    _global_ctc_labels = add_ctc_labels(dataset_vocab)
    featurizer_config = model_definition['input_eval']

    optim_level = 3 if args.pyt_fp16 else 0
    featurizer_config["optimization_level"] = optim_level

    onnx_path = None
    data_layer = None
    wav = None
    seq_len = None

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.dataset_dir is not None:
        data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                          featurizer_config=featurizer_config,
                                          manifest_filepath=args.val_manifest,
                                          labels=dataset_vocab,
                                          batch_size=args.batch_size,
                                          shuffle=False)
    if args.wav is not None:
        # A single wav file is processed as a batch of one.
        args.batch_size = 1
        wav, seq_len = audio_from_file(args.wav)
        if args.seq_len is None or args.seq_len == 0:
            # NOTE(review): presumably converts a sample count into a number
            # of 10 ms frames; the result is not rounded -- confirm callers
            # accept a non-integer value.
            args.seq_len = seq_len / (featurizer_config['sample_rate'] / 100)

    if args.transpose:
        featurizer_config["transpose_out"] = True
        model_definition["transpose_in"] = True

    model = JasperEncoderDecoder(jasper_model_definition=model_definition,
                                 feat_in=1024,
                                 num_classes=len(get_vocab()),
                                 transpose_in=args.transpose)
    model = model.cuda()
    model.eval()

    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    audio_preprocessor = audio_preprocessor.cuda()
    audio_preprocessor.eval()

    if args.ckpt_path is not None:
        if os.path.isdir(args.ckpt_path):
            # Directory checkpoint: encoder and decoder weights live in
            # separate files inside it.
            d_checkpoint = torch.load(
                os.path.join(args.ckpt_path, "decoder.pt"),
                map_location="cpu")
            e_checkpoint = torch.load(
                os.path.join(args.ckpt_path, "encoder.pt"),
                map_location="cpu")
            model.jasper_encoder.load_state_dict(e_checkpoint, strict=False)
            model.jasper_decoder.load_state_dict(d_checkpoint, strict=False)
        else:
            checkpoint = torch.load(args.ckpt_path, map_location="cpu")
            model.load_state_dict(checkpoint['state_dict'], strict=False)

    # if we are to produce engine, not run/create ONNX, postpone AMP initialization
    # (ONNX parser cannot handle mixed FP16 ONNX yet)
    if args.pyt_fp16 and args.engine_path is None:
        # Keep the model returned by AMP, as the other entry points do.
        model = amp.initialize(models=model,
                               opt_level=AmpOptimizations[optim_level])

    if args.make_onnx:
        if args.onnx_path is None or args.ckpt_path is None:
            raise Exception(
                "--ckpt_path, --onnx_path must be provided when using --make_onnx"
            )
        onnx_path = get_onnx(args.onnx_path, model, args)

    # Postponed AMP initialization for the engine path (see comment above).
    if args.pyt_fp16 and args.engine_path is not None:
        model = amp.initialize(models=model,
                               opt_level=AmpOptimizations[optim_level])

    return {
        'data_layer': data_layer,
        'audio_preprocessor': audio_preprocessor,
        'acoustic_model': model,
        'input_wav': (wav, seq_len)
    }, onnx_path
def main(args):
    '''Set up and run Jasper evaluation, optionally distributed over several GPUs.

    Seeds the RNGs, optionally initialises ``torch.distributed`` (when
    ``args.local_rank`` is given), loads the model definition from
    ``args.model_toml``, builds the data layer (skipped when ``args.wav`` is
    given), the audio preprocessor, the encoder/decoder and the greedy CTC
    decoder, restores weights from ``args.ckpt`` and hands everything to the
    project's ``eval`` routine.

    Args:
        args: parsed command-line namespace. Attributes read here: ``seed``,
            ``cudnn_benchmark``, ``cpu_run``, ``local_rank``, ``fp16``,
            ``model_toml``, ``val_manifest``, ``masked_fill``,
            ``max_duration``, ``pad_to``, ``wav``, ``dataset_dir``,
            ``batch_size``, ``ckpt`` and ``steps``. ``args.use_conv_mask`` is
            set as a side effect.

    Raises:
        SystemExit: if ``args.ckpt`` points to a directory, which this entry
            point does not know how to load.
        AssertionError: if CUDA is required (``args.cpu_run`` is false) but
            not available.
    '''
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    if not args.cpu_run:
        assert(torch.cuda.is_available())

    # A local rank is only supplied when launched in distributed mode.
    multi_gpu = args.local_rank is not None
    if multi_gpu:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        print("DISTRIBUTED with ", torch.distributed.get_world_size())

    optim_level = 3 if args.fp16 else 0

    jasper_model_definition = toml.load(args.model_toml)
    dataset_vocab = jasper_model_definition['labels']['labels']
    ctc_vocab = add_ctc_labels(dataset_vocab)

    val_manifest = args.val_manifest
    featurizer_config = jasper_model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config["fp16"] = args.fp16
    args.use_conv_mask = jasper_model_definition['encoder'].get('convmask', True)

    if args.masked_fill is not None:
        print("{} masked_fill".format("Enabling" if args.masked_fill else "Disabling"))
        # NOTE(review): the flag above is read from the 'convmask' key while
        # this override is written to 'conv_mask', and args.use_conv_mask is
        # not refreshed afterwards -- confirm which key the encoder reads.
        jasper_model_definition["encoder"]["conv_mask"] = args.masked_fill

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to

    # "max" is normalised to the integer sentinel -1.
    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(jasper_model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()
    data_layer = None

    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == -1,
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**featurizer_config)
    encoderdecoder = JasperEncoderDecoder(
        jasper_model_definition=jasper_model_definition,
        feat_in=1024,
        num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)

        if os.path.isdir(args.ckpt):
            # Directory checkpoints are not handled here. Fail loudly with a
            # non-zero status instead of exiting silently with success.
            raise SystemExit(
                "ERROR: checkpoint path '{}' is a directory; "
                "expected a checkpoint file".format(args.ckpt))

        checkpoint = torch.load(args.ckpt, map_location="cpu")
        state_dict = checkpoint['state_dict']
        # Preprocessor weights are stored under an "audio_preprocessor."
        # prefix; strip it so the same dict can be loaded into both modules.
        for k in audio_preprocessor.state_dict().keys():
            state_dict[k] = state_dict.pop("audio_preprocessor." + k)
        audio_preprocessor.load_state_dict(state_dict, strict=False)
        encoderdecoder.load_state_dict(state_dict, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    if args.wav is None:
        N = len(data_layer)
        # Each process handles its own share of the data when distributed.
        world_size = (torch.distributed.get_world_size()
                      if torch.distributed.is_initialized() else 1)
        step_per_epoch = math.ceil(N / (args.batch_size * world_size))

        if args.steps is not None:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(args.steps * args.batch_size * world_size))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
            print('-----------------')
        else:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
            print('-----------------')

    print ("audio_preprocessor.normalize: ", audio_preprocessor.featurizer.normalize)
    if not args.cpu_run:
        audio_preprocessor.cuda()
        encoderdecoder.cuda()
    if args.fp16:
        encoderdecoder = amp.initialize(models=encoderdecoder,
                                        opt_level=AmpOptimizations[optim_level])

    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    # `eval` is the project's evaluation routine (it takes keyword arguments
    # the builtin does not accept), not the Python builtin.
    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        multi_gpu=multi_gpu)
# Example #3
# 0
def main(args):
    '''Set up and run Jasper evaluation on a single CUDA device.

    Seeds the RNGs, loads the model definition from ``args.model_toml``,
    builds the validation data layer, the audio preprocessor, the
    encoder/decoder and the greedy CTC decoder, optionally restores weights
    from ``args.ckpt``, moves the modules to CUDA and hands everything to the
    project's ``eval`` routine.

    Args:
        args: parsed command-line namespace. Attributes read here: ``seed``,
            ``cudnn_benchmark``, ``steps``, ``fp16``, ``batch_size``,
            ``model_toml``, ``val_manifest``, ``max_duration``, ``pad_to``,
            ``dataset_dir`` and ``ckpt``.

    Raises:
        AssertionError: if ``args.steps`` is set but not greater than 5, or
            if CUDA is not available.
    '''
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.benchmark = args.cudnn_benchmark
    assert args.steps is None or args.steps > 5
    print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    assert torch.cuda.is_available()

    optim_level = Optimization.mxprO3 if args.fp16 else Optimization.mxprO0

    model_cfg = toml.load(args.model_toml)
    vocab = model_cfg['labels']['labels']
    ctc_vocab = add_ctc_labels(vocab)

    feat_cfg = model_cfg['input_eval']
    feat_cfg["optimization_level"] = optim_level
    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration
    if args.pad_to is not None:
        # A negative value on the command line selects padding to "max".
        if args.pad_to >= 0:
            feat_cfg['pad_to'] = args.pad_to
        else:
            feat_cfg['pad_to'] = "max"

    print('model_config')
    print_dict(model_cfg)
    print('feature_config')
    print_dict(feat_cfg)

    pad_to_max = feat_cfg['pad_to'] == "max"
    data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                      featurizer_config=feat_cfg,
                                      manifest_filepath=args.val_manifest,
                                      labels=vocab,
                                      batch_size=args.batch_size,
                                      pad_to_max=pad_to_max,
                                      shuffle=False,
                                      multi_gpu=False)

    audio_preprocessor = AudioPreprocessing(**feat_cfg)
    encoderdecoder = JasperEncoderDecoder(jasper_model_definition=model_cfg,
                                          feat_in=1024,
                                          num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        weights = torch.load(args.ckpt, map_location="cpu")['state_dict']
        # Preprocessor entries carry an "audio_preprocessor." prefix in the
        # checkpoint; re-key them so one dict serves both modules.
        for name in audio_preprocessor.state_dict():
            weights[name] = weights.pop("audio_preprocessor." + name)
        audio_preprocessor.load_state_dict(weights, strict=False)
        encoderdecoder.load_state_dict(weights, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    num_examples = len(data_layer)
    steps_per_epoch = math.ceil(num_examples / args.batch_size)

    print('-----------------')
    if args.steps is not None:
        # An explicit step count overrides the dataset-derived numbers.
        num_examples = args.steps * args.batch_size
        steps_per_epoch = args.steps
    print('Have {0} examples to eval on.'.format(num_examples))
    print('Have {0} steps / (gpu * epoch).'.format(steps_per_epoch))
    print('-----------------')

    audio_preprocessor.cuda()
    encoderdecoder.cuda()
    if args.fp16:
        encoderdecoder = amp.initialize(
            models=encoderdecoder, opt_level=AmpOptimizations[optim_level])

    # `eval` is the project's evaluation routine, not the Python builtin.
    eval(data_layer=data_layer,
         audio_processor=audio_preprocessor,
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         args=args)
def main(args):
    '''Set up and run Jasper evaluation on CPU or a single CUDA device.

    Seeds the RNGs, selects the device from ``args.cpu``, loads the model
    definition from ``args.model_toml``, builds the validation data layer,
    the audio preprocessor, the encoder/decoder and the greedy CTC decoder,
    optionally restores weights from ``args.ckpt``, moves the modules to the
    device and hands everything to the project's ``eval`` routine.

    Args:
        args: parsed command-line namespace. Attributes read here: ``seed``,
            ``steps``, ``cpu``, ``cudnn_benchmark``, ``amp``, ``batch_size``,
            ``model_toml``, ``val_manifest``, ``max_duration``, ``pad_to``,
            ``torch_script``, ``dataset_dir`` and ``ckpt``.
            ``args.use_conv_mask`` is set as a side effect.

    Raises:
        AssertionError: if ``args.steps`` is set but not greater than 5, or
            if CUDA is requested but not available.
    '''
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    assert args.steps is None or args.steps > 5

    if not args.cpu:
        assert torch.cuda.is_available()
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark
        print("CUDNN BENCHMARK ", args.cudnn_benchmark)
    else:
        device = torch.device('cpu')

    if args.amp:
        optim_level = 3
    else:
        optim_level = 0

    model_cfg = toml.load(args.model_toml)
    vocab = model_cfg['labels']['labels']
    ctc_vocab = add_ctc_labels(vocab)

    feat_cfg = model_cfg['input_eval']
    feat_cfg["optimization_level"] = optim_level

    if args.max_duration is not None:
        feat_cfg['max_duration'] = args.max_duration

    # TORCHSCRIPT: Cant use mixed types. Using -1 for "max"
    if args.pad_to is not None:
        if args.pad_to >= 0:
            feat_cfg['pad_to'] = args.pad_to
        else:
            feat_cfg['pad_to'] = -1

    if feat_cfg['pad_to'] == "max":
        feat_cfg['pad_to'] = -1

    encoder_cfg = model_cfg['encoder']
    args.use_conv_mask = encoder_cfg.get('convmask', True)
    if args.use_conv_mask and args.torch_script:
        print(
            'WARNING: Masked convs currently not supported for TorchScript. Disabling.'
        )
        encoder_cfg['convmask'] = False

    print('model_config')
    print_dict(model_cfg)
    print('feature_config')
    print_dict(feat_cfg)

    pad_to_max = feat_cfg['pad_to'] == -1
    data_layer = AudioToTextDataLayer(dataset_dir=args.dataset_dir,
                                      featurizer_config=feat_cfg,
                                      manifest_filepath=args.val_manifest,
                                      labels=vocab,
                                      batch_size=args.batch_size,
                                      pad_to_max=pad_to_max,
                                      shuffle=False,
                                      multi_gpu=False)

    audio_preprocessor = AudioPreprocessing(**feat_cfg)
    encoderdecoder = JasperEncoderDecoder(jasper_model_definition=model_cfg,
                                          feat_in=1024,
                                          num_classes=len(ctc_vocab))

    if args.ckpt is not None:
        print("loading model from ", args.ckpt)
        weights = torch.load(args.ckpt, map_location="cpu")['state_dict']
        # Preprocessor entries carry an "audio_preprocessor." prefix in the
        # checkpoint; re-key them so one dict serves both modules.
        for name in audio_preprocessor.state_dict():
            weights[name] = weights.pop("audio_preprocessor." + name)
        audio_preprocessor.load_state_dict(weights, strict=False)
        encoderdecoder.load_state_dict(weights, strict=False)

    greedy_decoder = GreedyCTCDecoder()

    num_examples = len(data_layer)
    steps_per_epoch = math.ceil(num_examples / args.batch_size)

    print('-----------------')
    if args.steps is not None:
        # An explicit step count overrides the dataset-derived numbers.
        num_examples = args.steps * args.batch_size
        steps_per_epoch = args.steps
    print('Have {0} examples to eval on.'.format(num_examples))
    print('Have {0} steps / (epoch).'.format(steps_per_epoch))
    print('-----------------')

    audio_preprocessor.to(device)
    encoderdecoder.to(device)

    if args.amp:
        amp_level = 'O' + str(optim_level)
        encoderdecoder = amp.initialize(models=encoderdecoder,
                                        opt_level=amp_level)

    # `eval` is the project's evaluation routine, not the Python builtin.
    eval(data_layer=data_layer,
         audio_processor=audio_preprocessor,
         encoderdecoder=encoderdecoder,
         greedy_decoder=greedy_decoder,
         labels=ctc_vocab,
         device=device,
         args=args)