Example #1
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

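    # distributed (multi-GPU) mode is assumed whenever a local rank is passed on the command line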
    multi_gpu = args.local_rank is not None

    if args.cpu:
        assert(not multi_gpu)
        device = torch.device('cpu')
    else:
        assert(torch.cuda.is_available())
        device = torch.device('cuda')
        torch.backends.cudnn.benchmark = args.cudnn_benchmark
        print("CUDNN BENCHMARK ", args.cudnn_benchmark)

        if multi_gpu:
            torch.cuda.set_device(args.local_rank)
            torch.distributed.init_process_group(
                backend='nccl', init_method='env://')
            print("DISTRIBUTED with ", torch.distributed.get_world_size())

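    # AMP (Apex) optimization level: O3 when --amp is set, otherwise O0 (full FP32)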
    optim_level = 3 if args.amp else 0

    with open(args.model_cfg) as f:
        model_definition = yaml.load(f, Loader=yaml.FullLoader)

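    # build the dataset vocabulary from the first token of each vocab line; an empty field maps to the space character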
    dataset_vocab = []
    with open(os.path.join(args.dataset_dir, args.vocab), "r", encoding="utf-8") as f:
        for line in f:
            token = line.split(' ')[0]
            if token == '':
                dataset_vocab.append(' ')
            else:
                dataset_vocab.append(token)

    dataset_vocab = sorted(dataset_vocab)

    ctc_vocab = add_ctc_labels(dataset_vocab)

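    # evaluation-time feature extraction settings come from the 'input_eval' section of the model config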
    val_manifest = args.val_manifest
    featurizer_config = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config["fp16"] = args.amp

    args.use_conv_mask = model_definition['encoder'].get('convmask', True)
    if args.use_conv_mask and args.export_model:
        print('WARNING: Masked convs currently not supported for TorchScript. Disabling.')
        model_definition['encoder']['convmask'] = False

    if args.max_duration is not None:
        featurizer_config['max_duration'] = args.max_duration
    if args.pad_to is not None:
        featurizer_config['pad_to'] = args.pad_to

    if featurizer_config['pad_to'] == "max":
        featurizer_config['pad_to'] = -1

    print('=== model_config ===')
    print_dict(model_definition)
    print()
    print('=== feature_config ===')
    print_dict(featurizer_config)
    print()
    data_layer = None

    if args.wav is None:
        data_layer = AudioToTextDataLayer(
            task_path=args.task_path,
            dataset_dir=args.dataset_dir,
            featurizer_config=featurizer_config,
            manifest_filepath=val_manifest,
            labels=dataset_vocab,
            batch_size=args.batch_size,
            pad_to_max=featurizer_config['pad_to'] == -1,
            shuffle=False,
            multi_gpu=multi_gpu)
    audio_preprocessor = AudioPreprocessing(**featurizer_config)

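    # only the 'Jasper' architecture is handled here; other model types would leave encoderdecoder undefined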
    if model_definition["model"] == "Jasper":
        encoderdecoder = JasperEncoderDecoder(
            jasper_model_definition=model_definition, feat_in=1024, num_classes=len(ctc_vocab))

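    # restore weights; the checkpoint stores preprocessor parameters under the 'audio_preprocessor.' prefix, so they are re-keyed before loading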
    if args.ckpt is not None:
        print("loading model from ", args.ckpt)

        if os.path.isdir(args.ckpt):
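            # checkpoint directories are not handled by this script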
            exit(0)
        else:
            checkpoint = torch.load(args.ckpt, map_location="cpu")
            if args.ema and 'ema_state_dict' in checkpoint:
                print('Loading EMA state dict')
                sd = 'ema_state_dict'
            else:
                sd = 'state_dict'

            for k in audio_preprocessor.state_dict().keys():
                checkpoint[sd][k] = checkpoint[sd].pop(
                    "audio_preprocessor." + k)
            audio_preprocessor.load_state_dict(checkpoint[sd], strict=False)
            encoderdecoder.load_state_dict(checkpoint[sd], strict=False)

    greedy_decoder = GreedyCTCDecoder()

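    # report how many utterances and steps per (GPU * epoch) the evaluation will cover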
    if args.wav is None:
        N = len(data_layer)
        step_per_epoch = math.ceil(N / (args.batch_size * (
            1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))

        if args.steps is not None:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(args.steps * args.batch_size * (
                1 if not torch.distributed.is_initialized() else torch.distributed.get_world_size())))
            print('Have {0} steps / (gpu * epoch).'.format(args.steps))
            print('-----------------')
        else:
            print('-----------------')
            print('Have {0} examples to eval on.'.format(N))
            print('Have {0} steps / (gpu * epoch).'.format(step_per_epoch))
            print('-----------------')

    print("audio_preprocessor.normalize: ",
          audio_preprocessor.featurizer.normalize)

    audio_preprocessor.to(device)
    encoderdecoder.to(device)

    if args.amp:
        encoderdecoder = amp.initialize(models=encoderdecoder,
                                        opt_level='O'+str(optim_level))

    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        device=device,
        multi_gpu=multi_gpu)
Example #2
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    assert (torch.cuda.is_available())
    torch.backends.cudnn.benchmark = args.cudnn

    # set up distributed training
    if args.local_rank is not None:
        torch.cuda.set_device(args.local_rank)

    if args.num_gpus > 1:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        assert torch.distributed.is_initialized()
        print_once("DISTRIBUTED TRAINING with {} gpus".format(
            torch.distributed.get_world_size()))
        multi_gpu = True
    else:
        multi_gpu = False
        print_once("TRAINING WITH 1 gpu")

    # define AMP optimization level
    optim_level = 1 if args.amp else 0

    with open(args.model_cfg) as f:
        model_definition = yaml.load(f, Loader=yaml.FullLoader)

    dataset_vocab = []
    with open(os.path.join(args.dataset_dir, args.vocab),
              "r",
              encoding="utf-8") as f:
        for line in f:
            token = line.split(' ')[0]
            if token == '':
                dataset_vocab.append(' ')
            else:
                dataset_vocab.append(token)

    dataset_vocab = sorted(dataset_vocab)

    ctc_vocab = add_ctc_labels(dataset_vocab)

    train_manifest = args.train_manifest
    val_manifest = args.val_manifest
    featurizer_config = model_definition['input']
    featurizer_config_eval = model_definition['input_eval']
    featurizer_config["optimization_level"] = optim_level
    featurizer_config_eval["optimization_level"] = optim_level

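    # optional bucketing sampler and audio perturbation settings are read from the model config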
    sampler_type = featurizer_config.get("sampler", 'default')
    perturb_config = model_definition.get('perturb', None)
    if args.pad_to_max:
        assert (args.max_duration > 0)
        featurizer_config['max_duration'] = args.max_duration
        featurizer_config_eval['max_duration'] = args.max_duration
        featurizer_config['pad_to'] = -1
        featurizer_config_eval['pad_to'] = -1

    print_once('model_config')
    print_dict(model_definition)

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            'Invalid gradient accumulation steps parameter {}'.format(
                args.gradient_accumulation_steps))
    if args.batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError(
            'batch size {} is not divisible by gradient accumulation steps {}'.
            format(args.batch_size, args.gradient_accumulation_steps))

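    # training data layer; each micro-batch holds batch_size // gradient_accumulation_steps utterances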
    data_layer = AudioToTextDataLayer(
        task_path=args.task_path,
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config,
        perturb_config=perturb_config,
        manifest_filepath=train_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        sampler=sampler_type,
        normalize_transcripts=model_definition["input"]["normalize_transcripts"])

    data_layer_eval = AudioToTextDataLayer(
        task_path=args.task_path,
        dataset_dir=args.dataset_dir,
        featurizer_config=featurizer_config_eval,
        manifest_filepath=val_manifest,
        labels=dataset_vocab,
        batch_size=args.batch_size // args.gradient_accumulation_steps,
        multi_gpu=multi_gpu,
        pad_to_max=args.pad_to_max,
        normalize_transcripts=model_definition["input_eval"]["normalize_transcripts"])

    if model_definition["model"] == "Jasper":
        model = Jasper(feature_config=featurizer_config,
                       jasper_model_definition=model_definition,
                       feat_in=1024,
                       num_classes=len(ctc_vocab))

        ctc_loss = CTCLossNM(num_classes=len(ctc_vocab))
        greedy_decoder = GreedyCTCDecoder()

        print_once("Number of parameters in encoder: {0}".format(
            model.encoder.num_weights()))
        print_once("Number of parameters in decode: {0}".format(
            model.decoder.num_weights()))

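    # steps per epoch: dataset size over the global batch size (default sampler) or sampler length over batch size (bucketing sampler)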
    N = len(data_layer)
    if sampler_type == 'default':
        args.step_per_epoch = math.ceil(
            N / (args.batch_size *
                 (1 if not torch.distributed.is_initialized() else
                  torch.distributed.get_world_size())))
    elif sampler_type == 'bucket':
        args.step_per_epoch = int(len(data_layer.sampler) / args.batch_size)

    print_once('-----------------')
    print_once('Have {0} examples to train on.'.format(N))
    print_once('Have {0} steps / (gpu * epoch).'.format(args.step_per_epoch))
    print_once('-----------------')

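    # per-step learning-rate schedule spanning num_epochs * step_per_epoch steps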
    def fn_lr_policy(s):
        return lr_policy(args.lr, s, args.num_epochs * args.step_per_epoch)

    model.cuda()

    if args.optimizer_kind == "novograd":
        optimizer = Novograd(model.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)
    elif args.optimizer_kind == "adam":
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          weight_decay=args.weight_decay)
    else:
        raise ValueError("invalid optimizer choice: {}".format(
            args.optimizer_kind))

    if 0 < optim_level <= 3:
        model, optimizer = amp.initialize(min_loss_scale=1.0,
                                          models=model,
                                          optimizers=optimizer,
                                          opt_level='O' + str(optim_level))

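    # keep an exponential-moving-average copy of the model when --ema > 0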
    if args.ema > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    model = model_multi_gpu(model, multi_gpu)

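    # resume from a checkpoint: restore model, EMA weights (falling back to the regular weights if missing), optimizer and AMP state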
    if args.ckpt is not None:
        print_once("loading model from {}".format(args.ckpt))
        checkpoint = torch.load(args.ckpt, map_location="cpu")
        if hasattr(model, 'module'):
            model.module.load_state_dict(checkpoint['state_dict'], strict=True)
        else:
            model.load_state_dict(checkpoint['state_dict'], strict=True)

        if args.ema > 0:
            if 'ema_state_dict' in checkpoint:
                if hasattr(ema_model, 'module'):
                    ema_model.module.load_state_dict(
                        checkpoint['ema_state_dict'], strict=True)
                else:
                    ema_model.load_state_dict(checkpoint['ema_state_dict'],
                                              strict=True)
            else:
                print_once(
                    'WARNING: ema_state_dict not found in the checkpoint')
                print_once(
                    'WARNING: initializing EMA model with regular params')
                if hasattr(ema_model, 'module'):
                    ema_model.module.load_state_dict(checkpoint['state_dict'],
                                                     strict=True)
                else:
                    ema_model.load_state_dict(checkpoint['state_dict'],
                                              strict=True)

        optimizer.load_state_dict(checkpoint['optimizer'])

        if optim_level > 0:
            amp.load_state_dict(checkpoint['amp'])

        args.start_epoch = checkpoint['epoch']
    else:
        args.start_epoch = 0

    train(data_layer,
          data_layer_eval,
          model,
          ema_model,
          ctc_loss=ctc_loss,
          greedy_decoder=greedy_decoder,
          optimizer=optimizer,
          labels=ctc_vocab,
          optim_level=optim_level,
          multi_gpu=multi_gpu,
          fn_lr_policy=fn_lr_policy if args.lr_decay else None,
          args=args)
Example #3
    audio_preprocessor.to(device)
    encoderdecoder.to(device)

    if args.amp:
        encoderdecoder = amp.initialize(models=encoderdecoder,
                                        opt_level='O'+str(optim_level))

    encoderdecoder = model_multi_gpu(encoderdecoder, multi_gpu)
    audio_preprocessor.eval()
    encoderdecoder.eval()
    greedy_decoder.eval()

    eval(
        data_layer=data_layer,
        audio_processor=audio_preprocessor,
        encoderdecoder=encoderdecoder,
        greedy_decoder=greedy_decoder,
        labels=ctc_vocab,
        args=args,
        device=device,
        multi_gpu=multi_gpu)


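# entry point: parse the CLI arguments, echo them, and run evaluation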
if __name__ == "__main__":
    args = parse_args()

    print_dict(vars(args))

    main(args)
Example #4
    def load_model(self):

        if self.cpu:
            self.device = torch.device('cpu')
        else:
            assert (torch.cuda.is_available())
            self.device = torch.device('cuda')

        with open(self.model_cfg) as f:
            model_definition = yaml.load(f, Loader=yaml.FullLoader)

        dataset_vocab = []
        with open(self.vocab, "r", encoding="utf-8") as f:
            for line in f:
                token = line.split(' ')[0]
                if token == '':
                    dataset_vocab.append(' ')
                else:
                    dataset_vocab.append(token)
        dataset_vocab = sorted(dataset_vocab)
        self.labels = add_ctc_labels(dataset_vocab)

        featurizer_config = model_definition['input_eval']
        featurizer_config["fp16"] = self.amp

        self.use_conv_mask = model_definition['encoder'].get('convmask', True)

        if self.max_duration is not None:
            featurizer_config['max_duration'] = self.max_duration
        if self.pad_to is not None:
            featurizer_config['pad_to'] = self.pad_to

        if featurizer_config['pad_to'] == "max":
            featurizer_config['pad_to'] = -1

        print('=== model_config ===')
        print_dict(model_definition)
        print()
        print('=== feature_config ===')
        print_dict(featurizer_config)
        print()
        self.audio_preprocessor = AudioPreprocessing(**featurizer_config)
        self.encoderdecoder = JasperEncoderDecoder(
            jasper_model_definition=model_definition,
            feat_in=1024,
            num_classes=len(self.labels))

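        # restore pretrained weights; preprocessor parameters are stored under the 'audio_preprocessor.' prefix in the checkpoint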
        if self.ckpt is not None:
            print("loading model from ", self.ckpt)

            if os.path.isdir(self.ckpt):
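                # checkpoint directories are not supported here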
                exit(0)
            else:
                checkpoint = torch.load(self.ckpt, map_location="cpu")
                sd = 'state_dict'
                for k in self.audio_preprocessor.state_dict().keys():
                    checkpoint[sd][k] = checkpoint[sd].pop(
                        "audio_preprocessor." + k)
                self.audio_preprocessor.load_state_dict(checkpoint[sd],
                                                        strict=False)
                self.encoderdecoder.load_state_dict(checkpoint[sd],
                                                    strict=False)

        print("audio_preprocessor.normalize: ",
              self.audio_preprocessor.featurizer.normalize)

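        # move the modules to the target device and switch everything to inference mode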
        self.audio_preprocessor.to(self.device)
        self.encoderdecoder.to(self.device)
        self.ctc_decoder = CTCDecoder()

        self.audio_preprocessor.eval()
        self.encoderdecoder.eval()
        self.ctc_decoder.eval()