Example #1
def load_and_setup_model(model_name,
                         parser,
                         checkpoint,
                         forward_is_infer=False):
    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, _ = model_parser.parse_known_args()

    model_config = models.get_model_config(model_name, model_args)
    #    model = models.get_model(model_name, model_config, to_cuda=False,
    model = models.get_model(model_name,
                             model_config,
                             forward_is_infer=forward_is_infer)

    if checkpoint is not None:
        state_dict = torch.load(checkpoint, map_location=device)['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)

    if model_name == "WaveGlow":
        model = model.remove_weightnorm(model)

    model.eval()

    #    if amp_run:
    #        model.half()

    return model
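A minimal usage sketch for the variant above, assuming the models module and the checkpoint helpers from the Tacotron 2/WaveGlow reference code are importable, and that device is defined at module scope (the function relies on it for map_location). The checkpoint path is hypothetical.

import argparse
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser(description='inference', add_help=False)
# Hypothetical checkpoint path; forward_is_infer=True makes forward() run inference.
waveglow = load_and_setup_model('WaveGlow', parser,
                                'checkpoints/waveglow_ckpt.pt',
                                forward_is_infer=True)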
Example #2
def load_and_setup_model(model_name, parser, checkpoint, amp, device,
                         unk_args=[], forward_is_infer=False, ema=True,
                         jitable=False):

    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, model_unk_args = model_parser.parse_known_args()
    unk_args[:] = list(set(unk_args) & set(model_unk_args))

    model_config = models.get_model_config(model_name, model_args)

    model = models.get_model(model_name, model_config, device,
                             forward_is_infer=forward_is_infer,
                             jitable=jitable)

    if checkpoint is not None:
        model = load_model_from_ckpt(checkpoint, ema, model)

    if model_name == "WaveGlow":
        for k, m in model.named_modules():
            m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility

        model = model.remove_weightnorm(model)

    if amp:
        model.half()
    model.eval()
    return model.to(device)
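A sketch of how this variant is typically wired into an inference script: unk_args is pruned in place so that, after the model parser has consumed its own options, anything left over can be rejected. The checkpoint path and flag values below are illustrative, not taken from the source.

import argparse
import torch

parser = argparse.ArgumentParser(description='inference', add_help=False)
args, unk_args = parser.parse_known_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generator = load_and_setup_model(
    'FastPitch', parser, 'checkpoints/fastpitch.pt',  # hypothetical path
    amp=False, device=device, unk_args=unk_args,
    forward_is_infer=True, ema=True, jitable=False)

# Options unrecognized by both the script parser and the model parser are genuine errors.
if len(unk_args) > 0:
    raise ValueError(f'Invalid options {unk_args}')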
def load_and_setup_model(model_name, parser, checkpoint, amp_run, cpu_run, forward_is_infer=False):
    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, _ = model_parser.parse_known_args()

    model_config = models.get_model_config(model_name, model_args)
    model = models.get_model(model_name, model_config, cpu_run, forward_is_infer=forward_is_infer)

    if checkpoint is not None:
        if cpu_run:
            state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))['state_dict']
        else:
            state_dict = torch.load(checkpoint)['state_dict']

        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)

    if model_name == "WaveGlow":
        model = model.remove_weightnorm(model)

    model.eval()

    if amp_run:
        model, _ = amp.initialize(model, [], opt_level="O3")

    return model
Example #4
def load_and_setup_model(model_name,
                         parser,
                         checkpoint,
                         amp_run,
                         rename=False):
    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, _ = model_parser.parse_known_args()

    model_config = models.get_model_config(model_name, model_args)
    model = models.get_model(model_name,
                             model_config,
                             to_cuda=True,
                             rename=rename)

    if checkpoint is not None:
        state_dict = torch.load(checkpoint)['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)

    if model_name == "WaveGlow":
        model = model.remove_weightnorm(model)

    model.eval()

    if amp_run:
        model.half()

    return model
Example #5
def get_model(**model_args):

    import argparse
    args = argparse.Namespace(**model_args)

    model_config = models.get_model_config(model_name="FastPitch", args=args)

    jittable = 'ts-' in args.output_format

    model = models.get_model(model_name="FastPitch",
                             model_config=model_config,
                             device='cuda',
                             forward_is_infer=True,
                             jitable=jittable)
    model = load_model_from_ckpt(args.checkpoint, args.ema, model)
    if args.precision == "fp16":
        model = model.half()
    model.eval()
    tensor_names = {
        "inputs": ["INPUT__0"],
        "outputs":
        ["OUTPUT__0", "OUTPUT__1", "OUTPUT__2", "OUTPUT__3", "OUTPUT__4"]
    }

    return model, tensor_names
def predict(model_config, file_location):
    model = load_model(model_config['filepath_weight'], model_config['filepath_architechture'])

    # image format conversion
    a = []
    img = skio.imread(file_location)
    img = resize(img, (16, 8))
    img = img.tolist()
    a.append(img)
    img = np.asarray(a)
    x_test = img

    # Confidence of all alphabets
    prediction = model.predict(x_test, batch_size=32, verbose=0)
    result = np.argmax(prediction, axis=1)
    result = result.tolist()
    confidence = prediction[0][result]

    result_alphabet = [class_mapping[int(x)] for x in result]
    confidence = Decimal(confidence[0] * 100)

    confidence = Decimal(confidence.quantize(Decimal('.01'), rounding=ROUND_HALF_UP))
    return result_alphabet[0], confidence
    
model_config = get_model_config('larger_CNN')
def build_model(model_args, device):
    model_name = "FastPitch"

    model_config = models.get_model_config(model_name, model_args)

    model = models.get_model(model_name,
                             model_config,
                             device,
                             forward_is_infer=True,
                             jitable=False)
    return model
Example #8
class Model():
    pass  # class body omitted


if __name__ == '__main__':
    arch = 'faster_rcnn'
    model_cfg = get_model_config(arch)
    model = model_cfg['model']
    dataset = BaseDataset(img_mask_path, 'PNGImages', 'PedMasks', transform=model_cfg['transforms'])
    baseloader = BaseDataLoader(dataset, batch_size=BATCH_SIZE)
def load_and_setup_model(model_name, parser, checkpoint, fp16_run):
    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, _ = model_parser.parse_known_args()

    model_config = models.get_model_config(model_name, model_args)
    model = models.get_model(model_name, model_config, to_fp16=fp16_run, to_cuda=True, training=False)

    if checkpoint is not None:
        state_dict = torch.load(checkpoint)['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)
    model.eval()

    return model
Example #10
def load_and_setup_model(model_name,
                         parser,
                         checkpoint,
                         amp_run,
                         device,
                         unk_args=[],
                         forward_is_infer=False,
                         ema=True,
                         jitable=False):
    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, model_unk_args = model_parser.parse_known_args()
    unk_args[:] = list(set(unk_args) & set(model_unk_args))

    model_config = models.get_model_config(model_name, model_args)

    model = models.get_model(model_name,
                             model_config,
                             device,
                             forward_is_infer=forward_is_infer,
                             jitable=jitable)

    if checkpoint is not None:
        checkpoint_data = torch.load(checkpoint)
        status = ''

        if 'state_dict' in checkpoint_data:
            sd = checkpoint_data['state_dict']
            if ema and 'ema_state_dict' in checkpoint_data:
                sd = checkpoint_data['ema_state_dict']
                status += ' (EMA)'
            elif ema and 'ema_state_dict' not in checkpoint_data:
                print(f'WARNING: EMA weights missing for {model_name}')

            if any(key.startswith('module.') for key in sd):
                sd = {k.replace('module.', ''): v for k, v in sd.items()}
            status += ' ' + str(model.load_state_dict(sd, strict=False))
        else:
            model = checkpoint_data['model']
        print(f'Loaded {model_name}{status}')

    if model_name == "WaveGlow":
        model = model.remove_weightnorm(model)
    if amp_run:
        model.half()
    model.eval()
    return model.to(device)
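The key renaming in the branch above handles checkpoints saved from a DistributedDataParallel wrapper, whose parameter names carry a 'module.' prefix. A small self-contained sketch of that step; the key names are made up for illustration.

def strip_ddp_prefix(state_dict):
    # Drop the leading 'module.' that DistributedDataParallel adds to every key.
    if any(key.startswith('module.') for key in state_dict):
        return {k.replace('module.', '', 1): v for k, v in state_dict.items()}
    return state_dict

sd = {'module.encoder.weight': 1, 'module.decoder.bias': 2}
print(strip_ddp_prefix(sd))  # {'encoder.weight': 1, 'decoder.bias': 2}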
Example #11
def get_experiment_config(args, dataset):
    model_generator = get_model_generator(args.model)
    model_config = get_model_config(args.model, args, dataset)
    mode = dataset.problem_type

    if mode == 'clf':  # classification
        loss_fn = CrossEntropy(target_names=dataset.target_names, )
        monitors = classification_monitors(args, dataset)
    elif mode == 'reg':  # regression
        loss_fn = MSEGraphLoss(target_names=dataset.target_names, )
        monitors = regression_monitors(args, dataset)

    config = ExperimentConfig(model_generator=model_generator,
                              model_config=model_config,
                              mode=mode,
                              loss_fn=loss_fn,
                              monitors=monitors)
    return config
Example #12
def load_and_setup_model(model_name: str, parser: ArgumentParser, checkpoint: str, 
                         fp16_run: bool, cpu_run: bool, forward_is_infer: bool = False):
    """[summary]

    Args:
        model_name (str): One of the 'Tacotron2' or 'WaveGlow'.
        parser (ArgumentParser): [description]
        checkpoint (str): [description]
        fp16_run (bool): [description]
        cpu_run (bool): [description]
        forward_is_infer (bool, optional): [description]. Defaults to False.

    Returns:
        [type]: [description]
    """

    model_parser = models.parse_model_args(model_name, parser, add_help=False)
    model_args, _ = model_parser.parse_known_args()
    model_config = models.get_model_config(model_name, model_args)
    # print(model_config)
    model = models.get_model(model_name, model_config, cpu_run=cpu_run,
                             forward_is_infer=forward_is_infer)

    if checkpoint is not None:
        if cpu_run:
            state_dict = torch.load(checkpoint, map_location=torch.device('cpu'))['state_dict']
        else:
            state_dict = torch.load(checkpoint)['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)

    if model_name == "WaveGlow":
        model = model.remove_weightnorm(model)

    model.eval()

    if fp16_run:
        model.half()

    return model
Example #13
def main():
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Training',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if args.p_arpabet > 0.0:
        cmudict.initialize(args.cmudict_path, keep_ambiguous=True)

    distributed_run = args.world_size > 1

    torch.manual_seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)

    if args.local_rank == 0:
        if not os.path.exists(args.output):
            os.makedirs(args.output)

    log_fpath = args.log_file or os.path.join(args.output, 'nvlog.json')
    tb_subsets = ['train', 'val']
    if args.ema_decay > 0.0:
        tb_subsets.append('val_ema')

    logger.init(log_fpath,
                args.output,
                enabled=(args.local_rank == 0),
                tb_subsets=tb_subsets)
    logger.parameters(vars(args), tb_subset='train')

    parser = models.parse_model_args('FastPitch', parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, args.world_size, args.local_rank)

    device = torch.device('cuda' if args.cuda else 'cpu')
    model_config = models.get_model_config('FastPitch', args)
    model = models.get_model('FastPitch', model_config, device)

    attention_kl_loss = AttentionBinarizationLoss()

    # Store pitch mean/std as params to translate from Hz during inference
    model.pitch_mean[0] = args.pitch_mean
    model.pitch_std[0] = args.pitch_std

    kw = dict(lr=args.learning_rate,
              betas=(0.9, 0.98),
              eps=1e-9,
              weight_decay=args.weight_decay)
    if args.optimizer == 'adam':
        optimizer = FusedAdam(model.parameters(), **kw)
    elif args.optimizer == 'lamb':
        optimizer = FusedLAMB(model.parameters(), **kw)
    else:
        raise ValueError

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    if args.ema_decay > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    if distributed_run:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank,
                                        find_unused_parameters=True)

    start_epoch = [1]
    start_iter = [0]

    assert args.checkpoint_path is None or args.resume is False, (
        "Specify a single checkpoint source")
    if args.checkpoint_path is not None:
        ch_fpath = args.checkpoint_path
    elif args.resume:
        ch_fpath = last_checkpoint(args.output)
    else:
        ch_fpath = None

    if ch_fpath is not None:
        load_checkpoint(args, model, ema_model, optimizer, scaler, start_epoch,
                        start_iter, model_config, ch_fpath)

    start_epoch = start_epoch[0]
    total_iter = start_iter[0]

    criterion = FastPitchLoss(
        dur_predictor_loss_scale=args.dur_predictor_loss_scale,
        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale,
        attn_loss_scale=args.attn_loss_scale)

    collate_fn = TTSCollate()

    if args.local_rank == 0:
        prepare_tmp(args.pitch_online_dir)

    trainset = TTSDataset(audiopaths_and_text=args.training_files,
                          **vars(args))
    valset = TTSDataset(audiopaths_and_text=args.validation_files,
                        **vars(args))

    if distributed_run:
        train_sampler, shuffle = DistributedSampler(trainset), False
    else:
        train_sampler, shuffle = None, True

    # 4 workers are optimal on DGX-1 (from epoch 2 onwards)
    train_loader = DataLoader(trainset,
                              num_workers=4,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=True,
                              persistent_workers=True,
                              drop_last=True,
                              collate_fn=collate_fn)

    if args.ema_decay:
        mt_ema_params = init_multi_tensor_ema(model, ema_model)

    model.train()

    bmark_stats = BenchmarkStats()

    torch.cuda.synchronize()
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_start_time = time.perf_counter()

        epoch_loss = 0.0
        epoch_mel_loss = 0.0
        epoch_num_frames = 0
        epoch_frames_per_sec = 0.0

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        accumulated_steps = 0
        iter_loss = 0
        iter_num_frames = 0
        iter_meta = {}
        iter_start_time = time.perf_counter()

        epoch_iter = 0
        num_iters = len(train_loader) // args.grad_accumulation
        for batch in train_loader:

            if accumulated_steps == 0:
                if epoch_iter == num_iters:
                    break
                total_iter += 1
                epoch_iter += 1

                adjust_learning_rate(total_iter, optimizer, args.learning_rate,
                                     args.warmup_steps)

                model.zero_grad(set_to_none=True)

            x, y, num_frames = batch_to_gpu(batch)

            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss, meta = criterion(y_pred, y)

                if (args.kl_loss_start_epoch is not None
                        and epoch >= args.kl_loss_start_epoch):

                    if args.kl_loss_start_epoch == epoch and epoch_iter == 1:
                        print('Begin hard_attn loss')

                    _, _, _, _, _, _, _, _, attn_soft, attn_hard, _, _ = y_pred
                    binarization_loss = attention_kl_loss(attn_hard, attn_soft)
                    kl_weight = min(
                        (epoch - args.kl_loss_start_epoch) /
                        args.kl_loss_warmup_epochs, 1.0) * args.kl_loss_weight
                    meta['kl_loss'] = binarization_loss.clone().detach() * kl_weight
                    loss += kl_weight * binarization_loss

                else:
                    meta['kl_loss'] = torch.zeros_like(loss)
                    kl_weight = 0
                    binarization_loss = 0

                loss /= args.grad_accumulation

            meta = {k: v / args.grad_accumulation for k, v in meta.items()}

            if args.amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
                meta = {
                    k: reduce_tensor(v, args.world_size)
                    for k, v in meta.items()
                }
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            accumulated_steps += 1
            iter_loss += reduced_loss
            iter_num_frames += reduced_num_frames
            iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta}

            if accumulated_steps % args.grad_accumulation == 0:

                logger.log_grads_tb(total_iter, model)
                if args.amp:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_thresh)
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_thresh)
                    optimizer.step()

                if args.ema_decay > 0.0:
                    apply_multi_tensor_ema(args.ema_decay, *mt_ema_params)

                iter_mel_loss = iter_meta['mel_loss'].item()
                iter_kl_loss = iter_meta['kl_loss'].item()
                iter_time = time.perf_counter() - iter_start_time
                epoch_frames_per_sec += iter_num_frames / iter_time
                epoch_loss += iter_loss
                epoch_num_frames += iter_num_frames
                epoch_mel_loss += iter_mel_loss

                log(
                    (epoch, epoch_iter, num_iters),
                    tb_total_steps=total_iter,
                    subset='train',
                    data=OrderedDict([
                        ('loss', iter_loss), ('mel_loss', iter_mel_loss),
                        ('kl_loss', iter_kl_loss), ('kl_weight', kl_weight),
                        ('frames/s', iter_num_frames / iter_time),
                        ('took', iter_time),
                        ('lrate', optimizer.param_groups[0]['lr'])
                    ]),
                )

                accumulated_steps = 0
                iter_loss = 0
                iter_num_frames = 0
                iter_meta = {}
                iter_start_time = time.perf_counter()

        # Finished epoch
        epoch_loss /= epoch_iter
        epoch_mel_loss /= epoch_iter
        epoch_time = time.perf_counter() - epoch_start_time

        log(
            (epoch, ),
            tb_total_steps=None,
            subset='train_avg',
            data=OrderedDict([('loss', epoch_loss),
                              ('mel_loss', epoch_mel_loss),
                              ('frames/s', epoch_num_frames / epoch_time),
                              ('took', epoch_time)]),
        )
        bmark_stats.update(epoch_num_frames, epoch_loss, epoch_mel_loss,
                           epoch_time)

        validate(model, epoch, total_iter, criterion, valset, args.batch_size,
                 collate_fn, distributed_run, batch_to_gpu)

        if args.ema_decay > 0:
            validate(ema_model,
                     epoch,
                     total_iter,
                     criterion,
                     valset,
                     args.batch_size,
                     collate_fn,
                     distributed_run,
                     batch_to_gpu,
                     ema=True)

        maybe_save_checkpoint(args, model, ema_model, optimizer, scaler, epoch,
                              total_iter, model_config)
        logger.flush()

    # Finished training
    if len(bmark_stats) > 0:
        log((),
            tb_total_steps=None,
            subset='train_avg',
            data=bmark_stats.get(args.benchmark_epochs_num))

    validate(model, None, total_iter, criterion, valset, args.batch_size,
             collate_fn, distributed_run, batch_to_gpu)
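The training loop above combines gradient accumulation with torch.cuda.amp: the loss is divided by the accumulation factor, scaled and accumulated over several batches, and the optimizer steps only on accumulation boundaries, with gradients unscaled before clipping. A minimal self-contained sketch of that pattern on a toy model; the sizes, accumulation factor, and clipping threshold are illustrative, and a CUDA device is assumed.

import torch

model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler(enabled=True)
grad_accumulation, grad_clip_thresh = 2, 1000.0

data = [(torch.randn(8, 16, device='cuda'), torch.randn(8, 4, device='cuda'))
        for _ in range(4)]

model.zero_grad(set_to_none=True)
for step, (x, y) in enumerate(data, start=1):
    with torch.cuda.amp.autocast(enabled=True):
        # Average the loss over the accumulation window so gradients
        # match those of a single large batch.
        loss = torch.nn.functional.mse_loss(model(x), y) / grad_accumulation
    scaler.scale(loss).backward()

    if step % grad_accumulation == 0:
        scaler.unscale_(optimizer)  # unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
        scaler.step(optimizer)
        scaler.update()
        model.zero_grad(set_to_none=True)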
Example #14
def main():

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size

    distributed_run = world_size > 1

    if local_rank == 0:
        log_file = os.path.join(args.output, args.log_file)
        DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
                                StdOutBackend(Verbosity.VERBOSE)])
    else:
        DLLogger.init(backends=[])

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    model_name = args.model_name
    parser = models.model_parser(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config,
                             cpu_run=False,
                             uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)

    if distributed_run:
        model = DDP(model, device_ids=[local_rank], output_device=local_rank)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    start_epoch = [0]

    if args.resume_from_last:
        args.checkpoint_path = get_last_checkpoint_filename(args.output, model_name)

    if args.checkpoint_path != "":
        load_checkpoint(model, optimizer, start_epoch, model_config,
                        args.amp, args.checkpoint_path, local_rank)

    start_epoch = start_epoch[0]

    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(
        model_name, n_frames_per_step)
    trainset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.training_files, args)
    if distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)

    valset = data_functions.get_data_loader(
        model_name, args.dataset_path, args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    train_epoch_items_per_sec = 0.0
    val_loss = 0.0
    num_iters = 0

    model.train()

    for epoch in range(start_epoch, args.epochs):
        torch.cuda.synchronize()
        epoch_start_time = time.perf_counter()
        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        train_epoch_items_per_sec = 0.0

        num_iters = 0
        reduced_loss = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        for i, batch in enumerate(train_loader):
            torch.cuda.synchronize()
            iter_start_time = time.perf_counter()
            DLLogger.log(step=(epoch, i),
                         data={'glob_iter/iters_per_epoch': str(iteration)+"/"+str(len(train_loader))})

            adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor, local_rank)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            #AMP upstream autocast
            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x)
                loss = criterion(y_pred, y)
            
            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            DLLogger.log(step=(epoch,i), data={'train_loss': reduced_loss})

            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.amp:
                scaler.scale(loss).backward()

                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)
                
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)  

            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

                optimizer.step()

            torch.cuda.synchronize()
            iter_stop_time = time.perf_counter()
            iter_time = iter_stop_time - iter_start_time
            items_per_sec = reduced_num_items/iter_time
            train_epoch_items_per_sec += items_per_sec

            DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec})
            DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time})
            iteration += 1

        torch.cuda.synchronize()
        epoch_stop_time = time.perf_counter()
        epoch_time = epoch_stop_time - epoch_start_time

        DLLogger.log(step=(epoch,), data={'train_items_per_sec':
                                          (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
        DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
        DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})

        val_loss, val_items_per_sec = validate(model, criterion, valset, epoch,
                                               iteration, args.batch_size,
                                               world_size, collate_fn,
                                               distributed_run, local_rank,
                                               batch_to_gpu)

        if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
            save_checkpoint(model, optimizer, scaler, epoch, model_config,
                            args.amp, args.output, args.model_name,
                            local_rank, world_size)
        if local_rank == 0:
            DLLogger.flush()

    torch.cuda.synchronize()
    run_stop_time = time.perf_counter()
    run_time = run_stop_time - run_start_time
    DLLogger.log(step=tuple(), data={'run_time': run_time})
    DLLogger.log(step=tuple(), data={'val_loss': val_loss})
    DLLogger.log(step=tuple(), data={'train_items_per_sec':
                                     (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
    DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})

    if local_rank == 0:
        DLLogger.flush()
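reduce_tensor, used in the loops above to average losses and to sum item counts across workers, is not shown in this listing. A plausible sketch of such a helper, not necessarily the source's exact implementation: it all-reduces a copy of the tensor and divides by the given count, so passing 1 (as done for num_items) yields a plain sum.

import torch
import torch.distributed as dist

def reduce_tensor(tensor, num_gpus):
    # Sum the tensor across all distributed workers, then divide.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    return rt / num_gpus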
Example #15
def main():

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_items/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss",
                           metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    # Restore training from checkpoint logic
    checkpoint = None
    start_epoch = 0

    model_name = args.model_name
    parser = models.parse_model_args(model_name, parser)

    args = parser.parse_args()

    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    num_gpus = torch.cuda.device_count()
    print("gpus", num_gpus)
    distributed_run = num_gpus > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    # Restore training from checkpoint logic
    if args.restore_from:
        print('Restoring from {} checkpoint'.format(args.restore_from))
        checkpoint = torch.load(args.restore_from, map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model_config = checkpoint['config']
        model = models.get_model(model_name, model_config, to_cuda=True)

        new_state_dict = {}
        for key, value in checkpoint['state_dict'].items():
            new_key = key.replace('module.', '')
            new_state_dict[new_key] = value

        model_dict = new_state_dict
        if args.warm_start:
            ignore_layers = ['embedding.weight']
            print('Warm start')

            if len(ignore_layers) > 0:
                model_dict = {
                    k: v
                    for k, v in model_dict.items() if k not in ignore_layers
                }
                dummy_dict = model.state_dict()
                dummy_dict.update(model_dict)
                model_dict = dummy_dict

        model.load_state_dict(model_dict)
    else:
        model_config = models.get_model_config(model_name, args)
        model = models.get_model(model_name, model_config, to_cuda=True)
        print("model configured")
        #model.cuda(4)
    model.cuda()
    # if not args.amp_run and distributed_run:
    #     model = DDP(model ,delay_allreduce=True)
    #
    #

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    # Restore training from checkpoint logic
    if checkpoint and 'optimizer_state_dict' in checkpoint and not args.warm_start:  # TODO: think about this more
        print('Restoring optimizer state')
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.amp_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        print("amp initialized")

        model = DDP(model, delay_allreduce=True)
        print("ddpmodel")

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    print("train starting")
    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    print("data loading start")
    collate_fn = data_functions.get_collate_function(model_name,
                                                     n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.training_files,
                                              args)

    train_sampler = DistributedSampler(trainset) if distributed_run else None
    print("train loader started")

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.validation_files,
                                            args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)

    # Restore training from checkpoint logic
    if start_epoch >= args.epochs:
        print('Checkpoint epoch {} >= total epochs {}'.format(
            start_epoch, args.epochs))
    else:
        for epoch in range(start_epoch, args.epochs):
            LOGGER.epoch_start()
            epoch_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

            # used to calculate avg items/sec over epoch
            reduced_num_items_epoch = 0

            # used to calculate avg loss over epoch
            train_epoch_avg_loss = 0.0
            train_epoch_avg_items_per_sec = 0.0
            num_iters = 0

            # if overflow at the last iteration then do not save checkpoint
            overflow = False

            for i, batch in enumerate(train_loader):

                print("Batch: {}/{} epoch {}".format(i, len(train_loader),
                                                     epoch))
                LOGGER.iteration_start()
                iter_start_time = time.time()
                LOGGER.log(key=tags.TRAIN_ITER_START, value=i)

                start = time.perf_counter()
                adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                     args.anneal_steps, args.anneal_factor)

                model.zero_grad()

                x, y, num_items = batch_to_gpu(batch)

                y_pred = model(x)

                loss = criterion(y_pred, y)

                if distributed_run:
                    reduced_loss = reduce_tensor(loss.data,
                                                 args.world_size).item()
                    reduced_num_items = reduce_tensor(num_items.data, 1).item()
                else:
                    reduced_loss = loss.item()
                    reduced_num_items = num_items.item()
                if np.isnan(reduced_loss):
                    raise Exception("loss is NaN")

                LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

                train_epoch_avg_loss += reduced_loss
                num_iters += 1

                # accumulate number of items processed in this epoch
                reduced_num_items_epoch += reduced_num_items

                if args.amp_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_thresh)
                else:
                    loss.backward()
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.grad_clip_thresh)

                optimizer.step()

                iteration += 1

                LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)

                iter_stop_time = time.time()
                iter_time = iter_stop_time - iter_start_time
                items_per_sec = reduced_num_items / iter_time
                train_epoch_avg_items_per_sec += items_per_sec

                LOGGER.log(key="train_iter_items/sec", value=items_per_sec)
                LOGGER.log(key="iter_time", value=iter_time)
                LOGGER.iteration_stop()

            LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
            epoch_stop_time = time.time()
            epoch_time = epoch_stop_time - epoch_start_time

            LOGGER.log(key="train_epoch_items/sec",
                       value=(reduced_num_items_epoch / epoch_time))
            LOGGER.log(key="train_epoch_avg_items/sec",
                       value=(train_epoch_avg_items_per_sec /
                              num_iters if num_iters > 0 else 0.0))
            LOGGER.log(key="train_epoch_avg_loss",
                       value=(train_epoch_avg_loss /
                              num_iters if num_iters > 0 else 0.0))
            LOGGER.log(key="epoch_time", value=epoch_time)

            LOGGER.log(key=tags.EVAL_START, value=epoch)

            validate(model, criterion, valset, iteration, args.batch_size,
                     args.world_size, collate_fn, distributed_run, args.rank,
                     batch_to_gpu)

            LOGGER.log(key=tags.EVAL_STOP, value=epoch)

            if (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0:
                checkpoint_path = os.path.join(
                    args.output_directory,
                    "checkpoint_{}_{}".format(model_name, epoch))
                save_checkpoint(model, epoch, model_config, optimizer,
                                checkpoint_path)
                save_sample(
                    model_name, model, args.waveglow_checkpoint,
                    args.tacotron2_checkpoint, args.phrase_path,
                    os.path.join(
                        args.output_directory,
                        "sample_{}_{}.wav".format(model_name, iteration)),
                    args.sampling_rate)

            LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)

    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
def main():

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])

    LOGGER.timed_block_start("run")
    LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS,
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE)
    LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_items/sec",
                           metric_scope=dllg.EPOCH_SCOPE)
    LOGGER.register_metric("train_epoch_avg_loss",
                           metric_scope=dllg.EPOCH_SCOPE)

    log_hardware()

    model_name = args.model_name
    parser = models.parse_model_args(model_name, parser)

    args = parser.parse_args()

    log_args(args)

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    distributed_run = args.world_size > 1
    if distributed_run:
        init_distributed(args, args.world_size, args.rank, args.group_name)

    LOGGER.log(key=tags.RUN_START)
    run_start_time = time.time()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name,
                             model_config,
                             to_fp16=args.fp16_run,
                             to_cuda=True)

    epoch_start = 0
    if args.resume:
        resume_model_path = args.resume_tacotron2_path if args.model_name == "Tacotron2" else args.resume_waveglow_path
        checkpoint = torch.load(resume_model_path, map_location='cpu')
        epoch_start = checkpoint["epoch"]
        state_dict = checkpoint['state_dict']
        if checkpoint_from_distributed(state_dict):
            state_dict = unwrap_distributed(state_dict)

        model.load_state_dict(state_dict)
        print("restore model %s" % resume_model_path)

    if distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=args.dynamic_loss_scaling)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None

    criterion = loss_functions.get_loss_function(model_name, sigma)

    try:
        n_frames_per_step = args.n_frames_per_step
    except AttributeError:
        n_frames_per_step = None

    collate_fn = data_functions.get_collate_function(model_name,
                                                     n_frames_per_step)
    trainset = data_functions.get_data_loader(model_name, args.dataset_path,
                                              args.training_files, args)
    train_sampler = DistributedSampler(trainset) if distributed_run else None
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    valset = data_functions.get_data_loader(model_name, args.dataset_path,
                                            args.validation_files, args)

    batch_to_gpu = data_functions.get_batch_to_gpu(model_name)

    iteration = 0
    model.train()

    LOGGER.log(key=tags.TRAIN_LOOP)

    for epoch in range(epoch_start, args.epochs):
        LOGGER.epoch_start()
        epoch_start_time = time.time()
        LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)

        # used to calculate avg items/sec over epoch
        reduced_num_items_epoch = 0

        # used to calculate avg loss over epoch
        train_epoch_avg_loss = 0.0
        num_iters = 0

        # if overflow at the last iteration then do not save checkpoint
        overflow = False

        for i, batch in enumerate(train_loader):
            LOGGER.iteration_start()
            iter_start_time = time.time()
            LOGGER.log(key=tags.TRAIN_ITER_START, value=i)
            print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch))

            start = time.perf_counter()
            adjust_learning_rate(epoch, optimizer, args.learning_rate,
                                 args.anneal_steps, args.anneal_factor)

            model.zero_grad()
            x, y, num_items = batch_to_gpu(batch)

            if args.fp16_run:
                y_pred = model(fp32_to_fp16(x))
                loss = criterion(fp16_to_fp32(y_pred), y)
            else:
                y_pred = model(x)
                loss = criterion(y_pred, y)

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_items = reduce_tensor(num_items.data, 1).item()
            else:
                reduced_loss = loss.item()
                reduced_num_items = num_items.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss)

            train_epoch_avg_loss += reduced_loss
            num_iters += 1

            # accumulate number of items processed in this epoch
            reduced_num_items_epoch += reduced_num_items

            if args.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_master_grads(args.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), args.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if args.fp16_run else False
            iteration += 1

            LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i)

            iter_stop_time = time.time()
            iter_time = iter_stop_time - iter_start_time
            LOGGER.log(key="train_iter_items/sec",
                       value=(reduced_num_items / iter_time))
            LOGGER.log(key="iter_time", value=iter_time)
            LOGGER.iteration_stop()

        LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        LOGGER.log(key="train_epoch_items/sec",
                   value=(reduced_num_items_epoch / epoch_time))
        LOGGER.log(key="train_epoch_avg_loss",
                   value=(train_epoch_avg_loss /
                          num_iters if num_iters > 0 else 0.0))
        LOGGER.log(key="epoch_time", value=epoch_time)

        LOGGER.log(key=tags.EVAL_START, value=epoch)

        validate(model, criterion, valset, iteration, args.batch_size,
                 args.world_size, collate_fn, distributed_run, args.rank,
                 batch_to_gpu, args.fp16_run)

        LOGGER.log(key=tags.EVAL_STOP, value=epoch)

        if not overflow and (epoch % args.epochs_per_checkpoint
                             == 0) and args.rank == 0:
            checkpoint_path = os.path.join(
                args.output_directory,
                "checkpoint_{}_{}".format(model_name, epoch))
            save_checkpoint(model, epoch, model_config, checkpoint_path)
            save_sample(
                model_name, model, args.waveglow_checkpoint,
                args.tacotron2_checkpoint, args.phrase_path,
                os.path.join(args.output_directory,
                             "sample_{}_{}.wav".format(model_name, iteration)),
                args.sampling_rate, args.fp16_run)

        LOGGER.epoch_stop()

    run_stop_time = time.time()
    run_time = run_stop_time - run_start_time
    LOGGER.log(key="run_time", value=run_time)
    LOGGER.log(key=tags.RUN_FINAL)

    print("training time", run_stop_time - run_start_time)

    LOGGER.timed_block_stop("run")

    if args.rank == 0:
        LOGGER.finish()
def main():
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Training',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size
    distributed_run = world_size > 1

    torch.manual_seed(args.seed + local_rank)
    np.random.seed(args.seed + local_rank)

    if local_rank == 0:
        if not os.path.exists(args.output):
            os.makedirs(args.output)

        log_fpath = args.log_file or os.path.join(args.output, 'nvlog.json')
        log_fpath = unique_dllogger_fpath(log_fpath)
        init_dllogger(log_fpath)
    else:
        init_dllogger(dummy=True)

    for k, v in vars(args).items():
        DLLogger.log("PARAMETER", {k: v})

    parser = models.parse_model_args('FastPitch', parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank)

    device = torch.device('cuda' if args.cuda else 'cpu')
    model_config = models.get_model_config('FastPitch', args)
    model = models.get_model('FastPitch', model_config, device)

    # Store pitch mean/std as params to translate from Hz during inference
    fpath = common.utils.stats_filename(args.dataset_path, args.training_files,
                                        'pitch_char')
    with open(args.pitch_mean_std_file, 'r') as f:
        stats = json.load(f)
    model.pitch_mean[0] = stats['mean']
    model.pitch_std[0] = stats['std']

    kw = dict(lr=args.learning_rate,
              betas=(0.9, 0.98),
              eps=1e-9,
              weight_decay=args.weight_decay)
    if args.optimizer == 'adam':
        optimizer = FusedAdam(model.parameters(), **kw)
    elif args.optimizer == 'lamb':
        optimizer = FusedLAMB(model.parameters(), **kw)
    else:
        raise ValueError

    if args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.ema_decay > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    if distributed_run:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank,
                                        find_unused_parameters=True)

    start_epoch = [1]
    start_iter = [0]

    assert args.checkpoint_path is None or args.resume is False, (
        "Specify a single checkpoint source")
    if args.checkpoint_path is not None:
        ch_fpath = args.checkpoint_path
    elif args.resume:
        ch_fpath = last_checkpoint(args.output)
    else:
        ch_fpath = None

    if ch_fpath is not None:
        load_checkpoint(local_rank, model, ema_model, optimizer, start_epoch,
                        start_iter, model_config, args.amp, ch_fpath,
                        world_size)

    start_epoch = start_epoch[0]
    total_iter = start_iter[0]

    criterion = loss_functions.get_loss_function(
        'FastPitch',
        dur_predictor_loss_scale=args.dur_predictor_loss_scale,
        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale)

    collate_fn = data_functions.get_collate_function('FastPitch')
    trainset = data_functions.get_data_loader('FastPitch', args.dataset_path,
                                              args.training_files, args)
    valset = data_functions.get_data_loader('FastPitch', args.dataset_path,
                                            args.validation_files, args)
    if distributed_run:
        train_sampler, shuffle = DistributedSampler(trainset), False
    else:
        train_sampler, shuffle = None, True

    train_loader = DataLoader(trainset,
                              num_workers=16,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    batch_to_gpu = data_functions.get_batch_to_gpu('FastPitch')

    model.train()

    train_tblogger = TBLogger(local_rank, args.output, 'train')
    val_tblogger = TBLogger(local_rank, args.output, 'val', dummies=True)
    if args.ema_decay > 0:
        val_ema_tblogger = TBLogger(local_rank, args.output, 'val_ema')

    val_loss = 0.0
    torch.cuda.synchronize()
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_start_time = time.time()

        epoch_loss = 0.0
        epoch_mel_loss = 0.0
        epoch_num_frames = 0
        epoch_frames_per_sec = 0.0

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        accumulated_steps = 0
        iter_loss = 0
        iter_num_frames = 0
        iter_meta = {}

        epoch_iter = 0
        num_iters = len(train_loader) // args.gradient_accumulation_steps
        for batch in train_loader:
            if accumulated_steps == 0:
                if epoch_iter == num_iters:
                    break
                total_iter += 1
                epoch_iter += 1
                iter_start_time = time.time()
                start = time.perf_counter()

                old_lr = optimizer.param_groups[0]['lr']
                adjust_learning_rate(total_iter, optimizer, args.learning_rate,
                                     args.warmup_steps)
                new_lr = optimizer.param_groups[0]['lr']

                if new_lr != old_lr:
                    dllog_lrate_change = f'{old_lr:.2E} -> {new_lr:.2E}'
                    train_tblogger.log_value(total_iter, 'lrate', new_lr)
                else:
                    dllog_lrate_change = None

                model.zero_grad()

            x, y, num_frames = batch_to_gpu(batch)
            y_pred = model(x, use_gt_durations=True)
            loss, meta = criterion(y_pred, y)

            loss /= args.gradient_accumulation_steps
            meta = {
                k: v / args.gradient_accumulation_steps
                for k, v in meta.items()
            }

            if args.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
                meta = {
                    k: reduce_tensor(v, world_size)
                    for k, v in meta.items()
                }
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            accumulated_steps += 1
            iter_loss += reduced_loss
            iter_num_frames += reduced_num_frames
            iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta}

            if accumulated_steps % args.gradient_accumulation_steps == 0:

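                # A full effective batch has been accumulated: clip gradients and take an optimizer step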
                train_tblogger.log_grads(total_iter, model)
                if args.amp:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_thresh)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_thresh)

                optimizer.step()
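                # Update the EMA copy of the weights after each optimizer step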
                apply_ema_decay(model, ema_model, args.ema_decay)

                iter_stop_time = time.time()
                iter_time = iter_stop_time - iter_start_time
                frames_per_sec = iter_num_frames / iter_time
                epoch_frames_per_sec += frames_per_sec
                epoch_loss += iter_loss
                epoch_num_frames += iter_num_frames
                iter_mel_loss = iter_meta['mel_loss'].item()
                epoch_mel_loss += iter_mel_loss

                DLLogger.log(
                    (epoch, epoch_iter, num_iters),
                    OrderedDict([('train_loss', iter_loss),
                                 ('train_mel_loss', iter_mel_loss),
                                 ('train_frames/s', frames_per_sec),
                                 ('took', iter_time),
                                 ('lrate_change', dllog_lrate_change)]))
                train_tblogger.log_meta(total_iter, iter_meta)

                accumulated_steps = 0
                iter_loss = 0
                iter_num_frames = 0
                iter_meta = {}

        # Finished epoch
        epoch_stop_time = time.time()
        epoch_time = epoch_stop_time - epoch_start_time

        DLLogger.log((epoch, ),
                     data=OrderedDict([
                         ('avg_train_loss', epoch_loss / epoch_iter),
                         ('avg_train_mel_loss', epoch_mel_loss / epoch_iter),
                         ('avg_train_frames/s', epoch_num_frames / epoch_time),
                         ('took', epoch_time)
                     ]))

        tik = time.time()
        val_loss, meta, num_frames = validate(model,
                                              criterion,
                                              valset,
                                              args.batch_size,
                                              world_size,
                                              collate_fn,
                                              distributed_run,
                                              local_rank,
                                              batch_to_gpu,
                                              use_gt_durations=True)
        tok = time.time()

        DLLogger.log((epoch, ),
                     data=OrderedDict([
                         ('val_loss', val_loss),
                         ('val_mel_loss', meta['mel_loss'].item()),
                         ('val_frames/s', num_frames / (tok - tik)),
                         ('took', tok - tik),
                     ]))
        val_tblogger.log_meta(total_iter, meta)

        if args.ema_decay > 0:
            tik_e = time.time()
            val_loss_e, meta_e, num_frames_e = validate(ema_model,
                                                        criterion,
                                                        valset,
                                                        args.batch_size,
                                                        world_size,
                                                        collate_fn,
                                                        distributed_run,
                                                        local_rank,
                                                        batch_to_gpu,
                                                        use_gt_durations=True)
            tok_e = time.time()

            DLLogger.log(
                (epoch, ),
                data=OrderedDict([
                    ('val_ema_loss', val_loss_e),
                    ('val_ema_mel_loss', meta_e['mel_loss'].item()),
                    ('val_ema_frames/s', num_frames_e / (tok_e - tik_e)),
                    ('took', tok_e - tik_e),
                ]))
            val_ema_tblogger.log_meta(total_iter, meta_e)

        if (epoch > 0 and args.epochs_per_checkpoint > 0
                and (epoch % args.epochs_per_checkpoint == 0)
                and local_rank == 0):

            checkpoint_path = os.path.join(args.output,
                                           f"FastPitch_checkpoint_{epoch}.pt")
            save_checkpoint(local_rank, model, ema_model, optimizer, epoch,
                            total_iter, model_config, args.amp,
                            checkpoint_path)
        if local_rank == 0:
            DLLogger.flush()

    # Finished training
    DLLogger.log((),
                 data=OrderedDict([
                     ('avg_train_loss', epoch_loss / epoch_iter),
                     ('avg_train_mel_loss', epoch_mel_loss / epoch_iter),
                     ('avg_train_frames/s', epoch_num_frames / epoch_time),
                 ]))
    DLLogger.log((),
                 data=OrderedDict([
                     ('val_loss', val_loss),
                     ('val_mel_loss', meta['mel_loss'].item()),
                     ('val_frames/s', num_frames / (tok - tik)),
                 ]))
    if local_rank == 0:
        DLLogger.flush()
Example #18
0
def main():
    parser = argparse.ArgumentParser(description='PyTorch FastPitch Training',
                                     allow_abbrev=False)
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    distributed_run = args.world_size > 1

    torch.manual_seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)

    if args.local_rank == 0:
        if not os.path.exists(args.output):
            os.makedirs(args.output)

    log_fpath = args.log_file or os.path.join(args.output, 'nvlog.json')
    tb_subsets = ['train', 'val']
    if args.ema_decay > 0.0:
        tb_subsets.append('val_ema')

    logger.init(log_fpath,
                args.output,
                enabled=(args.local_rank == 0),
                tb_subsets=tb_subsets)
    logger.parameters(vars(args), tb_subset='train')

    parser = models.parse_model_args('FastPitch', parser)
    args, unk_args = parser.parse_known_args()
    if len(unk_args) > 0:
        raise ValueError(f'Invalid options {unk_args}')

    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, args.world_size, args.local_rank)

    device = torch.device('cuda' if args.cuda else 'cpu')
    model_config = models.get_model_config('FastPitch', args)
    model = models.get_model('FastPitch', model_config, device)

    # Store pitch mean/std as params to translate from Hz during inference
    with open(args.pitch_mean_std_file, 'r') as f:
        stats = json.load(f)
    model.pitch_mean[0] = stats['mean']
    model.pitch_std[0] = stats['std']

    kw = dict(lr=args.learning_rate,
              betas=(0.9, 0.98),
              eps=1e-9,
              weight_decay=args.weight_decay)
    if args.optimizer == 'adam':
        optimizer = FusedAdam(model.parameters(), **kw)
    elif args.optimizer == 'lamb':
        optimizer = FusedLAMB(model.parameters(), **kw)
    else:
        raise ValueError(f'Unknown optimizer: {args.optimizer}')

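    # Loss scaler for native AMP; with enabled=False its scale/step/update calls are pass-throughs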
    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

    #if args.amp:
    #model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if args.ema_decay > 0:
        ema_model = copy.deepcopy(model)
    else:
        ema_model = None

    if distributed_run:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank,
                                        find_unused_parameters=True)

    start_epoch = [1]
    start_iter = [0]

    assert args.checkpoint_path is None or args.resume is False, (
        "Specify a single checkpoint source")
    if args.checkpoint_path is not None:
        ch_fpath = args.checkpoint_path
    elif args.resume:
        ch_fpath = last_checkpoint(args.output)
    else:
        ch_fpath = None

    if ch_fpath is not None:
        load_checkpoint(args.local_rank, model, ema_model, optimizer,
                        start_epoch, start_iter, model_config, args.amp,
                        ch_fpath, args.world_size)

    start_epoch = start_epoch[0]
    total_iter = start_iter[0]

    criterion = loss_functions.get_loss_function(
        'FastPitch',
        dur_predictor_loss_scale=args.dur_predictor_loss_scale,
        pitch_predictor_loss_scale=args.pitch_predictor_loss_scale)

    collate_fn = data_functions.get_collate_function('FastPitch')
    trainset = data_functions.get_data_loader('FastPitch', args.dataset_path,
                                              args.training_files, args)
    valset = data_functions.get_data_loader('FastPitch', args.dataset_path,
                                            args.validation_files, args)
    if distributed_run:
        train_sampler, shuffle = DistributedSampler(trainset), False
    else:
        train_sampler, shuffle = None, True

    train_loader = DataLoader(trainset,
                              num_workers=16,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=args.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    batch_to_gpu = data_functions.get_batch_to_gpu('FastPitch')

    model.train()

    torch.cuda.synchronize()
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_start_time = time.perf_counter()

        epoch_loss = 0.0
        epoch_mel_loss = 0.0
        epoch_num_frames = 0
        epoch_frames_per_sec = 0.0

        if distributed_run:
            train_loader.sampler.set_epoch(epoch)

        accumulated_steps = 0
        iter_loss = 0
        iter_num_frames = 0
        iter_meta = {}

        epoch_iter = 0
        num_iters = len(train_loader) // args.gradient_accumulation_steps
        for batch in train_loader:

            if accumulated_steps == 0:
                if epoch_iter == num_iters:
                    break
                total_iter += 1
                epoch_iter += 1
                iter_start_time = time.perf_counter()

                adjust_learning_rate(total_iter, optimizer, args.learning_rate,
                                     args.warmup_steps)

                model.zero_grad()

            x, y, num_frames = batch_to_gpu(batch)

            # Forward pass and loss under native (upstream) AMP autocast; mixed precision only when args.amp is set
            with torch.cuda.amp.autocast(enabled=args.amp):
                y_pred = model(x, use_gt_durations=True)
                loss, meta = criterion(y_pred, y)

                loss /= args.gradient_accumulation_steps
            meta = {
                k: v / args.gradient_accumulation_steps
                for k, v in meta.items()
            }

            if args.amp:
                #with amp.scale_loss(loss, optimizer) as scaled_loss:
                #scaled_loss.backward()
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, args.world_size).item()
                reduced_num_frames = reduce_tensor(num_frames.data, 1).item()
                meta = {
                    k: reduce_tensor(v, args.world_size)
                    for k, v in meta.items()
                }
            else:
                reduced_loss = loss.item()
                reduced_num_frames = num_frames.item()
            if np.isnan(reduced_loss):
                raise Exception("loss is NaN")

            accumulated_steps += 1
            iter_loss += reduced_loss
            iter_num_frames += reduced_num_frames
            iter_meta = {k: iter_meta.get(k, 0) + meta.get(k, 0) for k in meta}

            if accumulated_steps % args.gradient_accumulation_steps == 0:

                logger.log_grads_tb(total_iter, model)
                if args.amp:
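                    # Unscale gradients before clipping so the threshold applies to their true magnitudes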
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_thresh)
                    scaler.step(optimizer)
                    scaler.update()
                    #optimizer.zero_grad(set_to_none=True)
                    optimizer.zero_grad()
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_thresh)

                    optimizer.step()
                apply_ema_decay(model, ema_model, args.ema_decay)

                iter_time = time.perf_counter() - iter_start_time
                iter_mel_loss = iter_meta['mel_loss'].item()
                epoch_frames_per_sec += iter_num_frames / iter_time
                epoch_loss += iter_loss
                epoch_num_frames += iter_num_frames
                epoch_mel_loss += iter_mel_loss

                logger.log(
                    (epoch, epoch_iter, num_iters),
                    tb_total_steps=total_iter,
                    subset='train',
                    data=OrderedDict([
                        ('loss', iter_loss), ('mel_loss', iter_mel_loss),
                        ('frames/s', iter_num_frames / iter_time),
                        ('took', iter_time),
                        ('lrate', optimizer.param_groups[0]['lr'])
                    ]),
                )

                accumulated_steps = 0
                iter_loss = 0
                iter_num_frames = 0
                iter_meta = {}

        # Finished epoch
        epoch_time = time.perf_counter() - epoch_start_time

        logger.log(
            (epoch, ),
            tb_total_steps=None,
            subset='train_avg',
            data=OrderedDict([('loss', epoch_loss / epoch_iter),
                              ('mel_loss', epoch_mel_loss / epoch_iter),
                              ('frames/s', epoch_num_frames / epoch_time),
                              ('took', epoch_time)]),
        )

        validate(model,
                 epoch,
                 total_iter,
                 criterion,
                 valset,
                 args.batch_size,
                 collate_fn,
                 distributed_run,
                 batch_to_gpu,
                 use_gt_durations=True)

        if args.ema_decay > 0:
            validate(ema_model,
                     epoch,
                     total_iter,
                     criterion,
                     valset,
                     args.batch_size,
                     collate_fn,
                     distributed_run,
                     batch_to_gpu,
                     use_gt_durations=True,
                     ema=True)

        if (epoch > 0 and args.epochs_per_checkpoint > 0
                and (epoch % args.epochs_per_checkpoint == 0)
                and args.local_rank == 0):

            checkpoint_path = os.path.join(args.output,
                                           f"FastPitch_checkpoint_{epoch}.pt")
            save_checkpoint(args.local_rank, model, ema_model, optimizer,
                            scaler, epoch, total_iter, model_config, args.amp,
                            checkpoint_path)
        logger.flush()

    # Finished training
    logger.log(
        (),
        tb_total_steps=None,
        subset='train_avg',
        data=OrderedDict([('loss', epoch_loss / epoch_iter),
                          ('mel_loss', epoch_mel_loss / epoch_iter),
                          ('frames/s', epoch_num_frames / epoch_time),
                          ('took', epoch_time)]),
    )
    validate(model,
             None,
             total_iter,
             criterion,
             valset,
             args.batch_size,
             collate_fn,
             distributed_run,
             batch_to_gpu,
             use_gt_durations=True)

    if (epoch > 0 and args.epochs_per_checkpoint > 0
            and (epoch % args.epochs_per_checkpoint != 0)
            and args.local_rank == 0):
        checkpoint_path = os.path.join(args.output,
                                       f"FastPitch_checkpoint_{epoch}.pt")
        save_checkpoint(args.local_rank, model, ema_model, optimizer, scaler,
                        epoch, total_iter, model_config, args.amp,
                        checkpoint_path)
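
The native-AMP update used in Example #18 can be reduced to a short, self-contained sketch. This is an illustrative reduction only, not code from the FastPitch repository: the model, optimizer, data shapes and thresholds below are stand-ins, and a CUDA device is assumed.

import torch

# Stand-in model, optimizer and hyper-parameters (illustrative assumptions only)
model = torch.nn.Linear(16, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.MSELoss()
use_amp, accum_steps, clip_thresh = True, 2, 1000.0

scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
optimizer.zero_grad()
for step in range(8):
    x = torch.randn(4, 16, device='cuda')
    y = torch.randn(4, 1, device='cuda')

    # Forward pass in mixed precision; divide the loss for gradient accumulation
    with torch.cuda.amp.autocast(enabled=use_amp):
        loss = criterion(model(x), y) / accum_steps
    scaler.scale(loss).backward()

    if (step + 1) % accum_steps == 0:
        # Unscale before clipping, then step the optimizer and update the scale
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_thresh)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()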
Example #19
0
    del_val_from_train = input(
        'Do you want to remove validation images([y]/n)?\n')

    # del_val_from_train defaults to True; only an explicit 'n' disables it
    if del_val_from_train == 'n':
        del_val_from_train = False
    else:
        del_val_from_train = True
    (x_train, y_train), (x_val, y_val) = get_train_data(del_val_from_train)

    # Determine the image shape from the training data and pass it to the model for reshaping
    image_shape = (x_train.shape[1], x_train.shape[2])

    # Get model configuration
    model_config = get_model_config(model_name)

    # Runs model
    choose_training = input('Choose training type: (normal/[recursive])')
    if choose_training == 'normal':
        scores, model = training(model_config['model_builder'], x_train,
                                 y_train, x_val, y_val, image_shape)
    else:
        scores, model = recursive_training(model_config['model_builder'],
                                           x_train, y_train, x_val, y_val,
                                           image_shape)

    # Saves model
    save_model(model, model_config)
Example #20
0
    # Load the image, resize it to 16x8 and add a leading batch dimension
    img = skio.imread(file_location)
    img = resize(img, (16, 8))
    x_test = np.asarray([img])

    # Predict class probabilities for all 26 letters and pick the most likely one
    prediction = model.predict(x_test, batch_size=32, verbose=0)
    result = np.argmax(prediction, axis=1)
    result = result.tolist()
    # Confidence of the predicted letter (result holds a single index)
    confidence = prediction[0][result]

    result_alphabet = [chr(int(i) + ord('a')) for i in result]
    confidence = Decimal(confidence[0] * 100)

    confidence = Decimal(
        confidence.quantize(Decimal('.01'), rounding=ROUND_HALF_UP))
    return result_alphabet[0], confidence


if __name__ == '__main__':
    file_location = ask_for_file_particulars()
    model_config = get_model_config('larger_CNN')
    result_alphabet, confidence = predict(model_config, file_location)

    print('The model predicts alphabet: {} with {}% confidence'.format(
        result_alphabet, confidence))
Example #21
0
def main():
    """Training of Tacotron2 or WaveGlow model.
    """

    parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training')
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
    else:
        local_rank = args.rank
        world_size = args.world_size

    distributed_run = world_size > 1

    if local_rank == 0:
        if not os.path.exists(args.output):
            os.makedirs(args.output)

        log_fpath = args.log_file or os.path.join(args.output, 'nvlog.json')
        log_fpath = unique_dllogger_fpath(log_fpath)
        init_dllogger(log_fpath)
    else:
        init_dllogger(dummy=True)

    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k:v})
    # DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})

    model_name = args.model_name
    parser = models.parse_model_args(model_name, parser)
    args, _ = parser.parse_known_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    if distributed_run:
        init_distributed(args, world_size, local_rank, args.group_name)

    torch.cuda.synchronize()
    run_start_time = time.perf_counter()

    model_config = models.get_model_config(model_name, args)
    model = models.get_model(model_name, model_config, cpu_run=False,
        uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight)

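    # Without AMP, wrap the model in DDP immediately; with apex AMP, wrap only after amp.initialize below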
    if not args.amp and distributed_run:
        model = DDP(model)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)

    if args.amp:
        # apex.amp.initialize(models, optimizers=None, enabled=True, opt_level='O1', 
        # cast_model_type=None, patch_torch_functions=None, keep_batchnorm_fp32=None, 
        # master_weights=None, loss_scale=None, cast_model_outputs=None, num_losses=1, 
        # verbosity=1, min_loss_scale=None, max_loss_scale=16777216.0)
        # https://nvidia.github.io/apex/amp.html#module-apex.amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if distributed_run:
            model = DDP(model)

    try:
        sigma = args.sigma
    except AttributeError:
        sigma = None