Example #1
def prepare_dataloaders(hparams, epoch=0, valset=None, collate_fn=None):
    # Get data, data loaders and collate function ready

    # prepare train set
    print('preparing train set for epoch {}'.format(epoch))
    shuffle_train = {'shuffle-audiopath': hparams.shuffle_audiopaths,
        'shuffle-batch': hparams.shuffle_batches, 'permute-opt': hparams.permute_opt,
        'pre-batching': hparams.pre_batching}
    trainset = TextMelLoader(hparams.training_files, shuffle_train,
                             hparams, epoch)
    #print('\n'.join(['{}, {}'.format(line[0],line[2]) for line in \
    #                 trainset.audiopaths_and_text[:5]]))
    if valset is None:
        # prepare val set (different shuffle plan compared with train set)
        print('preparing val set for epoch {}'.format(epoch))
        shuffle_val = {'shuffle-audiopath': hparams.shuffle_audiopaths,
            'shuffle-batch': False, 'permute-opt': 'rand', 'pre-batching': False}
        valset = TextMelLoader(hparams.validation_files, shuffle_val, hparams)
    if collate_fn is None:
        collate_fn = {'train': TextMelCollate(hparams, pre_batching=hparams.pre_batching),
                      'val': TextMelCollate(hparams, pre_batching=False)}

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset, shuffle=hparams.shuffle_samples)
    else:
        train_sampler = None

    shuffle = (train_sampler is None) and hparams.shuffle_samples
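    # pre-batching presumably means each dataset item is already a full batch
    # (assumption from the flag name), hence batch_size=1 in that case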
    batch_size = 1 if hparams.pre_batching else hparams.batch_size
    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
        sampler=train_sampler, batch_size=batch_size, pin_memory=False,
        drop_last=True, collate_fn=collate_fn['train'])
    return train_loader, valset, collate_fn
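A minimal sketch of consuming the tuple returned by this example, assuming the same hparams object; the validation loader simply reuses valset together with the pre-batching-free collate_fn['val'], along the lines of the validation loaders in later examples:

train_loader, valset, collate_fn = prepare_dataloaders(hparams, epoch=0)
val_loader = DataLoader(valset,
                        num_workers=1,
                        shuffle=False,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        collate_fn=collate_fn['val'])
for batch in train_loader:
    pass  # forward/backward pass goes here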
Example #2
def prepare_dataloaders(hparams, audio_offset=0):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files,
                             hparams,
                             TBPTT=False,
                             check_files=False,
                             verbose=True,
                             audio_offset=audio_offset)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset, shuffle=False)
        shuffle = False
    else:
        train_sampler = None
        shuffle = False

    train_loader = DataLoader(
        trainset,
        num_workers=0,
        shuffle=shuffle,
        sampler=train_sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,  # default pin_memory=False; True should allow async memory transfers but caused sporadic CUDA errors after ~4+ hours
        drop_last=True,
        collate_fn=collate_fn)
    return train_loader, None, collate_fn, train_sampler, trainset
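Since this variant also returns the sampler, a distributed training loop would typically advance it every epoch; a minimal sketch assuming the return values above (set_epoch only has an effect when the sampler is created with shuffle=True):

train_loader, _, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams)
for epoch in range(hparams.epochs):
    if train_sampler is not None:
        train_sampler.set_epoch(epoch)  # keeps per-epoch shuffling consistent across ranks
    for batch in train_loader:
        pass  # training step goes here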
Example #3
def prepare_dataloaders(hparams, saved_lookup):
    # Get data, data loaders and collate function ready
    speaker_ids = saved_lookup if hparams.use_saved_speakers else None
    trainset = TextMelLoader(hparams.training_files,
                             hparams,
                             check_files=hparams.check_files,
                             shuffle=False,
                             speaker_ids=speaker_ids)
    valset = TextMelLoader(hparams.validation_files,
                           hparams,
                           check_files=hparams.check_files,
                           shuffle=False,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate()

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset, shuffle=False)  #True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = False  #True

    train_loader = DataLoader(trainset,
                              num_workers=num_workers_,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler, trainset
Example #4
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    # valset = TextMelLoader(hparams.validation_files, hparams)
    valset = None
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # train_sampler = DistributedSampler(trainset) \
    #     if hparams.distributed_run else None
    #
    # train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
    #                           sampler=train_sampler,
    #                           batch_size=hparams.batch_size, pin_memory=False,
    #                           drop_last=True, collate_fn=collate_fn)
    # return train_loader, valset, collate_fn

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #5
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams, warp_set="og")
    trainset_aug_time = TextMelLoader(hparams.training_files, hparams, warp_set="time")
    trainset_aug_freq = TextMelLoader(hparams.training_files, hparams, warp_set="freq")

    valset = TextMelLoader(hparams.validation_files, hparams, warp_set="og")
    valset_aug_time = TextMelLoader(hparams.validation_files, hparams, warp_set="time")
    valset_aug_freq = TextMelLoader(hparams.validation_files, hparams, warp_set="freq")

    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_aug_set = torch.utils.data.ConcatDataset([trainset, trainset_aug_time, trainset_aug_freq])
    train_sampler = DistributedSampler(train_aug_set) if hparams.distributed_run else None

    if hparams.distributed_run:
        train_loader = DataLoader(train_aug_set,
                                  num_workers=35, shuffle=False,
                                  sampler=train_sampler,
                                  batch_size=hparams.batch_size, pin_memory=False,
                                  drop_last=True, collate_fn=collate_fn)
    else:
        train_loader = DataLoader(train_aug_set,
                                  num_workers=35, shuffle=True,
                                  sampler=None,
                                  batch_size=hparams.batch_size, pin_memory=False,
                                  drop_last=True, collate_fn=collate_fn)

    del trainset, trainset_aug_time, trainset_aug_freq

    return train_loader, valset, valset_aug_time, valset_aug_freq, collate_fn
Example #6
def prepare_dataloaders(hparams, output_directory):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files,
                             hparams,
                             output_directory=output_directory)
    valset = TextMelLoader(hparams.validation_files,
                           hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler
Example #7
def prepare_single_dataloaders(hparams, output_directory):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(
        'filelists/grapheme/grapheme_selvas_main_train.txt',
        hparams,
        output_directory=output_directory)
    # debugging purpose
    # trainset = TextMelLoader('filelists/selvas_main_valid.txt', hparams, output_directory=output_directory)
    valset = TextMelLoader('filelists/grapheme/main_valid_and_test.txt',
                           hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler
Example #8
def prepare_dataloaders(input_directory, hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(os.path.join(input_directory, 'train.txt'),
                             hparams,
                             mode=hparams.train_mode)
    valset = TextMelLoader(os.path.join(input_directory, 'validation.txt'),
                           hparams,
                           speaker_ids=trainset.speaker_ids,
                           mode=hparams.train_mode)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=hparams.dataloader_num_workers,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler
Example #9
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    warp_trainset = TextMelLoader(hparams.training_files,
                                  hparams,
                                  warp_set=True)

    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None
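    # note: the sampler built above is not passed to the active DataLoader call
    # below (it uses sampler=None with shuffle=True); the sampler-based variant
    # is kept in the commented-out block underneath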

    train_loader = DataLoader(torch.utils.data.ConcatDataset(
        [trainset, warp_trainset]),
                              num_workers=1,
                              shuffle=True,
                              sampler=None,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    # train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
    #                           sampler=train_sampler,
    #                           batch_size=hparams.batch_size, pin_memory=False,
    #                           drop_last=True, collate_fn=collate_fn)

    return train_loader, valset, collate_fn
Example #10
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files,
                             hparams.polyphone_dict_files,
                             hparams.mask_dict_files, hparams)
    valset = TextMelLoader(hparams.validation_files,
                           hparams.polyphone_dict_files,
                           hparams.mask_dict_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                hparams.n_pinyin_symbols)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #11
def style_transfer_v2():
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)
    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))

    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
Example #12
def train_and_eval(rank, n_gpus, hps):
  global global_step
  if rank == 0:
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)
    writer = SummaryWriter(log_dir=hps.model_dir)
    writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

  dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
  torch.manual_seed(hps.train.seed)
  torch.cuda.set_device(rank)

  train_dataset = TextMelLoader(hps.data.training_files, hps.data)
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=n_gpus,
      rank=rank,
      shuffle=True)
  collate_fn = TextMelCollate(1)
  train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
      batch_size=hps.train.batch_size, pin_memory=True,
      drop_last=True, collate_fn=collate_fn, sampler=train_sampler)
  if rank == 0:
    val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
    val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
        batch_size=hps.train.batch_size, pin_memory=True,
        drop_last=True, collate_fn=collate_fn)

  generator = models.FlowGenerator(
      n_vocab=len(symbols), 
      out_channels=hps.data.n_mel_channels, 
      **hps.model).cuda(rank)
  optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
  if hps.train.fp16_run:
    generator, optimizer_g._optim = amp.initialize(generator, optimizer_g._optim, opt_level="O1")
  generator = DDP(generator)
  epoch_str = 1
  global_step = 0
  try:
    _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, optimizer_g)
    epoch_str += 1
    optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
    optimizer_g._update_learning_rate()
    global_step = (epoch_str - 1) * len(train_loader)
  except:
    if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")):
      _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g)
  
  for epoch in range(epoch_str, hps.train.epochs + 1):
    if rank==0:
      train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer)
      evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval)
      if epoch%50 == 0:
        utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
    else:
      train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None)
Example #13
def load_dataloader(hparams, audio_path):
    if not hparams.episodic_training:
        dataloader = TextMelLoader(audio_path, hparams)
        datacollate = TextMelCollate(1)
    else:
        dataloader = EpisodicLoader(audio_path, hparams)
        datacollate = EpisodicCollater(1, hparams)
    
    return dataloader, datacollate
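The returned pair is used elsewhere in this list to collate a single utterance into a model-ready batch; a minimal sketch assuming a Mellotron-style model with a parse_batch method, as in the style-transfer example above:

dataloader, datacollate = load_dataloader(hparams, audio_path)
file_idx = 0
x, y = model.parse_batch(datacollate([dataloader[file_idx]]))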
Example #14
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = train_ema()
    valset = test_ema()

    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              batch_size=hparams.batch_size, pin_memory=False,
                              collate_fn=collate_fn)

    return train_loader, valset, collate_fn
Example #15
def synthesize2(model,
                audio_path,
                text,
                source_speaker_id,
                target_speaker_id,
                outname="sample.wav"):
    tacotron, waveglow, denoiser = model
    with open('temp.txt', 'w') as f:
        f.write(f"{audio_path}|{text}|{source_speaker_id}")
    arpabet_dict = cmudict.CMUDict('mellotron/data/cmu_dictionary')
    hparams = create_hparams()
    dataloader = TextMelLoader("temp.txt", hparams)
    datacollate = TextMelCollate(1)

    file_idx = 0
    audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cpu()
    pitch_contour = dataloader[file_idx][3][None].cpu()
    mel = load_mel(audio_path)
    print(audio_path, text)

    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

    # For changing the pitch
    pitch_contour2 = pitch_contour.data.cpu().numpy().copy()
    #pitch_contour2[pitch_contour2 > 0] -= 45.
    #pitch_contour2[pitch_contour2 > 0] = 150.
    pitch_contour2 = torch.Tensor(pitch_contour2).cpu()

    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)

    speaker_id = torch.LongTensor([target_speaker_id]).cpu()

    sampling_rate = 22050

    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour2, rhythm))
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.66),
                         0.03)[0, 0]
        audio = audio.cpu().numpy()
        pan = 0
        audio = panner(audio, pan)
        write(outname, sampling_rate, audio)
    os.remove("temp.txt")
Example #16
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams['training_files'], hparams)
    valset = TextMelLoader(hparams['validation_files'], hparams)
    collate_fn = TextMelCollate(hparams['n_frames_per_step'])

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=True,
                              sampler=None,
                              batch_size=hparams['batch_size'],
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #17
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()

    test_loader = DataLoader(testset,
                             num_workers=0,
                             shuffle=False,
                             sampler=None,
                             batch_size=1,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
            batch)
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            '''if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE'''
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
Example #18
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_lst, hparams)
    valset = TextMelLoader(hparams.validation_lst, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #19
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelSet(hparams.training_files, hparams)
    valset = TextMelSet(hparams.validation_files, hparams)
    collate_fn = TextMelCollate()

    train_loader = DataLoader(trainset,
                              num_workers=3,
                              shuffle=True,
                              batch_size=hparams.batch_size,
                              drop_last=True,
                              collate_fn=collate_fn)

    val_loader = DataLoader(valset,
                            batch_size=hparams.batch_size,
                            collate_fn=collate_fn)

    return train_loader, val_loader, collate_fn
Example #20
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(
        hparams.training_files, hparams
    )  # trainset.__getitem__(index) = (text, mel), text in [num_char], mel in [num_mel, ceil((len(audio)+1)/hop_length)]
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)  #
    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, trainset
Example #21
def main():
    hps = utils.get_hparams()
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)

    torch.manual_seed(hps.train.seed)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset,
                              num_workers=8,
                              shuffle=True,
                              batch_size=hps.train.batch_size,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate_fn)

    generator = FlowGenerator_DDI(speaker_dim=hps.model.speaker_embedding,
                                  n_vocab=len(symbols),
                                  out_channels=hps.data.n_mel_channels,
                                  **hps.model).cuda()
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)

    generator.train()
    for batch_idx, (x, x_lengths, y, y_lengths,
                    speaker_embedding) in enumerate(train_loader):
        x, x_lengths = x.cuda(), x_lengths.cuda()
        y, y_lengths = y.cuda(), y_lengths.cuda()
        speaker_embedding = speaker_embedding.cuda()

        _ = generator(x, x_lengths, speaker_embedding, y, y_lengths, gen=False)
        break

    utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, 0,
                          os.path.join(hps.model_dir, "ddi_G.pth"))
Example #22
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader('train', hparams)
    valset = TextMelLoader('val', hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step, hparams)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
    else:
        l = trainset.get_lengths()
        train_sampler = PartialyRandomizedSimilarTimeLengthSampler(
            l, batch_size=hparams.batch_size * (sum(l) / len(l)))

    train_loader = DataLoader(trainset, num_workers=24,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=1,
                              pin_memory=False,
                              drop_last=False,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #23
def prepare_dataloaders(hparams):
    # Get data, data loaders and collate function ready
    ## "if not" usage: the branch runs only when the condition is falsy (i.e. "if not False" executes)
    if not hparams.load_mel_from_disk:
        trainset = TextMelLoader(hparams.training_files,
                                 hparams.polyphone_dict_files,
                                 hparams.mask_dict_files, hparams)
        valset = TextMelLoader(hparams.validation_files,
                               hparams.polyphone_dict_files,
                               hparams.mask_dict_files, hparams)
    else:
        trainset = TextMelLoader(hparams.mel_training_files,
                                 hparams.polyphone_dict_files,
                                 hparams.mask_dict_files, hparams)
        valset = TextMelLoader(hparams.mel_validation_files,
                               hparams.polyphone_dict_files,
                               hparams.mask_dict_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step, hparams.num_classes)

    if hparams.distributed_run:  ##False
        train_sampler = DistributedSampler(trainset)
        ## For multi-machine, multi-GPU distributed training, each GPU should read different data;
        ## the sampler ensures each dataloader only loads a specific subset of the whole dataset.
        ## It carves out a partition of the dataset for every process to avoid duplicated data across processes.
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    ## Build an iterable data loader
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    ## dataset (a Dataset instance deciding where and how data is read)  batch_size (samples per batch)  shuffle (whether to reshuffle every epoch)
    ## num_workers (number of subprocesses used for loading)  drop_last (whether to drop the final incomplete batch when the sample count is not divisible by batch_size)
    return train_loader, valset, collate_fn
Example #24
def prepare_dataloaders(experiment, hparams, requires_durations):
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader("train", experiment, hparams, requires_durations)
    valset = TextMelLoader("valid", experiment, hparams, requires_durations)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, trainset, valset, collate_fn
Example #25
def main():
  hps = utils.get_hparams()
  logger = utils.get_logger(hps.model_dir)
  logger.info(hps)
  utils.check_git_hash(hps.model_dir)

  torch.manual_seed(hps.train.seed)

  train_dataset = TextMelLoader(hps.data.training_files, hps.data)
  collate_fn = TextMelCollate(1)
  train_loader = DataLoader(train_dataset, num_workers=8, shuffle=True,
      batch_size=hps.train.batch_size, pin_memory=True,
      drop_last=True, collate_fn=collate_fn)

  generator = FlowGenerator_DDI(
      len(symbols), 
      out_channels=hps.data.n_mel_channels,
      **hps.model).cuda()
  optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
   
  generator.train()
  for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader):
    x, x_lengths = x.cuda(), x_lengths.cuda()
    y, y_lengths = y.cuda(), y_lengths.cuda()

    _ = generator(x, x_lengths, y, y_lengths, gen=False)
    break

  # check for a pretrained checkpoint and load it without an optimizer
  pretrained_checkpoint_path = os.path.join(hps.model_dir, "pretrained.pth")
  if os.path.isfile(pretrained_checkpoint_path):
    logger.info("Loading pretrained checkpoint: %s" % pretrained_checkpoint_path)
    model, optimizer, learning_rate, iteration = utils.load_checkpoint(pretrained_checkpoint_path, generator)
    utils.save_checkpoint(model, optimizer_g, hps.train.learning_rate, 0, os.path.join(hps.model_dir, "ddi_G.pth"))
  else:
    utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, 0, os.path.join(hps.model_dir, "ddi_G.pth"))
Example #26
hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
speaker = "nes"
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set,
                        num_workers=1,
                        shuffle=False,
                        batch_size=1,
                        pin_memory=False,
                        drop_last=False,
                        collate_fn=datacollate)
speaker_ids = TextMelLoader(hparams.training_files, hparams).speaker_ids
speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

pytorch_total_params = sum(p.numel() for p in model.parameters())
print("total_num_params:  {}".format(pytorch_total_params))
waveglow_total_params = sum(p.numel() for p in waveglow.parameters())
print("waveglow_num_params:  {}".format(waveglow_total_params))
for i, batch in enumerate(dataloader):
    pass  # loop body not included in this snippet
Example #27
def load_dataloader(hparams, audio_path):
    dataloader = TextMelLoader(audio_path, hparams)
    datacollate = TextMelCollate(1)
    return dataloader, datacollate
Example #28
def infer(output_directory, checkpoint_path, warm_start, hparams, debug=False):
    """Inference with teaching force

    Params
    ------
    output_directory (string): directory to the spectrograms
    checkpoint_path(string): checkpoint path
    hparams (object): comma separated list of "name=value" pairs.
    """

    os.makedirs(output_directory, exist_ok=True)
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    return_file_name = True

    trainset = TextMelLoader(hparams.training_files,
                             hparams,
                             return_file_name=return_file_name)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                return_file_name=return_file_name)

    train_sampler = None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              collate_fn=collate_fn)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.eval()

    for i, batch in enumerate(train_loader):
        x, y = model.parse_batch(batch[:][:-1])
        files_name = batch[:][-1]
        mel_outputs, mel_outputs_postnet, _, alignments = model(x)

        _, _, mel_expected_padded, _, mel_lengths = x

        for idx in range(mel_outputs_postnet.size(0)):

            name = os.path.basename(files_name[idx]).replace(".wav", '')
            mel_padded = mel_outputs_postnet[idx]
            mel_length = mel_lengths[idx]
            mel = mel_padded[:, :mel_length]
            np.save(os.path.join(output_directory, name + '.npy'),
                    mel.detach().cpu().numpy())

            if debug:
                print(
                    "Debug Mode ON: Saving Wave files and Spectrograms Plot in:",
                    output_directory)
                # plot audios
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '.wav'),
                    spec_to_waveform(taco_stft, mel).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '_padded.wav'),
                    spec_to_waveform(taco_stft,
                                     mel_padded).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory,
                                 name + '_expected_padded.wav'),
                    spec_to_waveform(
                        taco_stft,
                        mel_expected_padded[idx]).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                # plot figures
                plot_spectrogram(mel.detach().cpu().numpy())
                plot_spectrogram(
                    mel_padded.detach().cpu().numpy(),
                    os.path.join(output_directory, name + '_padded.png'))
                plot_spectrogram(
                    mel_expected_padded[idx].detach().cpu().numpy(),
                    os.path.join(output_directory,
                                 name + '_expect_padded.png'))
Example #29
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path, hparams):
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END:   ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hparams.sigma)
    model = WaveGlow(hparams).cuda()

    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END:   ADDED FOR DISTRIBUTED======

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = TextMelLoader(hparams.training_files, hparams)
    collate_fn = TextMelCollate()
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END:   ADDED FOR DISTRIBUTED======
    batch_size = hparams.batch_size
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    # Get shared output_directory ready

    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hparams.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    print("Total Epochs: {}".format(hparams.epochs))
    print("Batch Size: {}".format(hparams.batch_size))
    print("learning rate: {}".format(hparams.learning_rate))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
                batch)
            with torch.no_grad():
                enc_outputs, alignments = Taco2(
                    (text_padded, input_lengths, mel_padded, max_len,
                     output_lengths))

            # mel_padded = mel_padded.transpose(1, 2)
            # mel_padded = mel_padded / torch.abs(mel_padded).max().item()
            mel_pos = torch.arange(1000)
            mel_pos = to_gpu(mel_pos).long().unsqueeze(0)
            mel_pos = mel_pos.expand(hparams.batch_size, -1)
            src_pos = torch.arange(hparams.n_position)
            src_pos = to_gpu(src_pos).long().unsqueeze(0)
            src_pos = src_pos.expand(hparams.batch_size, -1)

            mel_padded = (mel_padded + 5) / 10

            z, log_s_list, log_det_w_list, dec_enc_attn = model(
                mel_padded, enc_outputs, mel_pos, src_pos, input_lengths)
            outputs = (z, log_s_list, log_det_w_list, dec_enc_attn)
            loss = criterion(outputs, alignments)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hparams.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    iteration)

            if (iteration % hparams.iters_per_checkpoint == 0):
                if rank == 0:
                    mel_predict, test_attn = model.test(
                        mel_padded, enc_outputs, mel_pos, src_pos,
                        input_lengths)
                    logger.log_alignment(model, dec_enc_attn, alignments,
                                         mel_padded, mel_predict, test_attn,
                                         iteration)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #30
def prepare_dataloaders(hparams):
    if not hparams.episodic_training:
        # Get data, data loaders and collate function ready
        trainset = TextMelLoader(hparams.training_files, hparams)
        valset = TextMelLoader(hparams.validation_files,
                               hparams,
                               speaker_ids=trainset.speaker_ids)
        collate_fn = TextMelCollate(hparams.n_frames_per_step)

        if hparams.distributed_run:
            train_sampler = DistributedSampler(trainset)
            val_sampler = DistributedSampler(valset)
            #val_sampler = None
            shuffle = False
        else:
            train_sampler = None
            val_sampler = None
            shuffle = True

        train_loader = DataLoader(trainset,
                                  num_workers=2,
                                  shuffle=shuffle,
                                  sampler=train_sampler,
                                  batch_size=hparams.batch_size,
                                  pin_memory=False,
                                  drop_last=True,
                                  collate_fn=collate_fn)
        val_loader = DataLoader(valset,
                                sampler=val_sampler,
                                num_workers=1,
                                shuffle=False,
                                batch_size=hparams.batch_size,
                                pin_memory=False,
                                collate_fn=collate_fn)
    else:
        trainset = EpisodicLoader(hparams.training_files, hparams)
        valset = EpisodicLoader(hparams.validation_files, hparams)
        collate_fn = EpisodicCollater(hparams.n_frames_per_step, hparams)
        #        if hparams.distributed_run:
        #            train_sampler = DistributedSampler(trainset)
        #            shuffle = False
        #        else:
        #            train_sampler = None
        #            shuffle = True

        #        train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
        #                sampler=None, batch_size=hparams.batch_size, pin_memory=False,
        #                drop_last=True, collate_fn=collate_fn)

        if hparams.distributed_run:
            train_sampler = DistributedEpisodicSampler(trainset.sid_to_index,
                                                       hparams,
                                                       shuffle=True)
            val_sampler = DistributedEpisodicSampler(valset.sid_to_index,
                                                     hparams,
                                                     shuffle=False)
        else:
            train_sampler = EpisodicBatchSampler(trainset.sid_to_index,
                                                 hparams,
                                                 shuffle=True)
            val_sampler = EpisodicBatchSampler(valset.sid_to_index,
                                               hparams,
                                               shuffle=False)
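        # these samplers yield complete batches of indices, so they are passed as
        # batch_sampler below and the DataLoader gets no batch_size, shuffle,
        # sampler or drop_last arguments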

        train_loader = DataLoader(trainset,
                                  num_workers=1,
                                  batch_sampler=train_sampler,
                                  pin_memory=False,
                                  collate_fn=collate_fn)
        val_loader = DataLoader(valset,
                                num_workers=1,
                                batch_sampler=val_sampler,
                                pin_memory=False,
                                collate_fn=collate_fn)

    #return train_loader, valset, collate_fn, train_sampler
    return train_loader, train_sampler, val_loader, val_sampler