Example #1
0
def prepare_dataloaders(input_directory, hparams):
    """Build the training DataLoader plus validation set and collate fn.

    Reads train.txt / validation.txt from `input_directory` and returns
    (train_loader, valset, collate_fn, train_sampler).
    """
    train_path = os.path.join(input_directory, 'train.txt')
    val_path = os.path.join(input_directory, 'validation.txt')

    trainset = TextMelLoader(train_path, hparams, mode=hparams.train_mode)
    # Validation reuses the speaker-id mapping discovered on the train split.
    valset = TextMelLoader(val_path,
                           hparams,
                           speaker_ids=trainset.speaker_ids,
                           mode=hparams.train_mode)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Under DDP the sampler shards and shuffles; the loader must not shuffle.
    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=hparams.dataloader_num_workers,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler
Example #2
0
def prepare_dataloaders(hparams, output_directory):
    """Build the training DataLoader plus validation set and collate fn.

    Returns (train_loader, valset, collate_fn, train_sampler).
    """
    trainset = TextMelLoader(hparams.training_files, hparams,
                             output_directory=output_directory)
    # Validation reuses the speaker-id table built from the train split.
    valset = TextMelLoader(hparams.validation_files, hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Distributed runs shard via the sampler; loader-side shuffle only
    # applies when there is no sampler.
    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=sampler is None,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, sampler
Example #3
0
def prepare_dataloaders(hparams):
    """Build the training DataLoader plus validation set and collate fn.

    Returns (train_loader, valset, collate_fn, train_sampler). Uses the
    module-level `num_workers_` for the loader worker count.
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    valset = TextMelLoader(hparams.validation_files,
                           hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        # Sampler shuffles per epoch under DDP; the loader then must not.
        sampler = DistributedSampler(trainset, shuffle=True)
    else:
        sampler = None

    train_loader = DataLoader(
        trainset,
        num_workers=num_workers_,
        shuffle=sampler is None,
        sampler=sampler,
        batch_size=hparams.batch_size,
        # default pin_memory=False; True should allow async memory transfers
        # but causes very random CUDA errors (after like 4+ hours)
        pin_memory=False,
        drop_last=True,
        collate_fn=collate_fn)
    return train_loader, valset, collate_fn, sampler
Example #4
0
def prepare_dataloaders(hparams, saved_lookup):
    """Build train loader and val set, optionally reusing a saved speaker table.

    Returns (train_loader, valset, collate_fn, train_sampler, trainset).
    Shuffling is deliberately disabled everywhere (dataset, sampler, loader).
    """
    # Only reuse the persisted speaker-id lookup when configured to.
    speaker_ids = saved_lookup if hparams.use_saved_speakers else None
    trainset = TextMelLoader(hparams.training_files, hparams,
                             check_files=hparams.check_files,
                             shuffle=False,
                             speaker_ids=speaker_ids)
    valset = TextMelLoader(hparams.validation_files, hparams,
                           check_files=hparams.check_files,
                           shuffle=False,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate()

    train_sampler = DistributedSampler(trainset, shuffle=False) \
        if hparams.distributed_run else None
    # Both branches of the original set shuffle=False, so it is constant.
    shuffle = False

    train_loader = DataLoader(trainset,
                              num_workers=num_workers_,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler, trainset
Example #5
0
def prepare_dataloaders(hparams, epoch=0, valset=None, collate_fn=None):
    """Build the train loader for `epoch`; build valset/collate_fn once.

    `valset` and `collate_fn` are rebuilt only when passed as None, so
    callers can reuse them across epochs. Returns
    (train_loader, valset, collate_fn) where collate_fn is a
    {'train': ..., 'val': ...} dict.
    """
    # prepare train set with the shuffle plan taken from hparams
    print('preparing train set for epoch {}'.format(epoch))
    train_plan = {
        'shuffle-audiopath': hparams.shuffle_audiopaths,
        'shuffle-batch': hparams.shuffle_batches,
        'permute-opt': hparams.permute_opt,
        'pre-batching': hparams.pre_batching,
    }
    trainset = TextMelLoader(hparams.training_files, train_plan,
                             hparams, epoch)

    if valset is None:
        # val set uses a fixed plan (no batch shuffle, random permute,
        # no pre-batching) — different from the train set
        print('preparing val set for epoch {}'.format(epoch))
        val_plan = {
            'shuffle-audiopath': hparams.shuffle_audiopaths,
            'shuffle-batch': False,
            'permute-opt': 'rand',
            'pre-batching': False,
        }
        valset = TextMelLoader(hparams.validation_files, val_plan, hparams)

    if collate_fn is None:
        collate_fn = {
            'train': TextMelCollate(hparams, pre_batching=hparams.pre_batching),
            'val': TextMelCollate(hparams, pre_batching=False),
        }

    sampler = DistributedSampler(trainset, shuffle=hparams.shuffle_samples) \
        if hparams.distributed_run else None

    # With pre-batching, each dataset item is already a full batch.
    loader_batch = 1 if hparams.pre_batching else hparams.batch_size
    train_loader = DataLoader(trainset, num_workers=1,
                              shuffle=sampler is None and hparams.shuffle_samples,
                              sampler=sampler, batch_size=loader_batch,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn['train'])
    return train_loader, valset, collate_fn
Example #6
0
def prepare_dataloaders(hparams):
    """Build a training loader over the original + warped train sets.

    Returns (train_loader, valset, collate_fn).
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    warp_trainset = TextMelLoader(hparams.training_files,
                                  hparams,
                                  warp_set=True)

    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Train on the original set concatenated with its warped copy.
    full_trainset = torch.utils.data.ConcatDataset([trainset, warp_trainset])

    # BUG FIX: the sampler used to be created but never passed to the
    # DataLoader, so distributed runs would feed every rank the full
    # (duplicated) dataset. It is now built over the concatenated dataset
    # actually being loaded, and loader-side shuffle is disabled whenever
    # the sampler (which shuffles per epoch) is in use.
    train_sampler = DistributedSampler(full_trainset) \
        if hparams.distributed_run else None

    train_loader = DataLoader(full_trainset,
                              num_workers=1,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    return train_loader, valset, collate_fn
Example #7
0
def prepare_single_dataloaders(hparams, output_directory):
    """Build loaders for the hard-coded grapheme Selvas filelists.

    Returns (train_loader, valset, collate_fn, train_sampler).
    """
    trainset = TextMelLoader(
        'filelists/grapheme/grapheme_selvas_main_train.txt',
        hparams,
        output_directory=output_directory)
    # For debugging, the smaller valid filelist can be swapped in as the
    # train set instead.
    valset = TextMelLoader('filelists/grapheme/main_valid_and_test.txt',
                           hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Sampler shards the data under DDP; shuffle only without a sampler.
    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=sampler is None,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, sampler
Example #8
0
def prepare_dataloaders(hparams):
    """Build train loader / val set for the polyphone-dict variant.

    Returns (train_loader, valset, collate_fn).
    """
    loader_args = (hparams.polyphone_dict_files, hparams.mask_dict_files,
                   hparams)
    trainset = TextMelLoader(hparams.training_files, *loader_args)
    valset = TextMelLoader(hparams.validation_files, *loader_args)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                hparams.n_pinyin_symbols)

    # Sampler shards the data under DDP; shuffle only without a sampler.
    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=sampler is None,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #9
0
def train_and_eval(rank, n_gpus, hps):
  """Per-process DDP training loop for the FlowGenerator.

  Args:
    rank: this process's GPU/process index. Rank 0 additionally owns
      logging, TensorBoard writers, evaluation and checkpointing.
    n_gpus: world size for distributed training.
    hps: hyper-parameter namespace (expects .train, .data, .model,
      .model_dir).
  """
  global global_step
  if rank == 0:
    # Only rank 0 logs and writes TensorBoard summaries.
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)
    writer = SummaryWriter(log_dir=hps.model_dir)
    writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

  dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
  torch.manual_seed(hps.train.seed)
  torch.cuda.set_device(rank)

  train_dataset = TextMelLoader(hps.data.training_files, hps.data)
  train_sampler = torch.utils.data.distributed.DistributedSampler(
      train_dataset,
      num_replicas=n_gpus,
      rank=rank,
      shuffle=True)
  collate_fn = TextMelCollate(1)
  # Loader-side shuffle stays off: the DistributedSampler shuffles per epoch.
  train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
      batch_size=hps.train.batch_size, pin_memory=True,
      drop_last=True, collate_fn=collate_fn, sampler=train_sampler)
  if rank == 0:
    val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
    val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
        batch_size=hps.train.batch_size, pin_memory=True,
        drop_last=True, collate_fn=collate_fn)

  generator = models.FlowGenerator(
      n_vocab=len(symbols), 
      out_channels=hps.data.n_mel_channels, 
      **hps.model).cuda(rank)
  optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
  if hps.train.fp16_run:
    generator, optimizer_g._optim = amp.initialize(generator, optimizer_g._optim, opt_level="O1")
  generator = DDP(generator)
  epoch_str = 1
  global_step = 0
  try:
    # Resume from the newest G_*.pth checkpoint when one exists, restoring
    # the LR schedule position and global step to match.
    _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), generator, optimizer_g)
    epoch_str += 1
    optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
    optimizer_g._update_learning_rate()
    global_step = (epoch_str - 1) * len(train_loader)
  except Exception:
    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit. On resume failure, fall back to data-dependent-init
    # weights if configured and present.
    if hps.train.ddi and os.path.isfile(os.path.join(hps.model_dir, "ddi_G.pth")):
      _ = utils.load_checkpoint(os.path.join(hps.model_dir, "ddi_G.pth"), generator, optimizer_g)
  
  for epoch in range(epoch_str, hps.train.epochs + 1):
    if rank==0:
      train(rank, epoch, hps, generator, optimizer_g, train_loader, logger, writer)
      evaluate(rank, epoch, hps, generator, optimizer_g, val_loader, logger, writer_eval)
      if epoch%50 == 0:
        utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, epoch, os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
    else:
      # Non-zero ranks train without logging/eval/checkpointing.
      train(rank, epoch, hps, generator, optimizer_g, train_loader, None, None)
Example #10
0
def prepare_dataloaders(hparams):
    """Build loaders for the dict-style hparams variant (no DDP support).

    Returns (train_loader, valset, collate_fn).
    """
    trainset = TextMelLoader(hparams['training_files'], hparams)
    valset = TextMelLoader(hparams['validation_files'], hparams)
    collate_fn = TextMelCollate(hparams['n_frames_per_step'])

    # Single-process training: plain shuffled loader, no sampler.
    train_loader = DataLoader(trainset,
                              batch_size=hparams['batch_size'],
                              shuffle=True,
                              sampler=None,
                              num_workers=1,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #11
0
def process_one(index, skip_existing=False):
    """Preprocess one utterance into text/mel/speaker/f0 .npy files.

    Writes output_dir/npy/<formatted index>/{text,mel,speaker,f0}.npy and
    returns `index`; returns None early when `skip_existing` is set and all
    four files already exist. Relies on the module-level globals below
    (intended for pool workers that initialize the loader lazily).
    """
    global text_mel_loader
    global metadata_path
    global output_dir
    if text_mel_loader is None:
        # One-time init per process; also persist the speaker-id mapping
        # next to the outputs.
        text_mel_loader = TextMelLoader(metadata_path,
                                        hparams=hp,
                                        mode='preprocess')
        ids_path = output_dir.joinpath('speaker_ids.json')
        # BUG FIX: the file handle was previously opened inline and never
        # closed (and the `fpath` name was later shadowed by the f0 path);
        # use a context manager and a distinct name.
        with open(ids_path, 'wt', encoding='utf8') as fout:
            json.dump(text_mel_loader.speaker_ids,
                      fout,
                      indent=4,
                      ensure_ascii=False)

    onedir = output_dir.joinpath('npy', format_index(index))
    onedir.mkdir(exist_ok=True, parents=True)
    tpath = onedir.joinpath("text.npy")
    mpath = onedir.joinpath("mel.npy")
    spath = onedir.joinpath("speaker.npy")
    fpath = onedir.joinpath("f0.npy")

    if skip_existing and all(
            f.is_file() for f in [tpath, mpath, spath, fpath]):
        return

    text, mel, speaker_id, f0 = text_mel_loader[index]

    np.save(tpath, text.numpy(), allow_pickle=False)
    np.save(mpath, mel.numpy(), allow_pickle=False)
    np.save(spath, speaker_id.numpy(), allow_pickle=False)
    np.save(fpath, f0.numpy(), allow_pickle=False)
    return index
Example #12
0
def prepare_dataloaders(hparams):
    """Build the training loader; validation is disabled here.

    Returns (train_loader, None, collate_fn) — valset is intentionally None.
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    valset = None  # validation split deliberately not loaded
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Sampler shards the data under DDP; shuffle only without a sampler.
    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=sampler is None,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #13
0
def style_transfer_v2():
    """Demo: transfer rhythm and pitch contour from a reference clip onto a
    randomly chosen speaker with Mellotron, then vocode with MelGAN and
    WaveGlow and play/plot the results.

    NOTE(review): relies on module-level globals (hparams, arpabet_dict,
    mellotron, waveglow, denoiser, female_speakers, male_speakers, ipd,
    aukit, ...) and on CUDA being available — confirm against the full file.
    """
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)
    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))

    # Hard-coded choice of reference utterance from the filelist.
    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    # item[3] is presumably the f0/pitch track — confirm against the loader
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)

    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)

    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(
            x)
        rhythm = rhythm.permute(1, 0, 2)
    # Pick a random target speaker: 50/50 female vs male pool.
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()

    # Re-synthesize with the reference rhythm/pitch but the new speaker.
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = mellotron.inference_noattention(
            (text_encoded, mel, speaker_id, pitch_contour, rhythm))

    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()

    # Vocoder pass 1: MelGAN (timed), played at 22.05 kHz.
    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)

    # Vocoder pass 2: WaveGlow + denoiser (timed).
    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
Example #14
0
def prepare_dataloaders(hparams):
    """Build train loader / val set from the .lst filelists.

    Returns (train_loader, valset, collate_fn).
    """
    trainset = TextMelLoader(hparams.training_lst, hparams)
    valset = TextMelLoader(hparams.validation_lst, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        sampler = DistributedSampler(trainset)
    else:
        sampler = None

    # NOTE: shuffle stays False even without a sampler (sequential order).
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #15
0
def prepare_dataloaders(hparams):
    """Build train loader / val set; also returns the train dataset itself.

    trainset[i] yields (text, mel): text is [num_char]; mel is
    [num_mel, ceil((len(audio)+1)/hop_length)].
    Returns (train_loader, valset, collate_fn, trainset).
    """
    trainset = TextMelLoader(hparams.training_files, hparams)
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    # NOTE: shuffle stays False even without a sampler (sequential order).
    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=False,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn, trainset
Example #16
0
def load_dataloader(hparams, audio_path):
    """Return a (dataset, collate) pair matching the training regime.

    Episodic training gets the episodic loader/collater; otherwise the
    plain text+mel pair is returned.
    """
    if hparams.episodic_training:
        return EpisodicLoader(audio_path, hparams), EpisodicCollater(1, hparams)
    return TextMelLoader(audio_path, hparams), TextMelCollate(1)
Example #17
0
def synthesize2(model,
                audio_path,
                text,
                source_speaker_id,
                target_speaker_id,
                outname="sample.wav"):
    """Re-synthesize `text` with the rhythm/pitch of `audio_path` but the
    voice of `target_speaker_id`, writing the result to `outname`.

    `model` is a (tacotron, waveglow, denoiser) triple. A throwaway
    'temp.txt' filelist is written to drive TextMelLoader and removed when
    done.
    """
    tacotron, waveglow, denoiser = model
    with open('temp.txt', 'w') as f:
        f.write(f"{audio_path}|{text}|{source_speaker_id}")
    try:
        arpabet_dict = cmudict.CMUDict('mellotron/data/cmu_dictionary')
        hparams = create_hparams()
        dataloader = TextMelLoader("temp.txt", hparams)
        datacollate = TextMelCollate(1)

        file_idx = 0
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cpu()
        pitch_contour = dataloader[file_idx][3][None].cpu()
        mel = load_mel(audio_path)
        print(audio_path, text)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))

        # For changing the pitch (shift/flatten experiments left disabled)
        pitch_contour2 = pitch_contour.data.cpu().numpy().copy()
        #pitch_contour2[pitch_contour2 > 0] -= 45.
        #pitch_contour2[pitch_contour2 > 0] = 150.
        pitch_contour2 = torch.Tensor(pitch_contour2).cpu()

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = tacotron.forward(
                x)
            rhythm = rhythm.permute(1, 0, 2)

        speaker_id = torch.LongTensor([target_speaker_id]).cpu()

        sampling_rate = 22050

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = tacotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour2, rhythm))
            audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.66),
                             0.03)[0, 0]
            audio = audio.cpu().numpy()
            pan = 0
            audio = panner(audio, pan)
            write(outname, sampling_rate, audio)
    finally:
        # BUG FIX: the temp filelist previously leaked whenever any step
        # above raised; always clean it up.
        os.remove("temp.txt")
Example #18
0
def prepare_dataloaders(hparams):
    """Build loaders; non-distributed runs bucket by utterance length.

    Returns (train_loader, valset, collate_fn). The loader uses
    batch_size=1 because the length-bucketing sampler emits whole batches.
    """
    trainset = TextMelLoader('train', hparams)
    valset = TextMelLoader('val', hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step, hparams)

    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
    else:
        # Target frames per batch = batch_size * mean utterance length.
        lengths = trainset.get_lengths()
        mean_len = sum(lengths) / len(lengths)
        train_sampler = PartialyRandomizedSimilarTimeLengthSampler(
            lengths, batch_size=hparams.batch_size * mean_len)

    train_loader = DataLoader(trainset, num_workers=24,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=1,
                              pin_memory=False,
                              drop_last=False,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #19
0
def prepare_dataloaders(hparams):
    """Train on original + time-warped + freq-warped copies of the data.

    Returns (train_loader, valset, valset_aug_time, valset_aug_freq,
    collate_fn).
    """
    trainset = TextMelLoader(hparams.training_files, hparams, warp_set="og")
    trainset_aug_time = TextMelLoader(hparams.training_files, hparams, warp_set="time")
    trainset_aug_freq = TextMelLoader(hparams.training_files, hparams, warp_set="freq")

    valset = TextMelLoader(hparams.validation_files, hparams, warp_set="og")
    valset_aug_time = TextMelLoader(hparams.validation_files, hparams, warp_set="time")
    valset_aug_freq = TextMelLoader(hparams.validation_files, hparams, warp_set="freq")

    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_aug_set = torch.utils.data.ConcatDataset(
        [trainset, trainset_aug_time, trainset_aug_freq])
    train_sampler = DistributedSampler(train_aug_set) if hparams.distributed_run else None

    # FIX: the non-distributed branch previously rebuilt an identical
    # ConcatDataset and duplicated the whole DataLoader call; both branches
    # differed only in sampler/shuffle, so they are collapsed (the sampler
    # owns shuffling under DDP).
    train_loader = DataLoader(train_aug_set,
                              num_workers=35,
                              shuffle=train_sampler is None,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)

    # Drop the per-variant references; train_aug_set keeps them alive.
    del trainset, trainset_aug_time, trainset_aug_freq

    return train_loader, valset, valset_aug_time, valset_aug_freq, collate_fn
Example #20
0
def prepare_dataloaders(hparams):
    """Build train loader, validation set and collate fn.

    Chooses between raw-audio filelists and precomputed-mel filelists via
    hparams.load_mel_from_disk; everything else is identical, so the
    previously duplicated TextMelLoader construction is collapsed into one
    path. Returns (train_loader, valset, collate_fn).
    """
    if hparams.load_mel_from_disk:
        training_files = hparams.mel_training_files
        validation_files = hparams.mel_validation_files
    else:
        training_files = hparams.training_files
        validation_files = hparams.validation_files

    trainset = TextMelLoader(training_files,
                             hparams.polyphone_dict_files,
                             hparams.mask_dict_files, hparams)
    valset = TextMelLoader(validation_files,
                           hparams.polyphone_dict_files,
                           hparams.mask_dict_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step, hparams.num_classes)

    if hparams.distributed_run:
        # In multi-GPU training the sampler gives each process a disjoint
        # shard of the dataset (no duplicated samples across ranks), so the
        # loader itself must not shuffle.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    # drop_last discards the final short batch when the dataset size is not
    # divisible by batch_size.
    train_loader = DataLoader(trainset,
                              num_workers=0,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
Example #21
0
def mel2audio(mel):
    """Invert a normalized mel spectrogram to a waveform via Griffin-Lim."""
    hp = create_hparams()
    mel_loader = TextMelLoader(hp.training_files, hp)

    # Undo the loader's spectral normalization before inversion.
    mel_spec = mel_loader.stft.spectral_de_normalize(mel)

    # The pseudo-inverse of the mel filterbank maps mel bins back to
    # linear-frequency magnitudes.
    inv_basis = torch.tensor(np.linalg.pinv(mel_loader.stft.mel_basis))
    magnitudes = torch.matmul(inv_basis, mel_spec)

    signal = griffin_lim(magnitudes.unsqueeze(0), mel_loader.stft.stft_fn,
                         n_iters=60).squeeze(0).numpy()
    print(signal.shape)
    return signal
Example #22
0
def prepare_dataloaders(experiment, hparams, requires_durations):
    """Build train/valid loaders for `experiment`.

    Returns (train_loader, trainset, valset, collate_fn).
    """
    trainset = TextMelLoader("train", experiment, hparams, requires_durations)
    valset = TextMelLoader("valid", experiment, hparams, requires_durations)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    # Sampler shards the data under DDP; shuffle only without a sampler.
    sampler = DistributedSampler(trainset) if hparams.distributed_run else None

    train_loader = DataLoader(trainset,
                              num_workers=1,
                              shuffle=sampler is None,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, trainset, valset, collate_fn
Example #23
0
def prepare_dataloaders(hparams, audio_offset=0):
    """Sequential (never shuffled) training loader, e.g. for feature dumps.

    Returns (train_loader, None, collate_fn, train_sampler, trainset) —
    the valset slot is intentionally None.
    """
    trainset = TextMelLoader(hparams.training_files, hparams, TBPTT=False,
                             check_files=False, verbose=True,
                             audio_offset=audio_offset)
    collate_fn = TextMelCollate(hparams)

    # Keep ordering deterministic even under DDP (sampler shuffle off).
    sampler = DistributedSampler(trainset, shuffle=False) \
        if hparams.distributed_run else None

    # default pin_memory=False; True should allow async memory transfers
    # but causes very random CUDA errors (after like 4+ hours)
    train_loader = DataLoader(trainset,
                              num_workers=hparams.num_workers,
                              shuffle=False,
                              sampler=sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True,
                              collate_fn=collate_fn)
    return train_loader, None, collate_fn, sampler, trainset
Example #24
0
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Run a filelist through Tacotron2's encoder and WaveGlow, saving one
    mel tensor per utterance as <output_dir>/<i>_synthesis.pt.

    NOTE(review): `sampling_rate` and `denoiser_strength` are accepted but
    the audio/denoiser path is commented out below, so only mels are
    produced — confirm whether that is intentional.
    """
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()

    # Batch size 1, sequential order: one saved tensor per input utterance.
    test_loader = DataLoader(testset,
                             num_workers=0,
                             shuffle=False,
                             sampler=None,
                             batch_size=1,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        # apex O3: pure fp16 inference.
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        # NOTE(review): built but never applied — the denoising block in the
        # loop below is commented out.
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
            batch)
        # Encoder outputs (not full synthesis) are fed to WaveGlow.
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            '''if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE'''
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
Example #25
0
def process_one(index):
    """Preprocess one dataset item and persist its tensors as ``.npy`` files.

    Lazily builds the module-global TextMelLoader on first use (once per
    worker process), fetches item ``index``, and writes its text, mel,
    speaker-id and f0 arrays into a per-index subdirectory of ``output_dir``.

    Returns:
        The processed ``index`` (handy for progress reporting by callers).
    """
    global text_mel_loader
    global metadata_path
    global output_dir
    # Lazy initialization: each worker process constructs its loader once.
    if text_mel_loader is None:
        text_mel_loader = TextMelLoader(metadata_path, hparams=hp, mode='preprocess')
    text, mel, speaker_id, f0 = text_mel_loader[index]
    item_dir = output_dir.joinpath(format_index(index))
    item_dir.mkdir(exist_ok=True)
    # Save every tensor under a fixed filename inside the item directory.
    for filename, tensor in (("text.npy", text),
                             ("mel.npy", mel),
                             ("speaker.npy", speaker_id),
                             ("f0.npy", f0)):
        np.save(item_dir.joinpath(filename), tensor.numpy(), allow_pickle=False)
    return index
Example #26
0
def main():
    """Data-dependent initialization (DDI) run for the flow generator.

    Pushes exactly one training batch through ``FlowGenerator_DDI`` —
    presumably to trigger its data-dependent layer initialization (TODO
    confirm against the generator implementation) — then checkpoints the
    initialized model as ``ddi_G.pth`` in the model directory.
    """
    hps = utils.get_hparams()
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)

    torch.manual_seed(hps.train.seed)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset,
                              num_workers=8,
                              shuffle=True,
                              batch_size=hps.train.batch_size,
                              pin_memory=True,
                              drop_last=True,
                              collate_fn=collate_fn)

    generator = FlowGenerator_DDI(speaker_dim=hps.model.speaker_embedding,
                                  n_vocab=len(symbols),
                                  out_channels=hps.data.n_mel_channels,
                                  **hps.model).cuda()
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)

    generator.train()
    # A single forward pass with gen=False is enough; the loop breaks
    # immediately after the first batch.
    for batch_idx, (x, x_lengths, y, y_lengths,
                    speaker_embedding) in enumerate(train_loader):
        x, x_lengths = x.cuda(), x_lengths.cuda()
        y, y_lengths = y.cuda(), y_lengths.cuda()
        speaker_embedding = speaker_embedding.cuda()

        _ = generator(x, x_lengths, speaker_embedding, y, y_lengths, gen=False)
        break

    utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, 0,
                          os.path.join(hps.model_dir, "ddi_G.pth"))
def prepare_speaker_set(hparams):
    """Build endless iterators over female and male training speaker IDs.

    Reads the training filelist to obtain the dataset's speaker-id mapping,
    joins it against the LibriTTS speaker-info table, and returns two
    ``itertools.cycle`` iterators over shuffled Mellotron speaker IDs,
    restricted to speakers with more than 20 minutes of audio that are
    present in the training set.

    Args:
        hparams: hyper-parameter object forwarded to TextMelLoader.

    Returns:
        Tuple ``(female_speakers, male_speakers)`` of cycling iterators
        yielding Mellotron speaker IDs.
    """
    # Speaker-id mapping discovered from the training filelist.
    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist_22k_skipped.txt",
        hparams).speaker_ids
    # The speaker-info table is '|'-separated with arbitrary surrounding
    # spaces. Use a raw string for the regex separator: the original
    # ' *\| *' relied on the invalid escape sequence '\|', which raises a
    # SyntaxWarning on Python 3.12+ (the string value itself is unchanged).
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python',
                           header=None,
                           comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    # Map LibriTTS IDs to Mellotron IDs; -1 marks speakers absent from the
    # training set so the queries below can filter them out.
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    return female_speakers, male_speakers
Example #28
0
def main():
  """Data-dependent initialization (DDI) run for the flow generator.

  Pushes a single training batch through ``FlowGenerator_DDI`` —
  presumably to trigger its data-dependent layer initialization (TODO
  confirm against the generator implementation) — then saves the model
  as ``ddi_G.pth``, optionally seeding it from ``pretrained.pth`` first.
  """
  hps = utils.get_hparams()
  logger = utils.get_logger(hps.model_dir)
  logger.info(hps)
  utils.check_git_hash(hps.model_dir)

  torch.manual_seed(hps.train.seed)

  train_dataset = TextMelLoader(hps.data.training_files, hps.data)
  collate_fn = TextMelCollate(1)
  train_loader = DataLoader(train_dataset, num_workers=8, shuffle=True,
      batch_size=hps.train.batch_size, pin_memory=True,
      drop_last=True, collate_fn=collate_fn)

  generator = FlowGenerator_DDI(
      len(symbols), 
      out_channels=hps.data.n_mel_channels,
      **hps.model).cuda()
  optimizer_g = commons.Adam(generator.parameters(), scheduler=hps.train.scheduler, dim_model=hps.model.hidden_channels, warmup_steps=hps.train.warmup_steps, lr=hps.train.learning_rate, betas=hps.train.betas, eps=hps.train.eps)
   
  generator.train()
  # One forward pass with gen=False; the loop breaks after the first batch.
  for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader):
    x, x_lengths = x.cuda(), x_lengths.cuda()
    y, y_lengths = y.cuda(), y_lengths.cuda()

    _ = generator(x, x_lengths, y, y_lengths, gen=False)
    break

  # Check for a pretrained checkpoint and load it without an optimizer.
  pretrained_checkpoint_path = os.path.join(hps.model_dir, "pretrained.pth")
  if os.path.isfile(pretrained_checkpoint_path):
    logger.info("Loading pretrained checkpoint: %s" % pretrained_checkpoint_path)
    model, optimizer, learning_rate, iteration = utils.load_checkpoint(pretrained_checkpoint_path, generator)
    utils.save_checkpoint(model, optimizer_g, hps.train.learning_rate, 0, os.path.join(hps.model_dir, "ddi_G.pth"))
  else:
    utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, 0, os.path.join(hps.model_dir, "ddi_G.pth"))
Example #29
0
# Inference setup: load a supervised Tacotron2-style checkpoint plus a
# WaveGlow vocoder, build a single-item dataloader over a test filelist,
# and report parameter counts. Checkpoint paths are hard-coded to one
# machine — NOTE(review): consider making them CLI arguments.
hparams = create_hparams()
hparams.batch_size = 1
# STFT configured entirely from hparams (mel range, hop/win lengths, etc.).
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
speaker = "nes"
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
# batch_size=1, no shuffling: items come out in filelist order, one at a time.
dataloader = DataLoader(test_set,
                        num_workers=1,
                        shuffle=False,
                        batch_size=1,
                        pin_memory=False,
                        drop_last=False,
                        collate_fn=datacollate)
# Resolve the human-readable speaker name to its training-set id tensor.
speaker_ids = TextMelLoader(hparams.training_files, hparams).speaker_ids
speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()

pytorch_total_params = sum(p.numel() for p in model.parameters())
print("total_num_params:  {}".format(pytorch_total_params))
waveglow_total_params = sum(p.numel() for p in waveglow.parameters())
print("waveglow_num_params:  {}".format(waveglow_total_params))
Example #30
0
        "--skip_existing",
        type=bool,
        default=True,
        help="Whether to overwrite existing files with the same name. ")
    parser.add_argument(
        "--hparams",
        type=str,
        default="",
        help=
        "Hyperparameter overrides as a comma-separated list of name-value pairs"
    )
    args = parser.parse_args()

    metadata_path = args.metadata_path
    text_mel_loader = TextMelLoader(metadata_path,
                                    hparams=hp,
                                    mode='preprocess')

    output_dir = args.output_dir
    output_dir.mkdir(exist_ok=True, parents=True)

    fpath = output_dir.joinpath('speaker_ids.json')
    speaker_ids = text_mel_loader.speaker_ids
    json.dump(speaker_ids,
              open(fpath, 'wt', encoding='utf8'),
              indent=4,
              ensure_ascii=False)

    # Preprocess the dataset
    process_many(args.n_processes, skip_existing=args.skip_existing)