def prepare_dataloaders(input_directory, hparams):
    """Build the train DataLoader plus validation set, collate fn and sampler.

    Reads `train.txt` / `validation.txt` from `input_directory`; the validation
    set reuses the speaker-id mapping discovered by the training set.
    """
    train_set = TextMelLoader(
        os.path.join(input_directory, 'train.txt'), hparams,
        mode=hparams.train_mode)
    val_set = TextMelLoader(
        os.path.join(input_directory, 'validation.txt'), hparams,
        speaker_ids=train_set.speaker_ids, mode=hparams.train_mode)
    collate = TextMelCollate(hparams.n_frames_per_step)

    # Under distributed training the sampler partitions the data per rank,
    # so the loader itself must not shuffle; otherwise shuffle locally.
    sampler = DistributedSampler(train_set) if hparams.distributed_run else None
    loader = DataLoader(
        train_set,
        num_workers=hparams.dataloader_num_workers,
        shuffle=sampler is None,
        sampler=sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,
        drop_last=True,
        collate_fn=collate)
    return loader, val_set, collate, sampler
def prepare_dataloaders(hparams, output_directory):
    """Create training loader, validation set, collate fn and sampler.

    `output_directory` is forwarded to the training TextMelLoader; the
    validation set shares the training set's speaker-id lookup.
    """
    train_set = TextMelLoader(hparams.training_files, hparams,
                              output_directory=output_directory)
    val_set = TextMelLoader(hparams.validation_files, hparams,
                            speaker_ids=train_set.speaker_ids)
    collate = TextMelCollate(hparams.n_frames_per_step)

    # DistributedSampler already partitions + shuffles per rank, so local
    # shuffling is enabled only in the single-process case.
    sampler = DistributedSampler(train_set) if hparams.distributed_run else None
    loader = DataLoader(train_set,
                        num_workers=1,
                        shuffle=sampler is None,
                        sampler=sampler,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=True,
                        collate_fn=collate)
    return loader, val_set, collate, sampler
def prepare_dataloaders(hparams):
    """Build train loader, validation set, collate fn and sampler.

    Returns:
        (train_loader, valset, collate_fn, train_sampler)
    """
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    # Validation reuses the speaker-id mapping discovered by the train set.
    valset = TextMelLoader(hparams.validation_files, hparams,
                           speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    if hparams.distributed_run:
        # Sampler shuffles per-rank; the DataLoader itself must not shuffle.
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # NOTE(review): num_workers_ is a module-level global defined elsewhere
    # in this file — confirm it is set before this function is called.
    train_loader = DataLoader(
        trainset, num_workers=num_workers_, shuffle=shuffle,
        sampler=train_sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,  # default pin_memory=False, True should allow async memory transfers
                           # Causes very random CUDA errors (after like 4+ hours)
        drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler
def prepare_dataloaders(hparams, saved_lookup):
    """Build train loader, validation set, collate fn, sampler and train set.

    Args:
        hparams: hyper-parameter namespace.
        saved_lookup: previously-saved speaker-id lookup; used only when
            hparams.use_saved_speakers is True.
    """
    # Get data, data loaders and collate function ready
    speaker_ids = saved_lookup if hparams.use_saved_speakers else None
    trainset = TextMelLoader(hparams.training_files, hparams,
                             check_files=hparams.check_files,
                             shuffle=False, speaker_ids=speaker_ids)
    valset = TextMelLoader(hparams.validation_files, hparams,
                           check_files=hparams.check_files,
                           shuffle=False, speaker_ids=trainset.speaker_ids)
    collate_fn = TextMelCollate()
    if hparams.distributed_run:
        # Shuffling is deliberately disabled here (was True previously).
        train_sampler = DistributedSampler(trainset, shuffle=False) #True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = False #True
    # NOTE(review): num_workers_ is a module-level global — verify it is
    # defined before this runs.
    train_loader = DataLoader(trainset, num_workers=num_workers_,
                              shuffle=shuffle, sampler=train_sampler,
                              batch_size=hparams.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn, train_sampler, trainset
def prepare_dataloaders(hparams, epoch=0, valset=None, collate_fn=None):
    """Build (or rebuild per-epoch) the train loader, val set and collate fns.

    Args:
        hparams: hyper-parameter namespace.
        epoch: current epoch, forwarded to the train loader's shuffle plan.
        valset: reuse an existing validation set if given (built on first call).
        collate_fn: reuse an existing {'train': ..., 'val': ...} pair if given.

    Returns:
        (train_loader, valset, collate_fn)
    """
    # Get data, data loaders and collate function ready
    # prepare train set
    print('preparing train set for epoch {}'.format(epoch))
    # Shuffle plan for training; all options come from hparams.
    shuffle_train = {'shuffle-audiopath': hparams.shuffle_audiopaths,
                     'shuffle-batch': hparams.shuffle_batches,
                     'permute-opt': hparams.permute_opt,
                     'pre-batching': hparams.pre_batching}
    trainset = TextMelLoader(hparams.training_files, shuffle_train, hparams,
                             epoch)
    #print('\n'.join(['{}, {}'.format(line[0],line[2]) for line in \
    #    trainset.audiopaths_and_text[:5]]))
    if valset is None:
        # prepare val set (different shuffle plan compared with train set)
        print('preparing val set for epoch {}'.format(epoch))
        shuffle_val = {'shuffle-audiopath': hparams.shuffle_audiopaths,
                       'shuffle-batch': False,
                       'permute-opt': 'rand',
                       'pre-batching': False}
        valset = TextMelLoader(hparams.validation_files, shuffle_val, hparams)
    if collate_fn is None:
        # Separate collate fns: pre-batching applies only to training.
        collate_fn = {'train': TextMelCollate(hparams,
                                              pre_batching=hparams.pre_batching),
                      'val': TextMelCollate(hparams, pre_batching=False)}
    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset,
                                           shuffle=hparams.shuffle_samples)
    else:
        train_sampler = None
    # Local shuffle only when no sampler owns the ordering.
    shuffle = (train_sampler is None) and hparams.shuffle_samples
    # When batches are pre-built by the dataset, the loader takes them 1 at a time.
    batch_size = 1 if hparams.pre_batching else hparams.batch_size
    train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True,
                              collate_fn=collate_fn['train'])
    return train_loader, valset, collate_fn
def prepare_dataloaders(hparams):
    """Build a train loader over the original + warped training sets.

    Returns:
        (train_loader, valset, collate_fn)
    """
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    # Second view of the same files with warping enabled (augmentation).
    warp_trainset = TextMelLoader(hparams.training_files, hparams,
                                  warp_set=True)
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    # NOTE(review): this sampler is computed but never used — the loader below
    # passes sampler=None and shuffles locally, so each rank would see all
    # data under distributed_run. Confirm whether that is intended.
    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None
    train_loader = DataLoader(torch.utils.data.ConcatDataset(
        [trainset, warp_trainset]), num_workers=1, shuffle=True,
        sampler=None,
        batch_size=hparams.batch_size, pin_memory=False,
        drop_last=True, collate_fn=collate_fn)
    # train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
    #                           sampler=train_sampler,
    #                           batch_size=hparams.batch_size, pin_memory=False,
    #                           drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn
def prepare_single_dataloaders(hparams, output_directory):
    """Build loaders for the hard-coded single-speaker grapheme file lists."""
    train_set = TextMelLoader(
        'filelists/grapheme/grapheme_selvas_main_train.txt', hparams,
        output_directory=output_directory)
    # debugging purpose
    # trainset = TextMelLoader('filelists/selvas_main_valid.txt', hparams, output_directory=output_directory)
    val_set = TextMelLoader('filelists/grapheme/main_valid_and_test.txt',
                            hparams, speaker_ids=train_set.speaker_ids)
    collate = TextMelCollate(hparams.n_frames_per_step)

    # With a distributed sampler the loader must not shuffle on its own.
    sampler = DistributedSampler(train_set) if hparams.distributed_run else None
    loader = DataLoader(train_set,
                        num_workers=1,
                        shuffle=sampler is None,
                        sampler=sampler,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=True,
                        collate_fn=collate)
    return loader, val_set, collate, sampler
def prepare_dataloaders(hparams):
    """Build loaders for the polyphone/pinyin variant of the dataset.

    Both loaders receive the polyphone and mask dictionaries in addition to
    the usual hparams.
    """
    train_set = TextMelLoader(hparams.training_files,
                              hparams.polyphone_dict_files,
                              hparams.mask_dict_files, hparams)
    val_set = TextMelLoader(hparams.validation_files,
                            hparams.polyphone_dict_files,
                            hparams.mask_dict_files, hparams)
    collate = TextMelCollate(hparams.n_frames_per_step,
                             hparams.n_pinyin_symbols)

    # Sampler owns per-rank ordering under DDP; shuffle locally otherwise.
    sampler = DistributedSampler(train_set) if hparams.distributed_run else None
    loader = DataLoader(train_set,
                        num_workers=0,
                        shuffle=sampler is None,
                        sampler=sampler,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=True,
                        collate_fn=collate)
    return loader, val_set, collate
def train_and_eval(rank, n_gpus, hps):
    """Per-process entry point for distributed training of the flow generator.

    Rank 0 additionally builds the logger/TensorBoard writers, the validation
    loader, runs evaluation, and saves checkpoints every 50 epochs.

    Args:
        rank: this process's GPU/process index.
        n_gpus: world size for NCCL process-group init.
        hps: hyper-parameter namespace (train/data/model sub-namespaces).
    """
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.model_dir)
        logger.info(hps)
        utils.check_git_hash(hps.model_dir)
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=n_gpus, rank=rank, shuffle=True)
    collate_fn = TextMelCollate(1)
    # shuffle=False because the sampler already shuffles per rank.
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=False,
                              batch_size=hps.train.batch_size, pin_memory=True,
                              drop_last=True, collate_fn=collate_fn,
                              sampler=train_sampler)
    if rank == 0:
        val_dataset = TextMelLoader(hps.data.validation_files, hps.data)
        val_loader = DataLoader(val_dataset, num_workers=8, shuffle=False,
                                batch_size=hps.train.batch_size,
                                pin_memory=True, drop_last=True,
                                collate_fn=collate_fn)

    generator = models.FlowGenerator(
        n_vocab=len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model).cuda(rank)
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)
    if hps.train.fp16_run:
        generator, optimizer_g._optim = amp.initialize(
            generator, optimizer_g._optim, opt_level="O1")
    generator = DDP(generator)

    epoch_str = 1
    global_step = 0
    try:
        # Resume from the newest G_*.pth checkpoint if one exists.
        _, _, _, epoch_str = utils.load_checkpoint(
            utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"),
            generator, optimizer_g)
        epoch_str += 1
        optimizer_g.step_num = (epoch_str - 1) * len(train_loader)
        optimizer_g._update_learning_rate()
        global_step = (epoch_str - 1) * len(train_loader)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit. No checkpoint (or a corrupt one) -> optionally fall back
        # to the data-dependent-init weights.
        if hps.train.ddi and os.path.isfile(
                os.path.join(hps.model_dir, "ddi_G.pth")):
            _ = utils.load_checkpoint(
                os.path.join(hps.model_dir, "ddi_G.pth"),
                generator, optimizer_g)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train(rank, epoch, hps, generator, optimizer_g, train_loader,
                  logger, writer)
            evaluate(rank, epoch, hps, generator, optimizer_g, val_loader,
                     logger, writer_eval)
            if epoch % 50 == 0:
                utils.save_checkpoint(
                    generator, optimizer_g, hps.train.learning_rate, epoch,
                    os.path.join(hps.model_dir, "G_{}.pth".format(epoch)))
        else:
            train(rank, epoch, hps, generator, optimizer_g, train_loader,
                  None, None)
def prepare_dataloaders(hparams):
    """Build train loader, validation set and collate fn (dict-style hparams).

    Single-process variant: no sampler, local shuffling only.
    """
    train_set = TextMelLoader(hparams['training_files'], hparams)
    val_set = TextMelLoader(hparams['validation_files'], hparams)
    collate = TextMelCollate(hparams['n_frames_per_step'])
    loader = DataLoader(
        train_set,
        num_workers=1,
        shuffle=True,
        sampler=None,
        batch_size=hparams['batch_size'],
        pin_memory=False,
        drop_last=True,
        collate_fn=collate,
    )
    return loader, val_set, collate
def process_one(index, skip_existing=False):
    """Extract (text, mel, speaker_id, f0) for one utterance and save as .npy.

    Designed for multiprocessing workers: the TextMelLoader is built lazily
    once per process, at which point the speaker-id map is also dumped.

    Args:
        index: dataset index to process.
        skip_existing: if True, return early when all four .npy outputs exist.

    Returns:
        `index` on success, None when skipped.
    """
    global text_mel_loader
    global metadata_path
    global output_dir
    if text_mel_loader is None:
        text_mel_loader = TextMelLoader(metadata_path, hparams=hp,
                                        mode='preprocess')
        fpath = output_dir.joinpath('speaker_ids.json')
        speaker_ids = text_mel_loader.speaker_ids
        # FIX: the file handle from open() was never closed (the dump relied
        # on GC to flush); use a context manager instead.
        with open(fpath, 'wt', encoding='utf8') as f:
            json.dump(speaker_ids, f, indent=4, ensure_ascii=False)

    onedir = output_dir.joinpath('npy', format_index(index))
    onedir.mkdir(exist_ok=True, parents=True)
    tpath = onedir.joinpath("text.npy")
    mpath = onedir.joinpath("mel.npy")
    spath = onedir.joinpath("speaker.npy")
    fpath = onedir.joinpath("f0.npy")
    if skip_existing and all(
            [f.is_file() for f in [tpath, mpath, spath, fpath]]):
        return

    text, mel, speaker_id, f0 = text_mel_loader[index]
    np.save(tpath, text.numpy(), allow_pickle=False)
    np.save(mpath, mel.numpy(), allow_pickle=False)
    np.save(spath, speaker_id.numpy(), allow_pickle=False)
    np.save(fpath, f0.numpy(), allow_pickle=False)
    return index
def prepare_dataloaders(hparams):
    """Build the train loader and collate fn; validation is disabled (None)."""
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams)
    # valset = TextMelLoader(hparams.validation_files, hparams)
    valset = None
    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    # train_sampler = DistributedSampler(trainset) \
    #     if hparams.distributed_run else None
    #
    # train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
    #                           sampler=train_sampler,
    #                           batch_size=hparams.batch_size, pin_memory=False,
    #                           drop_last=True, collate_fn=collate_fn)
    # return train_loader, valset, collate_fn
    if hparams.distributed_run:
        # Sampler partitions/orders data per rank; no local shuffle then.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    train_loader = DataLoader(trainset, num_workers=0, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn
def style_transfer_v2():
    """Demo: transfer rhythm + pitch contour from a source utterance.

    Uses Tacotron 2 (mellotron) as a forced aligner to extract rhythm, picks a
    random speaker, re-synthesizes with mellotron, then vocodes with both
    MelGAN and WaveGlow, playing the audio each time.

    NOTE(review): relies on many module-level globals (hparams, mellotron,
    waveglow, denoiser, female_speakers, male_speakers, arpabet_dict, ...) —
    confirm they are initialized before calling.
    """
    audio_paths_ = 'data/examples_filelist_v2.txt'
    dataloader_ = TextMelLoader(audio_paths_, hparams)
    datacollate_ = TextMelCollate(1)
    ## Load data
    # for file_idx in range(10):
    #     audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    #     print(dict(file_idx=file_idx, audio_path=audio_path, text=text))
    file_idx = 8
    audio_path, text, sid = dataloader_.audiopaths_and_text[file_idx]
    print(dict(file_idx=file_idx, audio_path=audio_path, text=text, sid=sid))
    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    # dataloader_[file_idx][3] — presumably the f0/pitch contour; verify
    # against TextMelLoader.__getitem__.
    pitch_contour = dataloader_[file_idx][3][None].cuda()
    mel = load_mel(audio_path)
    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = mellotron.parse_batch(datacollate_([dataloader_[file_idx]]))
    ipd.Audio(audio_path, rate=hparams.sampling_rate)
    # Style Transfer (Rhythm and Pitch Contour)
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = \
            mellotron.forward(x)
        rhythm = rhythm.permute(1, 0, 2)
    # Randomly pick a female or male speaker id (50/50).
    speaker_id = next(female_speakers) if np.random.randint(2) else next(
        male_speakers)
    speaker_id = torch.LongTensor([speaker_id]).cuda()
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
            mellotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour, rhythm))
    plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                          mel_outputs_postnet.data.cpu().numpy()[0],
                          pitch_contour.data.cpu().numpy()[0, 0],
                          rhythm.data.cpu().numpy()[:, 0].T)
    plt.show()
    out_mel = mel_outputs_postnet.data.cpu().numpy()[0]
    # Vocode with MelGAN and time it.
    t0 = time.time()
    # wav = aukit.inv_mel_spectrogram()
    out_wav = infer_waveform_melgan(out_mel)
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
    # Vocode with WaveGlow + denoiser and time it.
    t0 = time.time()
    with torch.no_grad():
        audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.8),
                         0.01)[:, 0]
    ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    out_wav = audio[0].data.cpu().numpy()
    print(time.time() - t0)
    aukit.play_audio(out_wav, sr=22050)
def prepare_dataloaders(hparams):
    """Build train loader, validation set and collate fn from .lst file lists.

    The loader never shuffles locally; ordering comes from the distributed
    sampler when one is used, otherwise it is the file-list order.
    """
    train_set = TextMelLoader(hparams.training_lst, hparams)
    val_set = TextMelLoader(hparams.validation_lst, hparams)
    collate = TextMelCollate(hparams.n_frames_per_step)
    if hparams.distributed_run:
        sampler = DistributedSampler(train_set)
    else:
        sampler = None
    loader = DataLoader(train_set,
                        num_workers=1,
                        shuffle=False,
                        sampler=sampler,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=True,
                        collate_fn=collate)
    return loader, val_set, collate
def prepare_dataloaders(hparams):
    """Build train loader, validation set, collate fn and the train set itself.

    Returns:
        (train_loader, valset, collate_fn, trainset)
    """
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(
        hparams.training_files, hparams
    )  # trainset.__getitem__(index) = (text, mel), text in [num_char], mel in [num_mel, ceil((len(audio)+1)/hop_length)]
    valset = TextMelLoader(hparams.validation_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)
    #
    train_sampler = DistributedSampler(trainset) \
        if hparams.distributed_run else None
    # shuffle=False: ordering is either the sampler's or the file-list order.
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)
    return train_loader, valset, collate_fn, trainset
def load_dataloader(hparams, audio_path):
    """Return the (dataset, collate) pair matching the training regime.

    Episodic training uses the episodic loader/collater; otherwise the plain
    text-mel pair is used.
    """
    if hparams.episodic_training:
        dataset = EpisodicLoader(audio_path, hparams)
        collate = EpisodicCollater(1, hparams)
    else:
        dataset = TextMelLoader(audio_path, hparams)
        collate = TextMelCollate(1)
    return dataset, collate
def synthesize2(model, audio_path, text, source_speaker_id, target_speaker_id,
                outname="sample.wav"):
    """Voice-convert one utterance to `target_speaker_id` and write a wav.

    Writes a one-line temp filelist, extracts pitch/rhythm from the source
    with Tacotron 2 as a forced aligner, re-synthesizes with the target
    speaker, vocodes with WaveGlow + denoiser, and saves to `outname`.

    Args:
        model: (tacotron, waveglow, denoiser) tuple.
        audio_path: source audio file.
        text: transcript of the source audio.
        source_speaker_id: speaker id written into the temp filelist.
        target_speaker_id: speaker id used for re-synthesis.
        outname: output wav path.

    NOTE(review): uses a fixed 'temp.txt' in the CWD — not safe for
    concurrent calls; the file is only removed on the success path.
    """
    tacotron, waveglow, denoiser = model
    with open('temp.txt', 'w') as f:
        f.write(f"{audio_path}|{text}|{source_speaker_id}")
    arpabet_dict = cmudict.CMUDict('mellotron/data/cmu_dictionary')
    hparams = create_hparams()
    dataloader = TextMelLoader("temp.txt", hparams)
    datacollate = TextMelCollate(1)
    file_idx = 0
    audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]
    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cpu()
    # dataloader[file_idx][3] — presumably the f0/pitch contour; verify
    # against TextMelLoader.__getitem__.
    pitch_contour = dataloader[file_idx][3][None].cpu()
    mel = load_mel(audio_path)
    print(audio_path, text)
    # load source data to obtain rhythm using tacotron 2 as a forced aligner
    x, y = tacotron.parse_batch(datacollate([dataloader[file_idx]]))
    # For changing the pitch
    pitch_contour2 = pitch_contour.data.cpu().numpy().copy()
    #pitch_contour2[pitch_contour2 > 0] -= 45.
    #pitch_contour2[pitch_contour2 > 0] = 150.
    pitch_contour2 = torch.Tensor(pitch_contour2).cpu()
    with torch.no_grad():
        # get rhythm (alignment map) using tacotron 2
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = \
            tacotron.forward(x)
        rhythm = rhythm.permute(1, 0, 2)
    speaker_id = torch.LongTensor([target_speaker_id]).cpu()
    sampling_rate = 22050
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
            tacotron.inference_noattention(
                (text_encoded, mel, speaker_id, pitch_contour2, rhythm))
    audio = denoiser(waveglow.infer(mel_outputs_postnet, sigma=0.66),
                     0.03)[0, 0]
    audio = audio.cpu().numpy()
    pan = 0
    audio = panner(audio, pan)
    write(outname, sampling_rate, audio)
    os.remove("temp.txt")
def prepare_dataloaders(hparams):
    """Build train loader (length-bucketed sampler), val set and collate fn."""
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader('train', hparams)
    valset = TextMelLoader('val', hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step, hparams)
    if hparams.distributed_run:
        train_sampler = DistributedSampler(trainset)
    else:
        # Group utterances of similar length; the effective batch budget is
        # batch_size scaled by the mean utterance length.
        l = trainset.get_lengths()
        train_sampler = PartialyRandomizedSimilarTimeLengthSampler(
            l, batch_size=hparams.batch_size * (sum(l) / len(l)))
    # batch_size=1: the sampler yields pre-grouped batches, so the DataLoader
    # takes them one at a time; drop_last=False keeps the final partial batch.
    train_loader = DataLoader(trainset, num_workers=24, shuffle=False,
                              sampler=train_sampler, batch_size=1,
                              pin_memory=False, drop_last=False,
                              collate_fn=collate_fn)
    return train_loader, valset, collate_fn
def prepare_dataloaders(hparams):
    """Build a train loader over original + time-warped + freq-warped sets.

    Returns:
        (train_loader, valset, valset_aug_time, valset_aug_freq, collate_fn)
    """
    # Three views of the same training files: unmodified, time-warped and
    # frequency-warped (SpecAugment-style augmentation).
    trainset = TextMelLoader(hparams.training_files, hparams, warp_set="og")
    trainset_aug_time = TextMelLoader(hparams.training_files, hparams,
                                      warp_set="time")
    trainset_aug_freq = TextMelLoader(hparams.training_files, hparams,
                                      warp_set="freq")
    valset = TextMelLoader(hparams.validation_files, hparams, warp_set="og")
    valset_aug_time = TextMelLoader(hparams.validation_files, hparams,
                                    warp_set="time")
    valset_aug_freq = TextMelLoader(hparams.validation_files, hparams,
                                    warp_set="freq")
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    train_aug_set = torch.utils.data.ConcatDataset(
        [trainset, trainset_aug_time, trainset_aug_freq])
    train_sampler = DistributedSampler(train_aug_set) \
        if hparams.distributed_run else None
    if hparams.distributed_run:
        # Sampler owns per-rank ordering; no local shuffle.
        train_loader = DataLoader(train_aug_set, num_workers=35, shuffle=False,
                                  sampler=train_sampler,
                                  batch_size=hparams.batch_size,
                                  pin_memory=False, drop_last=True,
                                  collate_fn=collate_fn)
    else:
        # FIX: previously rebuilt an identical ConcatDataset here; reuse the
        # one constructed above.
        train_loader = DataLoader(train_aug_set, num_workers=35, shuffle=True,
                                  sampler=None,
                                  batch_size=hparams.batch_size,
                                  pin_memory=False, drop_last=True,
                                  collate_fn=collate_fn)
    # ConcatDataset keeps references to the member datasets, so dropping these
    # names only tidies the local namespace.
    del trainset, trainset_aug_time, trainset_aug_freq
    return train_loader, valset, valset_aug_time, valset_aug_freq, collate_fn
def prepare_dataloaders(hparams):
    """Build train loader, val set and collate fn; source depends on whether
    mels are precomputed on disk."""
    # Get data, data loaders and collate function ready
    ## `if not` usage: this branch runs when the flag is False
    ## (i.e. compute mels from audio rather than loading them from disk).
    if not hparams.load_mel_from_disk:
        trainset = TextMelLoader(hparams.training_files,
                                 hparams.polyphone_dict_files,
                                 hparams.mask_dict_files, hparams)
        valset = TextMelLoader(hparams.validation_files,
                               hparams.polyphone_dict_files,
                               hparams.mask_dict_files, hparams)
    else:
        trainset = TextMelLoader(hparams.mel_training_files,
                                 hparams.polyphone_dict_files,
                                 hparams.mask_dict_files, hparams)
        valset = TextMelLoader(hparams.mel_validation_files,
                               hparams.polyphone_dict_files,
                               hparams.mask_dict_files, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                hparams.num_classes)
    if hparams.distributed_run:  ## False
        ## For multi-node/multi-GPU training each GPU must read different
        ## data; the sampler restricts this DataLoader to a distinct subset
        ## of the full dataset.
        ## It assigns each worker process its own slice of the dataset to
        ## avoid duplication across processes.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    ## Build the iterable data loader.
    train_loader = DataLoader(trainset, num_workers=0, shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size, pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)
    ## dataset: Dataset instance deciding where and how data is read;
    ## batch_size: size of each batch; shuffle: reshuffle every epoch or not;
    ## num_workers: number of loading subprocesses; drop_last: drop the final
    ## incomplete batch when the sample count is not divisible by batch_size.
    return train_loader, valset, collate_fn
def mel2audio(mel):
    """Invert a (normalized) mel spectrogram to a waveform via Griffin-Lim.

    De-normalizes with the STFT helper, maps mel back to linear magnitude
    using the pseudo-inverse of the mel filterbank, then runs 60 Griffin-Lim
    iterations.

    NOTE(review): assumes `mel_loader.stft.mel_basis` is acceptable input to
    np.linalg.pinv (i.e. a CPU array-like) — confirm its type/device.
    """
    hp = create_hparams()
    mel_loader = TextMelLoader(hp.training_files, hp)
    ## De-normalize the mel spectrogram.
    mel_spec = mel_loader.stft.spectral_de_normalize(mel)
    # Pseudo-inverse of the mel filterbank: mel -> linear magnitude.
    inv_basis = torch.tensor(np.linalg.pinv(mel_loader.stft.mel_basis))
    mag = torch.matmul(inv_basis, mel_spec)
    signal = griffin_lim(mag.unsqueeze(0), mel_loader.stft.stft_fn,
                         n_iters=60).squeeze(0).numpy()
    print(signal.shape)
    return signal
def prepare_dataloaders(experiment, hparams, requires_durations):
    """Build the train/valid datasets, collate fn and the train DataLoader.

    Returns:
        (train_loader, trainset, valset, collate_fn)
    """
    train_set = TextMelLoader("train", experiment, hparams, requires_durations)
    val_set = TextMelLoader("valid", experiment, hparams, requires_durations)
    collate = TextMelCollate(hparams.n_frames_per_step)

    # Under DDP the sampler partitions data per rank, so the loader itself
    # must not shuffle; otherwise shuffle locally.
    sampler = DistributedSampler(train_set) if hparams.distributed_run else None
    loader = DataLoader(train_set,
                        num_workers=1,
                        shuffle=sampler is None,
                        sampler=sampler,
                        batch_size=hparams.batch_size,
                        pin_memory=False,
                        drop_last=True,
                        collate_fn=collate)
    return loader, train_set, val_set, collate
def prepare_dataloaders(hparams, audio_offset=0):
    """Build the train loader only (no validation set in this variant).

    Args:
        hparams: hyper-parameter namespace.
        audio_offset: sample offset forwarded to the TextMelLoader.

    Returns:
        (train_loader, None, collate_fn, train_sampler, trainset)
    """
    # Get data, data loaders and collate function ready
    trainset = TextMelLoader(hparams.training_files, hparams, TBPTT=False,
                             check_files=False, verbose=True,
                             audio_offset=audio_offset)
    collate_fn = TextMelCollate(hparams)
    if hparams.distributed_run:
        # Deterministic per-rank partitioning, no sampler-side shuffling.
        train_sampler = DistributedSampler(trainset, shuffle=False)
    else:
        train_sampler = None
    train_loader = DataLoader(trainset, num_workers=hparams.num_workers,
                              shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,  # default pin_memory=False, True should allow async memory transfers
                              # Causes very random CUDA errors (after like 4+ hours)
                              drop_last=True, collate_fn=collate_fn)
    return train_loader, None, collate_fn, train_sampler, trainset
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Run Tacotron2-encoder + WaveGlow inference over a filelist and save
    each resulting mel tensor to `output_dir` as `{i}_synthesis.pt`.

    NOTE(review): `sampling_rate` and `denoiser_strength` are currently
    unused on the active path (the denoiser application is commented out).
    """
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)
    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()
    test_loader = DataLoader(testset, num_workers=0, shuffle=False,
                             sampler=None, batch_size=1, pin_memory=False,
                             drop_last=True, collate_fn=collate_fn)
    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()
    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = \
            parse_batch(batch)
        # Encoder outputs from the pretrained Tacotron 2 feed WaveGlow.
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))
        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
        '''if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE'''
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
def process_one(index):
    """Extract (text, mel, speaker_id, f0) for one utterance and save each
    tensor as a .npy file under a per-index directory.

    Returns the processed index (useful for progress reporting from workers).
    """
    global text_mel_loader
    global metadata_path
    global output_dir
    # Build the loader lazily, once per worker process.
    if text_mel_loader is None:
        text_mel_loader = TextMelLoader(metadata_path, hparams=hp,
                                        mode='preprocess')
    text, mel, speaker_id, f0 = text_mel_loader[index]
    item_dir = output_dir.joinpath(format_index(index))
    item_dir.mkdir(exist_ok=True)
    # Save in the fixed order: text, mel, speaker, f0.
    outputs = (("text.npy", text), ("mel.npy", mel),
               ("speaker.npy", speaker_id), ("f0.npy", f0))
    for fname, tensor in outputs:
        np.save(item_dir.joinpath(fname), tensor.numpy(), allow_pickle=False)
    return index
def main():
    """Data-dependent initialization (DDI): run one training batch through the
    DDI flow generator and save the initialized weights as ddi_G.pth."""
    hps = utils.get_hparams()
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)
    torch.manual_seed(hps.train.seed)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=True,
                              batch_size=hps.train.batch_size, pin_memory=True,
                              drop_last=True, collate_fn=collate_fn)

    generator = FlowGenerator_DDI(speaker_dim=hps.model.speaker_embedding,
                                  n_vocab=len(symbols),
                                  out_channels=hps.data.n_mel_channels,
                                  **hps.model).cuda()
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)
    generator.train()
    # A single forward pass is enough for data-dependent init, hence `break`.
    for batch_idx, (x, x_lengths, y, y_lengths,
                    speaker_embedding) in enumerate(train_loader):
        x, x_lengths = x.cuda(), x_lengths.cuda()
        y, y_lengths = y.cuda(), y_lengths.cuda()
        speaker_embedding = speaker_embedding.cuda()
        _ = generator(x, x_lengths, speaker_embedding, y, y_lengths, gen=False)
        break
    utils.save_checkpoint(generator, optimizer_g, hps.train.learning_rate, 0,
                          os.path.join(hps.model_dir, "ddi_G.pth"))
def prepare_speaker_set(hparams):
    """Build endless iterators of female/male Mellotron speaker ids.

    Maps LibriTTS speaker ids to Mellotron ids via the training filelist,
    filters by sex and recorded minutes, shuffles, and wraps each list in
    itertools.cycle.

    Returns:
        (female_speakers, male_speakers) — infinite iterators of ids.
    """
    # Define Speakers Set
    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist_22k_skipped.txt",
        hparams).speaker_ids
    # FIX: sep was ' *\| *' in a non-raw string — '\|' is an invalid escape
    # (SyntaxWarning on modern Python). The raw string has the same value.
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python', header=None, comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    # Speakers absent from the training filelist get -1 (filtered out below).
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids.get(x, -1))
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    return female_speakers, male_speakers
def main():
    """Data-dependent initialization (DDI) with optional warm start.

    Runs one training batch through the DDI flow generator, then saves
    ddi_G.pth — starting from `pretrained.pth` in the model dir if present.
    """
    hps = utils.get_hparams()
    logger = utils.get_logger(hps.model_dir)
    logger.info(hps)
    utils.check_git_hash(hps.model_dir)
    torch.manual_seed(hps.train.seed)

    train_dataset = TextMelLoader(hps.data.training_files, hps.data)
    collate_fn = TextMelCollate(1)
    train_loader = DataLoader(train_dataset, num_workers=8, shuffle=True,
                              batch_size=hps.train.batch_size, pin_memory=True,
                              drop_last=True, collate_fn=collate_fn)

    generator = FlowGenerator_DDI(
        len(symbols),
        out_channels=hps.data.n_mel_channels,
        **hps.model).cuda()
    optimizer_g = commons.Adam(generator.parameters(),
                               scheduler=hps.train.scheduler,
                               dim_model=hps.model.hidden_channels,
                               warmup_steps=hps.train.warmup_steps,
                               lr=hps.train.learning_rate,
                               betas=hps.train.betas,
                               eps=hps.train.eps)
    generator.train()
    # A single forward pass suffices for data-dependent init, hence `break`.
    for batch_idx, (x, x_lengths, y, y_lengths) in enumerate(train_loader):
        x, x_lengths = x.cuda(), x_lengths.cuda()
        y, y_lengths = y.cuda(), y_lengths.cuda()
        _ = generator(x, x_lengths, y, y_lengths, gen=False)
        break

    # check for pretrained and load it without a an optimizer
    pretrained_checkpoint_path = os.path.join(hps.model_dir, "pretrained.pth")
    if os.path.isfile(pretrained_checkpoint_path):
        logger.info("Loading pretrained checkpoint: %s"
                    % pretrained_checkpoint_path)
        model, optimizer, learning_rate, iteration = utils.load_checkpoint(
            pretrained_checkpoint_path, generator)
        utils.save_checkpoint(model, optimizer_g, hps.train.learning_rate, 0,
                              os.path.join(hps.model_dir, "ddi_G.pth"))
    else:
        utils.save_checkpoint(generator, optimizer_g,
                              hps.train.learning_rate, 0,
                              os.path.join(hps.model_dir, "ddi_G.pth"))
# --- Inference setup (top-level script) ---------------------------------
# Loads the supervised checkpoint, WaveGlow vocoder + denoiser, the CMU
# dictionary, a test filelist loader, and resolves the target speaker id.
hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
speaker = "nes"
# NOTE(review): absolute machine-specific paths — parameterize before reuse.
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
dataloader = DataLoader(test_set, num_workers=1, shuffle=False,
                        batch_size=1, pin_memory=False, drop_last=False,
                        collate_fn=datacollate)
# Speaker-id lookup comes from the training filelist.
speaker_ids = TextMelLoader(hparams.training_files, hparams).speaker_ids
speaker_id = torch.LongTensor([speaker_ids[speaker]]).cuda()
# Report parameter counts for both networks.
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("total_num_params: {}".format(pytorch_total_params))
waveglow_total_params = sum(p.numel() for p in waveglow.parameters())
print("waveglow_num_params: {}".format(waveglow_total_params))
"--skip_existing", type=bool, default=True, help="Whether to overwrite existing files with the same name. ") parser.add_argument( "--hparams", type=str, default="", help= "Hyperparameter overrides as a comma-separated list of name-value pairs" ) args = parser.parse_args() metadata_path = args.metadata_path text_mel_loader = TextMelLoader(metadata_path, hparams=hp, mode='preprocess') output_dir = args.output_dir output_dir.mkdir(exist_ok=True, parents=True) fpath = output_dir.joinpath('speaker_ids.json') speaker_ids = text_mel_loader.speaker_ids json.dump(speaker_ids, open(fpath, 'wt', encoding='utf8'), indent=4, ensure_ascii=False) # Preprocess the dataset process_many(args.n_processes, skip_existing=args.skip_existing)