def get_data_loaders(dump_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps, audio.get_hop_size(), True)
    else:
        max_steps = None

    for phase in ["train_no_dev", "dev"]:
        train = phase == "train_no_dev"
        X = FileSourceDataset(
            RawAudioDataSource(join(dump_root, phase),
                               speaker_id=speaker_id,
                               max_steps=max_steps, cin_pad=hparams.cin_pad,
                               hop_size=audio.get_hop_size()))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(join(dump_root, phase),
                                  speaker_id=speaker_id,
                                  max_steps=max_steps, cin_pad=hparams.cin_pad,
                                  hop_size=audio.get_hop_size()))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
            # Make sure there are no sorting bugs, see
            # https://github.com/r9y9/wavenet_vocoder/issues/130
            # (np.int was removed from NumPy; use np.int64 instead)
            sampler_idx = np.asarray(sorted(int(s) for s in sampler))
            assert (sampler_idx == np.arange(len(sampler_idx), dtype=np.int64)).all()
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(
            dataset, batch_size=hparams.batch_size, drop_last=True,
            num_workers=hparams.num_workers, sampler=sampler, shuffle=shuffle,
            collate_fn=collate_fn, pin_memory=hparams.pin_memory)

        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
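# `ensure_divisible` is referenced above but not defined in this snippet.
# A minimal sketch of the behavior the call site needs (an assumption, not
# necessarily the verbatim wavenet_vocoder helper): round `length` to a
# multiple of `divisible_by`, rounding down when `lower` is True.
def ensure_divisible(length, divisible_by=256, lower=True):
    if length % divisible_by == 0:
        return length
    if lower:
        return length - length % divisible_by
    return length + (divisible_by - length % divisible_by)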
def get_data_loaders(config):
    data_loaders = {}

    for phase in ["train_no_dev", "dev"]:
        in_dir = to_absolute_path(config.data[phase].in_dir)
        out_dir = to_absolute_path(config.data[phase].out_dir)
        train = phase.startswith("train")
        in_feats = FileSourceDataset(NpyFileSource(in_dir))
        out_feats = FileSourceDataset(NpyFileSource(out_dir))

        in_feats = MemoryCacheDataset(in_feats, cache_size=10000)
        out_feats = MemoryCacheDataset(out_feats, cache_size=10000)

        dataset = Dataset(in_feats, out_feats)
        data_loaders[phase] = data_utils.DataLoader(
            dataset, batch_size=config.data.batch_size,
            collate_fn=collate_fn, pin_memory=config.data.pin_memory,
            num_workers=config.data.num_workers, shuffle=train)

        for x, y, l in data_loaders[phase]:
            logger.info(f"{x.shape}, {y.shape}, {l.shape}")

    return data_loaders
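# The logging loop above unpacks (x, y, lengths), so collate_fn presumably
# zero-pads variable-length utterance pairs into fixed-size batches. A minimal
# sketch of such a collate_fn (an assumption, not this project's exact code):
import numpy as np
import torch

def collate_fn_sketch(batch):
    lengths = torch.tensor([len(x) for x, _ in batch], dtype=torch.long)
    max_len = int(lengths.max())

    def pad(a):
        # Zero-pad a (T, D) array to (max_len, D)
        out = np.zeros((max_len,) + a.shape[1:], dtype=a.dtype)
        out[:len(a)] = a
        return out

    x = torch.from_numpy(np.stack([pad(x) for x, _ in batch]))
    y = torch.from_numpy(np.stack([pad(y) for _, y in batch]))
    return x, y, lengths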
def test_real_metrics():
    _, source = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(source)
    lengths = [len(x) for x in X]
    X = X.asarray()
    mgc = X[:, :, :source.mgc_dim // 3]
    lf0 = X[:, :, source.lf0_start_idx]
    # np.int was removed from NumPy; use np.int64 instead
    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(np.int64)
    bap = X[:, :, source.bap_start_idx]

    mgc_tgt = mgc + 0.01
    lf0_tgt = lf0 + 0.01
    vuv_tgt = vuv.copy()
    bap_tgt = bap + 0.01

    mcd = metrics.melcd(mgc, mgc_tgt, lengths)
    bap_mcd = metrics.melcd(bap, bap_tgt, lengths)
    lf0_mse = metrics.lf0_mean_squared_error(lf0, vuv, lf0_tgt, vuv_tgt, lengths)
    vuv_err = metrics.vuv_error(vuv, vuv_tgt)

    assert mcd > 0
    assert bap_mcd > 0
    assert lf0_mse > 0
    assert vuv_err == 0.0
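# For reference, a self-contained sketch of the frame-averaged mel-cepstral
# distortion that metrics.melcd computes (based on the standard definition,
# in dB; the constant is 10 / ln(10) * sqrt(2)):
import numpy as np

def naive_melcd(X, Y):
    # X, Y: (T, D) mel-cepstra with matching frame counts
    const = 10.0 / np.log(10.0) * np.sqrt(2.0)
    return const * np.mean(np.sqrt(np.sum((X - Y) ** 2, axis=-1)))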
def get_data_loaders(data_root, speaker_id, test_shuffle=True):
    data_loaders = {}
    local_conditioning = hparams.cin_channels > 0

    for phase in ["train", "test"]:
        train = phase == "train"
        X = FileSourceDataset(
            RawAudioDataSource(data_root, speaker_id=speaker_id,
                               train=train,
                               test_size=hparams.test_size,
                               test_num_samples=hparams.test_num_samples,
                               random_state=hparams.random_state))
        if local_conditioning:
            Mel = FileSourceDataset(
                MelSpecDataSource(data_root, speaker_id=speaker_id,
                                  train=train,
                                  test_size=hparams.test_size,
                                  test_num_samples=hparams.test_num_samples,
                                  random_state=hparams.random_state))
            assert len(X) == len(Mel)
            print("Local conditioning enabled. Shape of a sample: {}.".format(
                Mel[0].shape))
        else:
            Mel = None
        print("[{}]: length of the dataset is {}".format(phase, len(X)))

        if train:
            lengths = np.array(X.file_data_source.lengths)
            # Prepare sampler
            sampler = PartialyRandomizedSimilarTimeLengthSampler(
                lengths, batch_size=hparams.batch_size)
            shuffle = False
        else:
            sampler = None
            shuffle = test_shuffle

        dataset = PyTorchDataset(X, Mel)
        data_loader = data_utils.DataLoader(
            dataset, batch_size=hparams.batch_size,
            num_workers=hparams.num_workers, sampler=sampler, shuffle=shuffle,
            collate_fn=collate_fn, pin_memory=hparams.pin_memory)

        speaker_ids = {}
        if X.file_data_source.multi_speaker:
            for idx, (x, c, g) in enumerate(dataset):
                if g is not None:
                    try:
                        speaker_ids[g] += 1
                    except KeyError:
                        speaker_ids[g] = 1
            if len(speaker_ids) > 0:
                print("Speaker stats:", speaker_ids)

        data_loaders[phase] = data_loader

    return data_loaders
def __init__(self, data_root_dir=DATA_ROOT, train_flag=True, cond_sel='mfcc',
             cache_size=1000, transform=None):
    self.train_flag = train_flag
    self.cond_sel = cond_sel  # 'mfcc' or 'pyspec'
    self.cache_size = cache_size
    self.data_root_dir = data_root_dir

    if self.train_flag is True:
        self.X = FileSourceDataset(WavSource(data_root=data_root_dir,
                                             file_sel_range=[0, 1000]))
    else:
        self.X = FileSourceDataset(WavSource(data_root=data_root_dir,
                                             file_sel_range=[1000, 1132]))
        self.cache_size = 1

    self.utt_lengths = [len(utt) for utt in self.X]
    self.X_raw = MemoryCacheFramewiseDataset(self.X, self.utt_lengths,
                                             self.cache_size)
    self.utt_total_length = len(self.X_raw)
    self.sample_start, self.sample_end = list(), list()

    # This initializes self.sample_start and self.sample_end
    if self.train_flag is True:
        self.rand_flush()
    else:
        self.init_for_test()

    # Feature scaling factors (allow_pickle is required to load a saved dict)
    scf = np.load(self.data_root_dir + '../processed_slt_arctic/scale_factors.npy',
                  allow_pickle=True).item()
    self.pyspec_max = np.max(scf['pyworld_max'][64:64 + 513])  # 11.159795
    self.mfcc_mean = scf['melmfcc_mean'][128:128 + 25]
    self.mfcc_std = scf['melmfcc_std'][128:128 + 25]
def test_jsut():
    DATA_DIR = join(expanduser("~"), "data", "jsut_ver1.1")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyTextDataSource(jsut.TranscriptionDataSource):
        def __init__(self, data_root, subsets):
            super(MyTextDataSource, self).__init__(data_root, subsets)

        def collect_features(self, text):
            return text

    data_source = MyTextDataSource(DATA_DIR, subsets=["basic5000"])
    X1 = FileSourceDataset(data_source)
    assert X1[0] == u"水をマレーシアから買わなくてはならないのです。"

    data_source = MyTextDataSource(DATA_DIR, subsets=["travel1000"])
    X2 = FileSourceDataset(data_source)
    assert X2[0] == u"あなたの荷物は、ロサンゼルスに残っています。"

    # Multiple subsets
    data_source = MyTextDataSource(DATA_DIR, subsets=["basic5000", "travel1000"])
    X3 = FileSourceDataset(data_source)
    assert X3[0] == u"水をマレーシアから買わなくてはならないのです。"
    assert len(X3) == len(X1) + len(X2)

    # All subsets
    data_source = MyTextDataSource(DATA_DIR, subsets=jsut.available_subsets)
    X = FileSourceDataset(data_source)
    # As of 2017/11/2 there were 30 missing wav files, so this should be 7696.
    assert len(X) == 7696

    class MyWavFileDataSource(jsut.WavFileDataSource):
        def __init__(self, data_root, subsets):
            super(MyWavFileDataSource, self).__init__(data_root, subsets)
            self.alpha = pysptk.util.mcepalpha(48000)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            assert fs == 48000
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    data_source = MyWavFileDataSource(DATA_DIR, subsets=["basic5000"])
    X = FileSourceDataset(data_source)
    print(X[0].shape)
def test_vcc2016():
    DATA_DIR = join(expanduser("~"), "data", "vcc2016")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyFileDataSource(vcc2016.WavFileDataSource):
        def __init__(self, data_root, speakers, labelmap=None, max_files=2):
            super(MyFileDataSource, self).__init__(
                data_root, speakers, labelmap=labelmap, max_files=max_files)
            self.alpha = pysptk.util.mcepalpha(16000)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    max_files = 10
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1"], max_files=max_files)
    X = FileSourceDataset(data_source)
    assert len(X) == max_files
    print(X[0].shape)  # warmup collect_features path

    # Multiple speakers
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=max_files)
    X = FileSourceDataset(data_source)
    assert len(X) == max_files

    # Speaker labels
    Y = data_source.labels
    assert np.all(Y[:max_files // 2] == 0)
    assert np.all(Y[max_files // 2:] == 1)

    # Custom speaker ids
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=max_files,
        labelmap={"SF1": 1, "SF2": 0})
    X = FileSourceDataset(data_source)
    Y = data_source.labels
    assert np.all(Y[:max_files // 2] == 1)
    assert np.all(Y[max_files // 2:] == 0)

    # Use all data
    data_source = MyFileDataSource(
        DATA_DIR, speakers=["SF1", "SF2"], max_files=None)
    X = FileSourceDataset(data_source)
    assert len(X) == 162 * 2
def test_dtw_frame_length_adjustment():
    _, X = example_file_data_sources_for_duration_model()
    X = FileSourceDataset(X)
    X_unaligned = X.asarray()
    # This should trigger frame length adjustment
    Y_unaligned = np.pad(X_unaligned, [(0, 0), (5, 0), (0, 0)],
                         mode="constant", constant_values=0)
    Y_unaligned = Y_unaligned[:, :-5, :]
    for aligner in [DTWAligner(),
                    IterativeDTWAligner(n_iter=1, max_iter_gmm=1,
                                        n_components_gmm=1)]:
        X_aligned, Y_aligned = aligner.transform((X_unaligned, Y_unaligned))
        assert X_aligned.shape == Y_aligned.shape
def _get_small_datasets(padded=False, duration=False, padded_length=1000):
    if duration:
        X, Y = example_file_data_sources_for_duration_model()
    else:
        X, Y = example_file_data_sources_for_acoustic_model()

    if padded:
        X = PaddedFileSourceDataset(X, padded_length=padded_length)
        Y = PaddedFileSourceDataset(Y, padded_length=padded_length)
    else:
        X = FileSourceDataset(X)
        Y = FileSourceDataset(Y)
    return X, Y
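# Usage sketch for the helper above (assumes the example data bundled with
# nnmnkwii, which example_file_data_sources_for_* reads):
# X, Y = _get_small_datasets(padded=True, padded_length=1000)
# X[0].shape[0] == 1000   # every utterance zero-padded to the same length
# X, Y = _get_small_datasets(padded=False)
# X[0].shape[0]           # true, variable frame count of the first utterance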
def get_data_loaders(dump_root, speaker_id, hparams=None, rank_id=None, group_size=None):
    """Create the train dataset."""
    local_conditioning = hparams.cin_channels > 0

    if hparams.max_time_steps is not None:
        max_steps = ensure_divisible(hparams.max_time_steps, audio.get_hop_size(), True)
    else:
        max_steps = None

    X = FileSourceDataset(
        RawAudioDataSource(os.path.join(dump_root, 'train_no_dev'),
                           speaker_id=speaker_id,
                           max_steps=max_steps, cin_pad=hparams.cin_pad,
                           hop_size=audio.get_hop_size()))

    if local_conditioning:
        Mel = FileSourceDataset(
            MelSpecDataSource(os.path.join(dump_root, 'train_no_dev'),
                              speaker_id=speaker_id,
                              max_steps=max_steps, cin_pad=hparams.cin_pad,
                              hop_size=audio.get_hop_size()))
        assert len(X) == len(Mel)
        print("Local conditioning enabled. Shape of a sample: {}.".format(
            Mel[0].shape))
    else:
        Mel = None
    print("length of the dataset is {}".format(len(X)))

    length_x = np.array(X.file_data_source.lengths)
    dataset = DualDataset(X, Mel, length_x, batch_size=hparams.batch_size, hparams=hparams)
    sampler = DistributedSampler(dataset, rank_id, group_size, shuffle=True, seed=0)
    data_loaders = de.GeneratorDataset(
        dataset,
        ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"],
        sampler=sampler)

    return data_loaders
def test_ljspeech():
    DATA_DIR = join(expanduser("~"), "data", "LJSpeech-1.0")
    if not exists(DATA_DIR):
        warn("Data doesn't exist at {}".format(DATA_DIR))
        return

    class MyTextDataSource(ljspeech.TranscriptionDataSource):
        def __init__(self, data_root):
            super(MyTextDataSource, self).__init__(data_root)

        def collect_features(self, text):
            return text

    class MyNormalizedTextDataSource(ljspeech.NormalizedTranscriptionDataSource):
        def __init__(self, data_root):
            super(MyNormalizedTextDataSource, self).__init__(data_root)

        def collect_features(self, text):
            return text

    data_source = MyTextDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    assert X[1] == "in being comparatively modern."

    data_source = MyNormalizedTextDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    assert X[1] == "in being comparatively modern."

    class MyWavFileDataSource(ljspeech.WavFileDataSource):
        def __init__(self, data_root):
            super(MyWavFileDataSource, self).__init__(data_root)
            self.alpha = pysptk.util.mcepalpha(22050)

        def collect_features(self, path):
            fs, x = wavfile.read(path)
            assert fs == 22050
            x = x.astype(np.float64)
            f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
            spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
            spectrogram = trim_zeros_frames(spectrogram)
            mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
            return mc.astype(np.float32)

    data_source = MyWavFileDataSource(DATA_DIR)
    X = FileSourceDataset(data_source)
    print(X[0].shape)
def infer(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Modern_DBLSTM_1(args).to(device)

    mfcc_x_test = FileSourceDataset(MFCCSource(args.wav_dir))
    dataset_test = InferenceDataset(mfcc_x_test)
    test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=1,
                                              shuffle=False, num_workers=4)

    model.load_state_dict(torch.load(args.model_name))

    for i, sample in enumerate(test_loader):
        inputs = sample['speech'].to(device)
        wav, filename = mfcc_x_test[i]
        filename_save = join(args.save_dir, split(filename)[1].split(".")[0])
        predicted = model(inputs).detach().cpu().numpy()
        plt.plot(predicted[0, :, :])
        plt.show()
        np.save(filename_save, predicted[0, :, :])
def test_meanvar():
    # Pick acoustic features for testing
    _, X = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]
    X_mean, X_var = P.meanvar(X)
    X_std = np.sqrt(X_var)

    assert np.isfinite(X_mean).all()
    assert np.isfinite(X_var).all()
    assert X_mean.shape[-1] == D
    assert X_var.shape[-1] == D

    _, X_std_hat = P.meanstd(X)
    assert np.allclose(X_std, X_std_hat)

    x = X[0]
    x_scaled = P.scale(x, X_mean, X_std)
    assert np.isfinite(x_scaled).all()

    # For padded dataset
    _, X = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get the same results with padded features
    X_mean_hat, X_var_hat = P.meanvar(X, lengths)
    assert np.allclose(X_mean, X_mean_hat)
    assert np.allclose(X_var, X_var_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_scale(P.scale(x, X_mean, X_std), X_mean, X_std)
    assert np.allclose(x, x_hat, atol=1e-5)
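# A minimal sketch of the statistics P.meanvar computes over a padded dataset
# (hypothetical helper shown for clarity; the library accumulates statistics
# utterance by utterance instead of concatenating everything): pool the valid
# frames of every utterance, then take per-dimension mean and variance.
def naive_meanvar(X, lengths):
    frames = np.concatenate([x[:l] for x, l in zip(X, lengths)], axis=0)
    return frames.mean(axis=0), frames.var(axis=0)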
def Dataloader(data_root, meta_text):
    # Input dataset definitions
    X = FileSourceDataset(TextDataSource(data_root, meta_text))
    Mel = FileSourceDataset(MelSpecDataSource(data_root, meta_text))
    Y = FileSourceDataset(LinearSpecDataSource(data_root, meta_text))

    # Dataset and Dataloader setup
    dataset = PyTorchDatasetWrapper(X, Mel, Y)
    data_loader = data.DataLoader(
        dataset, batch_size=config.batch_size,
        num_workers=config.num_workers, shuffle=True,
        collate_fn=collate_fn, pin_memory=config.pin_memory)
    return data_loader
def get_data_loader(data_dir, collate_fn):
    wav_paths = glob(join(data_dir, "*-wave.npy"))
    if len(wav_paths) != 0:
        X = FileSourceDataset(
            RawAudioDataSource(data_dir, hop_size=audio.get_hop_size(),
                               max_steps=None, cin_pad=hparams.cin_pad))
    else:
        X = None
    C = FileSourceDataset(
        MelSpecDataSource(data_dir, hop_size=audio.get_hop_size(),
                          max_steps=None, cin_pad=hparams.cin_pad))

    # No audio found:
    if X is None:
        assert len(C) > 0
        data_loader = data_utils.DataLoader(
            C, batch_size=hparams.batch_size, drop_last=False,
            num_workers=hparams.num_workers, sampler=None, shuffle=False,
            collate_fn=dummy_collate, pin_memory=hparams.pin_memory)
    else:
        assert len(X) == len(C)
        if C[0].shape[-1] != hparams.cin_channels:
            raise RuntimeError(
                "Invalid cin_channels {}. Expected to be {}.".format(
                    hparams.cin_channels, C[0].shape[-1]))
        dataset = PyTorchDataset(X, C)
        data_loader = data_utils.DataLoader(
            dataset, batch_size=hparams.batch_size, drop_last=False,
            num_workers=0, sampler=None, shuffle=True,
            collate_fn=collate_fn, pin_memory=hparams.pin_memory)

    return data_loader
def initialize_training(checkpoint_path):
    # Input dataset definitions
    X = FileSourceDataset(TextDataSource())
    Mel = FileSourceDataset(MelSpecDataSource())
    Y = FileSourceDataset(LinearSpecDataSource())

    # Dataset and Dataloader setup
    dataset = PyTorchDataset(X, Mel, Y)
    data_loader = data.DataLoader(
        dataset, batch_size=config.batch_size,
        num_workers=config.num_workers, shuffle=True,
        collate_fn=collate_fn, pin_memory=config.pin_memory)

    # Model
    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=config.embedding_dim,
                     mel_dim=config.num_mels,
                     linear_dim=config.num_freq,
                     r=config.outputs_per_step,
                     padding_idx=config.padding_idx,
                     use_memory_mask=config.use_memory_mask)

    optimizer = optim.Adam(model.parameters(),
                           lr=config.initial_learning_rate,
                           betas=(config.adam_beta1, config.adam_beta2),
                           weight_decay=config.weight_decay)

    # Load checkpoint
    if checkpoint_path is not None:
        print("Load checkpoint from: {}".format(checkpoint_path))
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        try:
            global_step = checkpoint["global_step"]
            global_epoch = checkpoint["global_epoch"]
        except KeyError:
            print("Warning: unable to restore global step and global epoch!")
            sys.exit(0)

    return model, optimizer, data_loader
def test_empty_dataset():
    class EmptyDataSource(FileDataSource):
        def collect_files(self):
            return []

        def collect_features(self, path):
            pass

    X = FileSourceDataset(EmptyDataSource())

    def __test_outof_range(X):
        print(X[0])

    # Should raise IndexError
    yield raises(IndexError)(__test_outof_range), X
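# For contrast with the empty source above, a minimal working FileDataSource
# sketch (hypothetical *.npy directory layout; the nnmnkwii contract is just
# collect_files -> list of paths, collect_features -> one feature array):
from glob import glob
from os.path import join

import numpy as np

class NpyDirSource(FileDataSource):
    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        return np.load(path)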
def test_minmax():
    # Pick linguistic features for testing
    X, _ = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]
    X_min, X_max = P.minmax(X)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    x = X[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Need to specify (min, max) or (scale_, min_)
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # Explicit scale_ and min_
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For padded dataset
    X, _ = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get the same results with padded features
    X_min_hat, X_max_hat = P.minmax(X, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max)
    assert np.allclose(x, x_hat)

    x_hat = P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                               scale_=scale_, min_=min_)
    assert np.allclose(x, x_hat)
def get_data_loader(hparam, data_dir):
    """Test data loader."""
    wav_paths = glob.glob(os.path.join(data_dir, "*-wave.npy"))
    if wav_paths:
        X = FileSourceDataset(
            RawAudioDataSource(data_dir, hop_size=audio.get_hop_size(),
                               max_steps=None, cin_pad=hparam.cin_pad))
    else:
        X = None
    C = FileSourceDataset(
        MelSpecDataSource(data_dir, hop_size=audio.get_hop_size(),
                          max_steps=None, cin_pad=hparam.cin_pad))
    length_x = np.array(C.file_data_source.lengths)

    if C[0].shape[-1] != hparam.cin_channels:
        raise RuntimeError("Invalid cin_channels {}. Expected to be {}.".format(
            hparam.cin_channels, C[0].shape[-1]))

    dataset = DualDataset(X, C, length_x, batch_size=hparam.batch_size, hparams=hparam)
    data_loader = de.GeneratorDataset(
        dataset,
        ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"])

    return data_loader, dataset
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(config.pretty())

    device = torch.device("cuda" if use_cuda else "cpu")
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    model_config = OmegaConf.load(to_absolute_path(config.model.model_yaml))
    model = hydra.utils.instantiate(model_config.netG).to(device)
    checkpoint = torch.load(to_absolute_path(config.model.checkpoint),
                            map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint["state_dict"])

    scaler = joblib.load(to_absolute_path(config.out_scaler_path))

    in_feats = FileSourceDataset(NpyFileSource(in_dir))

    with torch.no_grad():
        for idx in tqdm(range(len(in_feats))):
            feats = torch.from_numpy(in_feats[idx]).unsqueeze(0).to(device)
            out = model(feats, [feats.shape[1]]).squeeze(0).cpu().data.numpy()
            out = scaler.inverse_transform(out)

            # Apply MLPG if necessary
            if np.any(model_config.has_dynamic_features):
                windows = get_windows(3)
                out = multi_stream_mlpg(
                    out, scaler.var_, windows, model_config.stream_sizes,
                    model_config.has_dynamic_features)

            name = basename(in_feats.collected_files[idx][0])
            out_path = join(out_dir, name)
            np.save(out_path, out, allow_pickle=False)
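# `get_windows(3)` above is not defined in this snippet. A sketch of the
# conventional static/delta/delta-delta MLPG window set it presumably returns
# (an assumption based on the standard nnmnkwii-style windows):
import numpy as np

def get_windows_sketch(num_window=3):
    windows = [
        (0, 0, np.array([1.0])),             # static
        (1, 1, np.array([-0.5, 0.0, 0.5])),  # delta
        (1, 1, np.array([1.0, -2.0, 1.0])),  # delta-delta
    ]
    return windows[:num_window]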
# Preventing a Windows-specific error such as MemoryError.
# Also reduces the occurrence of the THAllocator.c 0x05 error in Windows builds of PyTorch.
if platform.system() == "Windows":
    print(" [!] Windows detected - if the THAllocator.c 0x05 error occurs, set num_workers to 1")

assert hparams.name == "deepvoice3"
print(hparams_debug_string())

_frontend = getattr(frontend, hparams.frontend)

os.makedirs(checkpoint_dir, exist_ok=True)

# Input dataset definitions
X = FileSourceDataset(TextDataSource(data_root, speaker_id))
Mel = FileSourceDataset(MelSpecDataSource(data_root, speaker_id))
Y = FileSourceDataset(LinearSpecDataSource(data_root, speaker_id))

# Prepare sampler
frame_lengths = Mel.file_data_source.frame_lengths
sampler = PartialyRandomizedSimilarTimeLengthSampler(
    frame_lengths, batch_size=hparams.batch_size)

# Dataset and Dataloader setup
dataset = PyTorchDataset(X, Mel, Y)
data_loader = data_utils.DataLoader(
    dataset, batch_size=hparams.batch_size,
    num_workers=hparams.num_workers, sampler=sampler,
    collate_fn=collate_fn,
use_phone_alignment = args["--use_phone_alignment"]
question_path = args["--question_path"]

# Features required to train the duration model
# X -> Y
# X: linguistic
# Y: duration
X_duration_source = LinguisticSource(
    add_frame_features=False, subphone_features=None,
    use_phone_alignment=use_phone_alignment, question_path=question_path)
Y_duration_source = DurationFeatureSource(
    use_phone_alignment=use_phone_alignment)

X_duration = FileSourceDataset(X_duration_source)
Y_duration = FileSourceDataset(Y_duration_source)

# Features required to train the acoustic model
# X -> Y
# X: linguistic
# Y: acoustic
subphone_features = "full" if not use_phone_alignment else "coarse_coding"
X_acoustic_source = LinguisticSource(
    add_frame_features=True, subphone_features=subphone_features,
    use_phone_alignment=use_phone_alignment, question_path=question_path)
Y_acoustic_source = AcousticSource(use_phone_alignment=use_phone_alignment)

X_acoustic = FileSourceDataset(X_acoustic_source)
Y_acoustic = FileSourceDataset(Y_acoustic_source)
        # output_mean, output_std = self.output_meanstd
        # input_slice[:, 1:25] = (input_slice[:, 1:25] - input_mean[1:25]) / input_std[1:25]
        # output_slice[:, 1:25] = (output_slice[:, 1:25] - output_mean[1:25]) / output_std[1:25]
        # input_slice_normalised = input_slice
        # output_slice_normalised = output_slice

        # Second index: selecting 24 MCEP features
        # Third index: randomly sampling 128 frames
        input_tensor = torch.FloatTensor(mcep_A_normalised)
        output_tensor = torch.FloatTensor(mcep_B_normalised)

        filename_A = list(self.input_file_source.dataset.collected_files[idx])
        filename_B = list(self.output_file_source.dataset.collected_files[idx])
        # other = OtherParameters(f0_A, f0_B, bap_A, bap_B)

        return (input_tensor, output_tensor, filename_A, filename_B)


if __name__ == '__main__':
    data_source = VCC2016DataSource("/home/boomkin/repos/Voice_Converter_CycleGAN/data", ["SF1"])
    something = FileSourceDataset(data_source)
    print(something.collected_files[15])
    print(something[0].shape)

    # Doesn't provide acceleration
    # class MyInt(int):
def __test_wrong_num_collected_files():
    X = FileSourceDataset(WrongNumberOfCollectedFilesDataSource())
    X[0]
def __test_wrong_num_args():
    X = FileSourceDataset(WrongNumberOfArgsDataSource())
    X[0]
    return mgc.astype(np.float32)


if __name__ == "__main__":
    args = docopt(__doc__)
    print("Command line args:\n", args)
    DATA_ROOT = args["<DATA_ROOT>"]
    source_speaker = args["<source_speaker>"]
    target_speaker = args["<target_speaker>"]
    max_files = int(args["--max_files"])
    dst_dir = args["--dst_dir"]
    overwrite = args["--overwrite"]

    print(hparams_debug_string(hp))

    X_dataset = FileSourceDataset(
        MGCSource(DATA_ROOT, [source_speaker], max_files=max_files))
    Y_dataset = FileSourceDataset(
        MGCSource(DATA_ROOT, [target_speaker], max_files=max_files))

    skip_feature_extraction = exists(join(dst_dir, "X")) \
        and exists(join(dst_dir, "Y"))
    if overwrite:
        skip_feature_extraction = False
    if skip_feature_extraction:
        print("Features seem to be prepared already; skipping feature extraction.")
        sys.exit(0)

    # Create dirs
    for speaker, name in [(source_speaker, "X"), (target_speaker, "Y")]:
        d = join(dst_dir, name)
        print("Destination dir for {}: {}".format(speaker, d))
os.makedirs(checkpoint_dir, exist_ok=True)

# Vocab size
phids = make_phids(DATA_ROOT + '/txt.done.data.tacotron.phseq.train')
outfile = checkpoint_dir + '/ids.json'
with open(outfile, 'w') as f:
    json.dump(phids, f)
print(phids)
print("Length of vocabulary: ", len(phids))
phids = dict(phids)
# sys.exit()

# Input dataset definitions
X = FileSourceDataset(
    PhoneDataSource(DATA_ROOT, phids, "txt.done.data.tacotron.phseq.train"))
Mel = FileSourceDataset(
    MelSpecDataSource(DATA_ROOT, "txt.done.data.tacotron.phseq.train"))
Y = FileSourceDataset(
    LinearSpecDataSource(DATA_ROOT, "txt.done.data.tacotron.phseq.train"))

# Dataset and Dataloader setup
dataset = PyTorchDataset(X, Mel, Y)
data_loader = data_utils.DataLoader(
    dataset, batch_size=hparams.batch_size,
    num_workers=hparams.num_workers, shuffle=True,
    collate_fn=collate_fn, pin_memory=hparams.pin_memory)
log_event_path = args["--log-event-path"]
disable_slack = args["--disable-slack"]

# Flags that control whether to update the discriminator/generator
update_d = w_d > 0
update_g = not discriminator_warmup

os.makedirs(checkpoint_dir, exist_ok=True)

X = {"train": {}, "test": {}}
Y = {"train": {}, "test": {}}
utt_lengths = {"train": {}, "test": {}}
for phase in ["train", "test"]:
    train = phase == "train"
    X[phase] = FileSourceDataset(
        NPYDataSource(inputs_dir, train=train, max_files=max_files))
    Y[phase] = FileSourceDataset(
        NPYDataSource(outputs_dir, train=train, max_files=max_files))
    # Assuming X and Y are time-aligned.
    x_lengths = np.array([len(x) for x in X[phase]])
    y_lengths = np.array([len(y) for y in Y[phase]])
    assert np.allclose(x_lengths, y_lengths)
    utt_lengths[phase] = x_lengths
    print("Size of dataset for {}: {}".format(phase, len(X[phase])))

# Collect stats for normalization (from training data).
# If this becomes performance-heavy (not now), it can be done in a
# separate script.
phase = "train"

# TODO: ugly?
if hp == hparams.vc:
            # Conditional covariance of Y given X, diagonal approximation:
            # Sigma_Y|X = Sigma_YY - Sigma_YX * Sigma_XX^{-1} * Sigma_XY,
            # computed elementwise on the diagonals.
            D[t] = np.diag(self.covarYY[m]) - np.diag(self.covarYX[m]) / \
                np.diag(self.covarXX[m]) * np.diag(self.covarXY[m])

        # Once we have the mean and variance over frames, we can run MLPG
        return mlpg(E, D, self.windows)


source = CMUArcticSpectrumDataSource(data_root=DATA_ROOT,
                                     speakers=["ksp"], max_files=max_files)
target = CMUArcticSpectrumDataSource(data_root=DATA_ROOT,
                                     speakers=["slt"], max_files=max_files)

# Build datasets as 3D tensors (N x T x D)
X = FileSourceDataset(source).asarray(padded_length=1200)
Y = FileSourceDataset(target).asarray(padded_length=1200)

# Alignment
X, Y = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

# Drop the 0th-order (power) coefficient
X, Y = X[:, :, 1:], Y[:, :, 1:]

static_dim = X.shape[-1]
X = apply_each2d_trim(delta_features, X, windows)
Y = apply_each2d_trim(delta_features, Y, windows)

# Joint features
XY = np.concatenate((X, Y), axis=-1).reshape(-1, X.shape[-1] * 2)
def __init__(self, data_root_dir=None, train_mode=False, output_mode='melspec',
             transform=None, data_sel=None):
    self.wav_root_dir = data_root_dir + '/wavs/'
    self.train_mode = train_mode
    self.output_mode = output_mode
    self.transform = transform
    self.data_sel = data_sel
    self.max_len_text = MAX_LEN_TEXT
    self.max_len_melspec = MAX_LEN_MELSPEC
    self.max_len_spec = MAX_LEN_SPEC
    # Max-lengths are required for zero-padding:
    # self.max_len_paired_text = MAX_LEN_PAIRED_TEXT
    # self.max_len_paired_spec = MAX_LEN_PAIRED_SPEC
    # self.max_len_paired_melspec = MAX_LEN_PAIRED_MELSPEC

    # Preparing text:
    self.text_csv_path = data_root_dir + '/metadata.csv'
    self.reduce_punc_table = str.maketrans(
        string.ascii_uppercase, string.ascii_lowercase,
        '0123456789!#"$%&\()*+/:;<=>?@[\\]^_`{|}~')
    self.chr2int_table = dict(
        zip(" ',-." + string.ascii_lowercase, np.arange(0, 31)))

    df = pd.read_csv(self.text_csv_path, index_col=False, sep='|',
                     header=None, memory_map=True)  # memory_map speeds up reading
    nan_rows = df[df[2].isnull()].index.values
    df.iloc[nan_rows, 2] = df.iloc[nan_rows, 1]  # fix NaN values in the dataset
    df = df.drop(1, axis=1)
    df.columns = ['file_id', 'text']
    # Omit foreign-language rows
    df = df.drop(OMIT_DATA_ROWS, axis=0).reset_index(drop=True)

    if self.train_mode is True:
        self.file_ids = df.iloc[0:N_TRAIN, 0]  # file_ids: LJ**-**** (13,000)
        self.texts = df.iloc[0:N_TRAIN, 1]
        if self.data_sel is not None:
            self.file_ids = self.file_ids[self.data_sel].reset_index(drop=True)
            self.texts = self.texts[self.data_sel].reset_index(drop=True)
    else:
        self.file_ids = df.iloc[N_TRAIN:, 0].reset_index(drop=True)  # (100)
        self.texts = df.iloc[N_TRAIN:, 1].reset_index(drop=True)
        if self.data_sel is not None:
            self.file_ids = self.file_ids[self.data_sel].reset_index(drop=True)
            self.texts = self.texts[self.data_sel].reset_index(drop=True)

    # Preparing audio:
    if self.train_mode is True:
        self.spec_features = MemoryCacheDataset(
            FileSourceDataset(SpecSource(wav_data_root=self.wav_root_dir,
                                         file_sel_range=[0, N_TRAIN],
                                         output_mode=self.output_mode)),
            cache_size=len(self.file_ids))
    else:
        self.spec_features = MemoryCacheDataset(
            FileSourceDataset(SpecSource(wav_data_root=self.wav_root_dir,
                                         file_sel_range=[N_TRAIN, None],
                                         output_mode=self.output_mode)),
            cache_size=len(self.file_ids))
    assert len(self.file_ids) == len(self.spec_features)

    # # Pairing: sort and divide by feature lengths, then concat small + large
    # lengths = np.load('mspec_length_train_13000.npy')
    # sorted_by_len = np.argsort(lengths)
    # n_org = len(sorted_by_len)
    # n_pairs = int(n_org / 2)
    # self.paired_items = list()
    # for i in range(n_pairs):
    #     self.paired_items.append([sorted_by_len[i], sorted_by_len[n_org - 1 - i]])