def test_minmax():
    # Pick linguistic features for testing
    X, _ = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]

    X_min, X_max = P.minmax(X)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    x = X[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Need to specify (min, max) or (scale_, min_)
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # Explicit scale_ and min_
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For padded dataset
    X, _ = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get the same results with padded features
    X_min_hat, X_max_hat = P.minmax(X, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max)
    assert np.allclose(x, x_hat)

    x_hat = P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                               scale_=scale_, min_=min_)
    assert np.allclose(x, x_hat)
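# A minimal reference sketch of the scaling the test above exercises
# (assumption: P.minmax_scale follows the same feature_range semantics as
# sklearn's MinMaxScaler; minmax_scale_ref is a hypothetical helper, not a
# library function):
#   scale_ = (hi - lo) / (X_max - X_min)
#   min_   = lo - X_min * scale_
#   x_scaled = x * scale_ + min_
import numpy as np

def minmax_scale_ref(x, X_min, X_max, feature_range=(0, 0.99)):
    lo, hi = feature_range
    scale_ = (hi - lo) / (X_max - X_min)
    min_ = lo - X_min * scale_
    return x * scale_ + min_

rng = np.random.RandomState(0)
X_toy = rng.rand(100, 5) * 10  # stand-in dataset
X_min_toy, X_max_toy = X_toy.min(0), X_toy.max(0)
x_scaled_toy = minmax_scale_ref(X_toy[0], X_min_toy, X_max_toy)
assert x_scaled_toy.min() >= 0 and x_scaled_toy.max() <= 0.99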
def _get_x_stat(self):
    return minmax(self.xs, self._get_lengths(self.xs))
                                              last_sample_count=last_sample_count)
        data_std = np.sqrt(data_var)

        np.save(join(data_dir, "data_mean"), data_mean)
        np.save(join(data_dir, "data_var"), data_var)

        if hp.generator_params["in_dim"] is None:
            hp.generator_params["in_dim"] = data_mean.shape[-1]
        if hp.generator_params["out_dim"] is None:
            hp.generator_params["out_dim"] = data_mean.shape[-1]

        # Dataset loaders
        dataset_loaders = get_vc_data_loaders(X, Y, data_mean, data_std)
    else:
        ty = "acoustic" if hp == hparams.tts_acoustic else "duration"
        X_data_min, X_data_max = P.minmax(X[phase])
        Y_data_mean, Y_data_var = P.meanvar(Y[phase])
        Y_data_std = np.sqrt(Y_data_var)

        np.save(join(data_dir, "X_{}_data_min".format(ty)), X_data_min)
        np.save(join(data_dir, "X_{}_data_max".format(ty)), X_data_max)
        np.save(join(data_dir, "Y_{}_data_mean".format(ty)), Y_data_mean)
        np.save(join(data_dir, "Y_{}_data_var".format(ty)), Y_data_var)

        if hp.generator_params["in_dim"] is None:
            hp.generator_params["in_dim"] = X_data_min.shape[-1]
        if hp.generator_params["out_dim"] is None:
            hp.generator_params["out_dim"] = Y_data_mean.shape[-1]
        if hp.discriminator_params["in_dim"] is None:
            sizes = get_static_stream_sizes(hp.stream_sizes,
                                            hp.has_dynamic_features,
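# Sketch of the matching load path at synthesis time (assumptions: ty is
# "acoustic", np.save appended the ".npy" suffix to the names used above,
# and x / y_scaled are feature matrices defined by the caller; minmax_scale
# and inv_scale are nnmnkwii.preprocessing functions):
X_data_min = np.load(join(data_dir, "X_acoustic_data_min.npy"))
X_data_max = np.load(join(data_dir, "X_acoustic_data_max.npy"))
Y_data_mean = np.load(join(data_dir, "Y_acoustic_data_mean.npy"))
Y_data_var = np.load(join(data_dir, "Y_acoustic_data_var.npy"))

x_scaled = P.minmax_scale(x, X_data_min, X_data_max, feature_range=(0.01, 0.99))
y = P.inv_scale(y_scaled, Y_data_mean, np.sqrt(Y_data_var))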
        FeatureFileSource(os.path.join(DATA_ROOT, "X_{}".format(ty)),
                          dim=x_dim))
    Y[ty] = FileSourceDataset(
        FeatureFileSource(os.path.join(DATA_ROOT, "Y_{}".format(ty)),
                          dim=y_dim))
    # This triggers file loads, but the cost is negligible.
    utt_lengths[ty] = [len(x) for x in X[ty]]

X_min = {}
X_max = {}
Y_mean = {}
Y_var = {}
Y_scale = {}

for typ in ["acoustic", "duration"]:
    X_min[typ], X_max[typ] = minmax(X[typ], utt_lengths[typ])
    Y_mean[typ], Y_var[typ] = meanvar(Y[typ], utt_lengths[typ])
    Y_scale[typ] = np.sqrt(Y_var[typ])

fname_list = [
    'X_min.pkl', 'X_max.pkl', 'Y_mean.pkl', 'Y_var.pkl', 'Y_scale.pkl'
]
with ExitStack() as stack:
    f = [
        stack.enter_context(open(os.path.join(DATA_ROOT, fname), 'wb'))
        for fname in fname_list
    ]
    pickle.dump(X_min, f[0])
    pickle.dump(X_max, f[1])
    pickle.dump(Y_mean, f[2])
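# Sketch of reading the pickled statistics back, mirroring the
# ExitStack-based dump (assumptions: same DATA_ROOT and fname_list as
# above, and all five files were written):
with ExitStack() as stack:
    f = [
        stack.enter_context(open(os.path.join(DATA_ROOT, fname), 'rb'))
        for fname in fname_list
    ]
    X_min, X_max, Y_mean, Y_var, Y_scale = [pickle.load(fh) for fh in f]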
                             train=train))
        Y[ty][phase] = FileSourceDataset(
            BinaryFileSource(join(DATA_ROOT, "Y_{}".format(ty)),
                             dim=y_dim,
                             train=train))
        utt_lengths[ty][phase] = np.array([len(x) for x in X[ty][phase]],
                                          dtype=int)

X_min = {}
X_max = {}
Y_mean = {}
Y_var = {}
Y_scale = {}

for typ in ["acoustic"]:
    X_min[typ], X_max[typ] = minmax(X[typ]["train"], utt_lengths[typ]["train"])
    Y_mean[typ], Y_var[typ] = meanvar(Y[typ]["train"], utt_lengths[typ]["train"])
    Y_scale[typ] = np.sqrt(Y_var[typ])

import torch
from torch import nn, optim
from torch.autograd import Variable
from torch.utils import data as data_utils
import torch.nn.functional as F
from tqdm import tnrange, tqdm

z_dim = args.z_dim
dropout = args.dropout_ratio
def create_loader(test=False):
    DATA_ROOT = "./data/basic5000"
    X = {"acoustic": {}}
    Y = {"acoustic": {}}
    utt_lengths = {"acoustic": {}}

    for ty in ["acoustic"]:
        for phase in ["train", "test"]:
            train = phase == "train"
            x_dim = (duration_linguistic_dim
                     if ty == "duration" else acoustic_linguisic_dim)
            y_dim = duration_dim if ty == "duration" else acoustic_dim
            X[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "X_{}".format(ty)),
                                 dim=x_dim,
                                 train=train))
            Y[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "Y_{}".format(ty)),
                                 dim=y_dim,
                                 train=train))
            utt_lengths[ty][phase] = np.array(
                [len(x) for x in X[ty][phase]], dtype=int)

    X_min = {}
    X_max = {}
    Y_mean = {}
    Y_var = {}
    Y_scale = {}

    for typ in ["acoustic"]:
        X_min[typ], X_max[typ] = minmax(X[typ]["train"],
                                        utt_lengths[typ]["train"])
        Y_mean[typ], Y_var[typ] = meanvar(Y[typ]["train"],
                                          utt_lengths[typ]["train"])
        Y_scale[typ] = np.sqrt(Y_var[typ])

    mora_index_lists = sorted(
        glob(join("data/basic5000/mora_index", "squeezed_*.csv")))
    mora_index_lists_for_model = [
        np.loadtxt(path).reshape(-1) for path in mora_index_lists
    ]

    # Split every 20 utterances: index 1, 21, 41, ... is held out for
    # testing, index 0, 20, 40, ... for validation, and the rest for
    # training.
    train_mora_index_lists = []
    test_mora_index_lists = []
    test_not_valid = []
    for i, mora_i in enumerate(mora_index_lists_for_model):
        if (i - 1) % 20 == 0:  # test
            if test:
                test_not_valid.append(i)
        elif i % 20 == 0:  # valid
            test_mora_index_lists.append(mora_i)
        else:
            train_mora_index_lists.append(mora_i)

    X_acoustic_train = [
        minmax_scale(x, X_min["acoustic"], X_max["acoustic"],
                     feature_range=(0.01, 0.99))
        for x in X["acoustic"]["train"]
    ]
    Y_acoustic_train = list(Y["acoustic"]["train"])
    # Y_acoustic_train = [scale(y, Y_mean["acoustic"], Y_scale["acoustic"])
    #                     for y in Y["acoustic"]["train"]]

    X_acoustic_test = [
        minmax_scale(x, X_min["acoustic"], X_max["acoustic"],
                     feature_range=(0.01, 0.99))
        for x in X["acoustic"]["test"]
    ]
    Y_acoustic_test = list(Y["acoustic"]["test"])
    # Y_acoustic_test = [scale(y, Y_mean["acoustic"], Y_scale["acoustic"])
    #                    for y in Y["acoustic"]["test"]]

    train_loader = [
        [X_acoustic_train[i], Y_acoustic_train[i], train_mora_index_lists[i]]
        for i in range(len(train_mora_index_lists))
    ]
    test_loader = [
        [X_acoustic_test[i], Y_acoustic_test[i], test_mora_index_lists[i]]
        for i in range(len(test_mora_index_lists))
    ]

    if test:
        # Also return the indices of the held-out test utterances
        return train_loader, test_loader, test_not_valid
    return train_loader, test_loader
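# Usage sketch (assumes the module-level names referenced above, e.g.
# acoustic_linguisic_dim and acoustic_dim, are defined):
train_loader, test_loader = create_loader()
x, y, mora_index = train_loader[0]
print(x.shape, y.shape, mora_index.shape)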
Y_silenceIdx = FileSourceDataset(
    SilenceSampleIdxSource(data_root=DATA_ROOT, frame_shift_in_micro_sec=625))
X_linguistic = FileSourceDataset(
    LinguisticSource(data_root=DATA_ROOT, question_path=QUESTION_PATH))
X_pyworld = FileSourceDataset(PyworldSource(data_root=DATA_ROOT))
X_melmfcc = FileSourceDataset(MelspecMfccSource(data_root=DATA_ROOT))

print('X_linguistic with dimension = {}'.format(X_linguistic[0].shape[1]))
print('X_pyworld with dimension = {}'.format(X_pyworld[0].shape[1]))
print('X_melmfcc with dimension = {}'.format(X_melmfcc[0].shape[1]))

# Calculate scale factors:
print('Calculating scale factors: this may take more than 10 minutes...')
# wav_len = [len(y) for y in Y]
# y_min, y_max = minmax(Y, wav_len)
scale_factors = {}
scale_factors['linguistic_len'] = [len(x) for x in X_linguistic]
scale_factors['linguistic_min'], scale_factors['linguistic_max'] = minmax(
    X_linguistic, scale_factors['linguistic_len'])

scale_factors['pyworld_len'] = [len(x) for x in X_pyworld]
scale_factors['pyworld_mean'], scale_factors['pyworld_var'] = meanvar(
    X_pyworld, scale_factors['pyworld_len'])
scale_factors['pyworld_std'] = np.sqrt(scale_factors['pyworld_var'])
scale_factors['pyworld_min'], scale_factors['pyworld_max'] = minmax(
    X_pyworld, scale_factors['pyworld_len'])

scale_factors['melmfcc_len'] = [len(x) for x in X_melmfcc]
scale_factors['melmfcc_mean'], scale_factors['melmfcc_var'] = meanvar(
    X_melmfcc, scale_factors['melmfcc_len'])
scale_factors['melmfcc_std'] = np.sqrt(scale_factors['melmfcc_var'])
scale_factors['melmfcc_min'], scale_factors['melmfcc_max'] = minmax(
    X_melmfcc, scale_factors['melmfcc_len'])

np.save(DST_ROOT + 'scale_factors.npy', scale_factors)
'''
To load scale_factors (allow_pickle is required to load a saved dict):
scale_factors = np.load(DST_ROOT + 'scale_factors.npy', allow_pickle=True).item()
'''
scale_factors = np.load(DST_ROOT + 'scale_factors.npy', allow_pickle=True).item()

# <wav>
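# Sketch: normalizing one pyworld feature matrix with the loaded statistics
# (plain mean/std arithmetic; nnmnkwii's scale() would do the same):
x = X_pyworld[0]
x_norm = (x - scale_factors['pyworld_mean']) / scale_factors['pyworld_std']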
if not exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

# utt_lengths = []
X = FileSourceDataset(TextDataSource())
Mel = FileSourceDataset(MelSpecDataSource())
Y = FileSourceDataset(LinearSpecDataSource())
print("Size of dataset for {}: {}".format(phase, len(X)))

ty = "acoustic" if hp == hparams_gan.tts_acoustic else "duration"
X_data_min, X_data_max = P.minmax(X)
Mel_data_mean, Mel_data_var = P.meanvar(Mel)
Mel_data_std = np.sqrt(Mel_data_var)

np.save(join(data_dir, "X_{}_data_min".format(ty)), X_data_min)
np.save(join(data_dir, "X_{}_data_max".format(ty)), X_data_max)
np.save(join(data_dir, "Mel_{}_data_mean".format(ty)), Mel_data_mean)
np.save(join(data_dir, "Mel_{}_data_var".format(ty)), Mel_data_var)

if hp.discriminator_params["in_dim"] is None:
    sizes = get_static_stream_sizes(hp.stream_sizes,
                                    hp.has_dynamic_features,
                                    len(hp.windows))
    D = int(np.array(sizes[hp.adversarial_streams]).sum())
    if hp.adversarial_streams[0]:
        D -= hp.mask_nth_mgc_for_adv_loss
    if hp.discriminator_linguistic_condition:
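# Sketch (normalize_example is a hypothetical helper; assumes
# nnmnkwii.preprocessing imported as P and the statistics computed above)
# of normalizing one example before it is fed to the model:
def normalize_example(x, mel):
    x = P.minmax_scale(x, X_data_min, X_data_max, feature_range=(0.01, 0.99))
    mel = P.scale(mel, Mel_data_mean, Mel_data_std)
    return x, mel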