Exemple #1
0
def test_minmax():
    """Exercise P.minmax / minmax_scale / inv_minmax_scale round-trips.

    Covers: finiteness of computed stats, the (min, max) and (scale_, min_)
    call forms, equality of stats computed on padded vs. unpadded datasets,
    and exact inverse-transform recovery.
    """
    # Pick linguistic features for testing
    X, _ = example_file_data_sources_for_acoustic_model()
    X = FileSourceDataset(X)
    # NOTE: lengths must be captured here, before X is rebound to the
    # padded dataset below — they are reused at the padded minmax() call.
    lengths = [len(x) for x in X]
    D = X[0].shape[-1]  # feature dimension (currently unused below)
    X_min, X_max = P.minmax(X)
    assert np.isfinite(X_min).all()
    assert np.isfinite(X_max).all()

    # feature_range upper bound of 0.99 keeps scaled values strictly < 1.
    x = X[0]
    x_scaled = P.minmax_scale(x, X_min, X_max, feature_range=(0, 0.99))
    assert np.max(x_scaled) <= 1
    assert np.min(x_scaled) >= 0
    assert np.isfinite(x_scaled).all()

    # Need to specify (min, max) or (scale_, min_)
    @raises(ValueError)
    def __test_raise1(x, X_min, X_max):
        P.minmax_scale(x)

    @raises(ValueError)
    def __test_raise2(x, X_min, X_max):
        P.inv_minmax_scale(x)

    __test_raise1(x, X_min, X_max)
    __test_raise2(x, X_min, X_max)

    # Explicit scale_ and min_: must reproduce the (min, max) form exactly.
    min_, scale_ = P.minmax_scale_params(X_min, X_max, feature_range=(0, 0.99))
    x_scaled_hat = P.minmax_scale(x, min_=min_, scale_=scale_)
    assert np.allclose(x_scaled, x_scaled_hat)

    # For padded dataset
    X, _ = example_file_data_sources_for_acoustic_model()
    X = PaddedFileSourceDataset(X, 1000)
    # Should get same results with padded features
    X_min_hat, X_max_hat = P.minmax(X, lengths)
    assert np.allclose(X_min, X_min_hat)
    assert np.allclose(X_max, X_max_hat)

    # Inverse transform
    x = X[0]
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, X_min, X_max), X_min, X_max)
    assert np.allclose(x, x_hat)

    # Inverse transform also works with the (scale_, min_) call form.
    x_hat = P.inv_minmax_scale(P.minmax_scale(x, scale_=scale_, min_=min_),
                               scale_=scale_,
                               min_=min_)
    assert np.allclose(x, x_hat)
Exemple #2
0
 def __getitem__(self, idx):
     """Return the idx-th (input, target) pair, normalized for training."""
     features = self.X[idx]
     targets = self.Y[idx]
     # Squash input features into (0.01, 0.99) using precomputed stats.
     scaled_x = P.minmax_scale(features,
                               min_=self.X_data_min,
                               scale_=self.X_data_scale,
                               feature_range=(0.01, 0.99))
     # Standardize targets with the precomputed mean / std.
     scaled_y = P.scale(targets, self.Y_data_mean, self.Y_data_std)
     return scaled_x, scaled_y
Exemple #3
0
def gen_duration(device, label_path, binary_dict, continuous_dict, X_min,
                 X_max, Y_mean, Y_scale, duration_model):
    """Predict state durations for *label_path* and return labels with
    those durations written back in.

    NOTE(review): ``device`` is accepted but never used here — the model
    runs wherever it already lives; confirm whether inputs should be moved.
    """
    ty = "duration"
    labels = hts.load(label_path)

    # Phone-level linguistic features (no frame/subphone expansion).
    feats = fe.linguistic_features(
        labels,
        binary_dict,
        continuous_dict,
        add_frame_features=False,
        subphone_features=None).astype(np.float32)

    # Normalize to the range the duration model was trained on.
    feats = minmax_scale(feats,
                         X_min[ty],
                         X_max[ty],
                         feature_range=(0.01, 0.99))

    # Forward pass on a batch of one utterance.
    duration_model.eval()
    inputs = torch.FloatTensor(feats)
    duration_predicted = duration_model(inputs.unsqueeze(0)).data.numpy()
    print("duration_predicted shape: {}".format(duration_predicted.shape))

    # Undo target normalization, then quantize to whole frames.
    duration_predicted = duration_predicted * Y_scale[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Every state must last at least one frame.
    duration_predicted[duration_predicted <= 0] = 1
    labels.set_durations(duration_predicted)

    return labels
Exemple #4
0
 def _get_x_scaled(self, dataset, seq):
     """Min-max scale *seq* with *dataset*'s input statistics.

     When no dataset is given, *seq* is returned untouched.
     """
     if dataset is None:
         return seq
     stats = dataset.x_stat
     return minmax_scale(seq, stats['min'], stats['max'],
                         feature_range=(0.01, 0.99))
Exemple #5
0
def tts_from_label(models,
                   label_path,
                   X_min,
                   X_max,
                   Y_mean,
                   Y_std,
                   post_filter=False,
                   apply_duration_model=True,
                   coef=1.4,
                   fs=16000):
    """Synthesize a waveform from an HTS label file.

    Pipeline: (optional) duration prediction -> frame-level linguistic
    features -> silence trimming -> input normalization -> acoustic model
    -> waveform generation.

    Args:
        models: dict with "duration" and "acoustic" model entries.
        label_path: path to the HTS-style label file.
        X_min, X_max: per-type input min/max stats (keyed by e.g. "acoustic").
        Y_mean, Y_std: per-type output mean/std stats.
        post_filter: whether to apply post-filtering in gen_waveform.
        apply_duration_model: if False, use the durations already in the labels.
        coef, fs: forwarded to gen_waveform (post-filter coefficient, sample rate).
    """
    duration_model, acoustic_model = models["duration"], models["acoustic"]

    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Predict durations
    if apply_duration_model:
        duration_modified_hts_labels = gen_duration(label_path, duration_model,
                                                    X_min, X_max, Y_mean,
                                                    Y_std)
    else:
        duration_modified_hts_labels = hts.load(label_path)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = P.minmax_scale(linguistic_features,
                                         X_min[ty],
                                         X_max[ty],
                                         feature_range=(0.01, 0.99))

    # Predict acoustic features (whole utterance as a single batch).
    acoustic_model.eval()
    x = Variable(torch.from_numpy(linguistic_features)).float()
    xl = len(x)  # number of frames, passed as the sequence length
    x = x.view(1, -1, x.size(-1))
    # NOTE(review): hp_duration is used here in the acoustic path (not
    # hp_acoustic) — confirm this is intentional.
    x = _generator_input(hp_duration, x)
    x = x.cuda() if use_cuda else x
    acoustic_predicted = acoustic_model(x, [xl]).data.cpu().numpy()
    acoustic_predicted = acoustic_predicted.reshape(
        -1, acoustic_predicted.shape[-1])

    return gen_waveform(acoustic_predicted,
                        Y_mean,
                        Y_std,
                        post_filter,
                        coef=coef,
                        fs=fs)
Exemple #6
0
    def __getitem__(self, idx):
        """Return padded, normalized (input, target) plus original lengths."""
        raw_x = self.xs[idx]
        raw_t = self.ts[idx]
        # Inputs -> (0.01, 0.99); targets -> zero mean / unit std.
        norm_x = minmax_scale(raw_x,
                              self.x_stat['min'],
                              self.x_stat['max'],
                              feature_range=(0.01, 0.99))
        norm_t = scale(raw_t, self.t_stat['mean'], np.sqrt(self.t_stat['var']))

        # Pad both streams; lengths of the unpadded sequences are returned
        # so downstream code can mask the padding.
        return (self._padding(norm_x), self._padding(norm_t),
                len(raw_x), len(raw_t))
 def __getitem__(self, index):
     """Load one (X, Y) pair from disk and normalize it."""
     name = self.metadata[index]
     # Features are stored flat on disk; restore the (frames, dim) layout.
     x_path = os.path.join(self.X_path, '{}.npy'.format(name))
     y_path = os.path.join(self.Y_path, '{}.npy'.format(name))
     raw_x = np.load(x_path).reshape(-1, self.x_dim)
     raw_y = np.load(y_path).reshape(-1, self.y_dim)
     # Normalize with the statistics of the current split (self.train).
     norm_x = minmax_scale(raw_x,
                           self.X_min[self.train],
                           self.X_max[self.train],
                           feature_range=(0.01, 0.99))
     norm_y = scale(raw_y, self.Y_mean[self.train], self.Y_scale[self.train])
     return norm_x, norm_y
Exemple #8
0
    def __getitem__(self, idx):
        """Return one normalized (linguistic, acoustic) training pair."""
        scaled_x = P.minmax_scale(self.X[idx],
                                  min_=self.X_data_min,
                                  scale_=self.X_data_scale,
                                  feature_range=(0.01, 0.99))
        scaled_y = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)

        # Normalization breaks the exact static/delta relationship; rebuild
        # the delta features so that MSE + MGE losses stay consistent.
        if hp.recompute_delta_features:
            scaled_y = recompute_delta_features(scaled_y, self.Y_data_mean,
                                                self.Y_data_std, hp.windows,
                                                hp.stream_sizes,
                                                hp.has_dynamic_features)
        return scaled_x, scaled_y
Exemple #9
0
def lab2wav(args,
            device,
            label_path,
            binary_dict,
            continuous_dict,
            X_min,
            X_max,
            Y_mean,
            Y_var,
            Y_scale,
            duration_model,
            acoustic_model,
            post_filter=False):
    """Synthesize a waveform from a label file.

    Pipeline: duration prediction -> frame-level linguistic features ->
    silence trimming -> input normalization -> acoustic model ->
    denormalization -> gen_waveform.

    Args:
        args: options object; args.label selects the subphone feature set
            ("state_align" -> "full", otherwise "coarse_coding").
        device: forwarded to gen_duration.
        label_path: path to the HTS-style label file.
        binary_dict, continuous_dict: question dictionaries for fe.
        X_min, X_max: per-type input min/max stats (keyed by "acoustic" etc.).
        Y_mean, Y_var, Y_scale: per-type output stats; Y_var is passed to
            gen_waveform, Y_mean/Y_scale are used for denormalization.
        duration_model, acoustic_model: trained models.
        post_filter: whether gen_waveform applies post-filtering.
    """
    # Predict durations
    duration_modified_hts_labels = gen_duration(device, label_path,
                                                binary_dict, continuous_dict,
                                                X_min, X_max, Y_mean, Y_scale,
                                                duration_model)

    # Linguistic features
    linguistic_features = fe.linguistic_features(
        duration_modified_hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=True,
        subphone_features="full"
        if args.label == 'state_align' else "coarse_coding")

    # Trim silences
    indices = duration_modified_hts_labels.silence_frame_indices()
    linguistic_features = np.delete(linguistic_features, indices, axis=0)

    # Apply normalization
    ty = "acoustic"
    linguistic_features = minmax_scale(linguistic_features,
                                       X_min[ty],
                                       X_max[ty],
                                       feature_range=(0.01, 0.99))

    # Predict acoustic features (single batch; model runs where it lives —
    # note the model is NOT moved to `device` here).
    # acoustic_model = acoustic_model.cpu()
    acoustic_model.eval()
    x = torch.FloatTensor(linguistic_features)
    acoustic_predicted = acoustic_model(x.unsqueeze(0)).data.numpy()
    print("acoustic_predicted shape: {}".format(acoustic_predicted.shape))

    # Apply denormalization
    acoustic_predicted = acoustic_predicted * Y_scale[ty] + Y_mean[ty]

    # squeeze(0) drops the batch dimension before waveform generation.
    return gen_waveform(acoustic_predicted.squeeze(0), Y_var, post_filter)
Exemple #10
0
def tts_from_label(models, label_path, X_min, X_max, Y_mean, Y_std,
                   post_filter=False,
                   apply_duration_model=True, coef=1.4, fs=16000,
                   mge_training=True):
    """Full TTS pipeline: labels -> durations -> acoustic features -> waveform."""
    duration_model = models["duration"]
    acoustic_model = models["acoustic"]

    if use_cuda:
        duration_model = duration_model.cuda()
        acoustic_model = acoustic_model.cuda()

    # Replace label timings with model-predicted durations (optional).
    if apply_duration_model:
        labels = gen_duration(
            label_path, duration_model, X_min, X_max, Y_mean, Y_std)
    else:
        labels = hts.load(label_path)

    # Frame-level linguistic features with silence frames removed.
    feats = fe.linguistic_features(
        labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_acoustic.add_frame_features,
        subphone_features=hp_acoustic.subphone_features)
    feats = np.delete(feats, labels.silence_frame_indices(), axis=0)

    # Normalize inputs to the acoustic model's training range.
    ty = "acoustic"
    feats = P.minmax_scale(
        feats, X_min[ty], X_max[ty], feature_range=(0.01, 0.99))

    # Predict acoustic features for the whole utterance as one batch.
    acoustic_model.eval()
    x = Variable(torch.from_numpy(feats)).float()
    n_frames = len(x)
    x = x.view(1, -1, x.size(-1))
    # NOTE(review): hp_duration (not hp_acoustic) is used here, matching
    # the original code — confirm this is intentional.
    x = _generator_input(hp_duration, x)
    if use_cuda:
        x = x.cuda()
    predicted = acoustic_model(x, [n_frames]).data.cpu().numpy()
    predicted = predicted.reshape(-1, predicted.shape[-1])

    return gen_waveform(predicted, Y_mean, Y_std, post_filter,
                        coef=coef, fs=fs, mge_training=mge_training)
Exemple #11
0
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict state durations for *label_path* and return labels with
    those durations written back in.

    Args:
        label_path: path to an HTS-style label file.
        duration_model: trained duration model (moved to CPU for inference).
        X_min, X_max: input min/max stats keyed by type ("duration").
        Y_mean, Y_std: output mean/std stats keyed by type.

    Returns:
        The loaded labels with predicted durations applied.
    """
    # Linguistic features for duration
    hts_labels = hts.load(label_path)
    duration_linguistic_features = fe.linguistic_features(
        hts_labels,
        binary_dict,
        continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply normalization
    ty = "duration"
    duration_linguistic_features = P.minmax_scale(duration_linguistic_features,
                                                  X_min[ty],
                                                  X_max[ty],
                                                  feature_range=(0.01, 0.99))

    # Run inference on CPU
    duration_model = duration_model.cpu()
    duration_model.eval()

    # Some models take (x,), others (x, lengths): try the plain call first
    # and fall back to the batched/lengths signature.
    # BUGFIX: was a bare ``except:``, which also swallowed KeyboardInterrupt
    # and SystemExit; narrowed to ``except Exception``.
    x = Variable(torch.from_numpy(duration_linguistic_features)).float()
    try:
        duration_predicted = duration_model(x).data.numpy()
    except Exception:
        xl = len(x)
        x = x.view(1, -1, x.size(-1))
        duration_predicted = duration_model(x, [xl]).data.numpy()
        duration_predicted = duration_predicted.reshape(
            -1, duration_predicted.shape[-1])

    # Apply denormalization
    duration_predicted = duration_predicted * Y_std[ty] + Y_mean[ty]
    duration_predicted = np.round(duration_predicted)

    # Set minimum state duration to 1 frame
    duration_predicted[duration_predicted <= 0] = 1
    hts_labels.set_durations(duration_predicted)

    return hts_labels
Exemple #12
0
def gen_duration(label_path, duration_model, X_min, X_max, Y_mean, Y_std):
    """Predict durations for *label_path* and write them into the labels."""
    ty = "duration"
    hts_labels = hts.load(label_path)

    # Phone-level linguistic features for the duration model.
    feats = fe.linguistic_features(
        hts_labels,
        binary_dict, continuous_dict,
        add_frame_features=hp_duration.add_frame_features,
        subphone_features=hp_duration.subphone_features).astype(np.float32)

    # Apply the same min-max normalization used at training time.
    feats = P.minmax_scale(feats, X_min[ty], X_max[ty],
                           feature_range=(0.01, 0.99))

    duration_model.eval()

    # Forward pass as a single batch of one utterance.
    x = Variable(torch.from_numpy(feats)).float()
    n_frames = len(x)
    x = x.view(1, -1, x.size(-1))
    x = _generator_input(hp_duration, x)
    if use_cuda:
        x = x.cuda()
    predicted = duration_model(x, [n_frames]).data.cpu().numpy()
    predicted = predicted.reshape(-1, predicted.shape[-1])

    # Back to the original target scale, rounded to whole frames.
    predicted = P.inv_scale(predicted, Y_mean[ty], Y_std[ty])
    predicted = np.round(predicted)

    # Enforce a minimum duration of one frame per state.
    predicted[predicted <= 0] = 1
    hts_labels.set_durations(predicted)

    return hts_labels
Exemple #13
0
 def __test_raise1(x, X_min, X_max):
     # Calling minmax_scale without either (min, max) or (scale_, min_)
     # must raise ValueError; this helper is expected to be wrapped with
     # a @raises(ValueError)-style decorator by its caller.
     P.minmax_scale(x)
Exemple #14
0
def create_loader(test=False):
    """Build train/test loaders of [linguistic, acoustic, mora_index] triples.

    Args:
        test: when True, additionally return the list of utterance indices
            held out for final testing (neither train nor validation).

    Returns:
        (train_loader, test_loader), or when ``test`` is True
        (train_loader, test_loader, test_not_valid_loader).
    """
    DATA_ROOT = "./data/basic5000"
    X = {"acoustic": {}}
    Y = {"acoustic": {}}
    utt_lengths = {"acoustic": {}}
    for ty in ["acoustic"]:
        for phase in ["train", "test"]:
            train = phase == "train"
            x_dim = (duration_linguistic_dim
                     if ty == "duration" else acoustic_linguisic_dim)
            y_dim = duration_dim if ty == "duration" else acoustic_dim
            X[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "X_{}".format(ty)),
                                 dim=x_dim,
                                 train=train))
            Y[ty][phase] = FileSourceDataset(
                BinaryFileSource(join(DATA_ROOT, "Y_{}".format(ty)),
                                 dim=y_dim,
                                 train=train))
            # BUGFIX: ``np.int`` was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin ``int`` is the documented replacement.
            utt_lengths[ty][phase] = np.array([len(x) for x in X[ty][phase]],
                                              dtype=int)

    # Input min/max and target mean/scale, computed on the training split.
    X_min = {}
    X_max = {}
    Y_mean = {}
    Y_var = {}
    Y_scale = {}
    for typ in ["acoustic"]:
        X_min[typ], X_max[typ] = minmax(X[typ]["train"],
                                        utt_lengths[typ]["train"])
        Y_mean[typ], Y_var[typ] = meanvar(Y[typ]["train"],
                                          utt_lengths[typ]["train"])
        Y_scale[typ] = np.sqrt(Y_var[typ])

    mora_index_lists = sorted(
        glob(join("data/basic5000/mora_index", "squeezed_*.csv")))
    mora_index_lists_for_model = [
        np.loadtxt(path).reshape(-1) for path in mora_index_lists
    ]

    # Split: every 20th utterance is validation, the one after it is the
    # held-out test set, the rest is training.
    train_mora_index_lists = []
    test_mora_index_lists = []
    # NOTE(review): this collects raw utterance *indices*, not loader
    # entries — confirm downstream expectations.
    test_not_valid_loader = []
    for i, mora_i in enumerate(mora_index_lists_for_model):
        if (i - 1) % 20 == 0:  # held-out test
            if test:
                test_not_valid_loader.append(i)
        elif i % 20 == 0:  # valid
            test_mora_index_lists.append(mora_i)
        else:
            train_mora_index_lists.append(mora_i)

    def _scaled_inputs(phase):
        # Normalize each utterance's linguistic features to (0.01, 0.99).
        return [
            minmax_scale(
                X["acoustic"][phase][i],
                X_min["acoustic"],
                X_max["acoustic"],
                feature_range=(0.01, 0.99),
            ) for i in range(len(X["acoustic"][phase]))
        ]

    X_acoustic_train = _scaled_inputs("train")
    Y_acoustic_train = [y for y in Y["acoustic"]["train"]]
    X_acoustic_test = _scaled_inputs("test")
    Y_acoustic_test = [y for y in Y["acoustic"]["test"]]

    train_loader = [[
        X_acoustic_train[i], Y_acoustic_train[i], train_mora_index_lists[i]
    ] for i in range(len(train_mora_index_lists))]
    test_loader = [[
        X_acoustic_test[i], Y_acoustic_test[i], test_mora_index_lists[i]
    ] for i in range(len(test_mora_index_lists))]

    # BUGFIX: the original referenced an undefined ``test_not_valid_loader``
    # here (NameError when test=True); the collected index list above is now
    # bound to that name.
    if test:
        return train_loader, test_loader, test_not_valid_loader
    else:
        return train_loader, test_loader