Example #1
def get_tts_data_loaders(X, Y, X_data_min, X_data_max, Y_data_mean,
                         Y_data_std):
    X_train, X_test = X["train"], X["test"]
    Y_train, Y_test = Y["train"], Y["test"]

    # Sequence-wise train loader
    X_train_cache_dataset = MemoryCacheDataset(X_train,
                                               cache_size=hp.cache_size)
    Y_train_cache_dataset = MemoryCacheDataset(Y_train,
                                               cache_size=hp.cache_size)
    train_dataset = TTSDataset(X_train_cache_dataset, Y_train_cache_dataset,
                               X_data_min, X_data_max, Y_data_mean, Y_data_std)
    train_loader = data_utils.DataLoader(train_dataset,
                                         batch_size=hp.batch_size,
                                         num_workers=hp.num_workers,
                                         pin_memory=hp.pin_memory,
                                         shuffle=True,
                                         collate_fn=collate_fn)

    # Sequence-wise test loader
    X_test_cache_dataset = MemoryCacheDataset(X_test, cache_size=hp.cache_size)
    Y_test_cache_dataset = MemoryCacheDataset(Y_test, cache_size=hp.cache_size)
    test_dataset = TTSDataset(X_test_cache_dataset, Y_test_cache_dataset,
                              X_data_min, X_data_max, Y_data_mean, Y_data_std)
    test_loader = data_utils.DataLoader(test_dataset,
                                        batch_size=hp.batch_size,
                                        num_workers=hp.num_workers,
                                        pin_memory=hp.pin_memory,
                                        shuffle=False,
                                        collate_fn=collate_fn)

    dataset_loaders = {"train": train_loader, "test": test_loader}
    return dataset_loaders
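The collate_fn passed to both loaders is not shown in this example. Below is a minimal sketch of a padding collate function, assuming the dataset yields (x, y) pairs of variable-length 2-D NumPy arrays; the names and shapes are assumptions, not part of the original code.

import numpy as np
import torch


def collate_fn(batch):
    """Zero-pad a batch of variable-length (x, y) pairs and return the true lengths."""
    lengths = [len(x) for x, _ in batch]
    max_len = max(lengths)

    def pad(seq):
        # Pad along the time axis up to the longest sequence in the batch.
        return np.pad(seq, [(0, max_len - len(seq)), (0, 0)], mode="constant")

    x_batch = torch.from_numpy(np.stack([pad(x) for x, _ in batch])).float()
    y_batch = torch.from_numpy(np.stack([pad(y) for _, y in batch])).float()
    return x_batch, y_batch, torch.tensor(lengths, dtype=torch.long)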
Example #2
def get_data_loaders(config):
    data_loaders = {}
    for phase in ["train_no_dev", "dev"]:
        in_dir = to_absolute_path(config.data[phase].in_dir)
        out_dir = to_absolute_path(config.data[phase].out_dir)
        train = phase.startswith("train")
        in_feats = FileSourceDataset(NpyFileSource(in_dir))
        out_feats = FileSourceDataset(NpyFileSource(out_dir))

        in_feats = MemoryCacheDataset(in_feats, cache_size=10000)
        out_feats = MemoryCacheDataset(out_feats, cache_size=10000)

        dataset = Dataset(in_feats, out_feats)
        data_loaders[phase] = data_utils.DataLoader(
            dataset,
            batch_size=config.data.batch_size,
            collate_fn=collate_fn,
            pin_memory=config.data.pin_memory,
            num_workers=config.data.num_workers,
            shuffle=train)

        for x, y, l in data_loaders[phase]:
            logger.info(f"{x.shape}, {y.shape}, {l.shape}")

    return data_loaders
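NpyFileSource and Dataset are project-specific classes that are not shown above. Below is a minimal sketch of NpyFileSource only, as an nnmnkwii FileDataSource that loads per-utterance .npy files; it omits the logging and filtering options used by the variant in Example #7.

from glob import glob
from os.path import join

import numpy as np
from nnmnkwii.datasets import FileDataSource


class NpyFileSource(FileDataSource):
    """Load per-utterance features stored as .npy files in a directory."""

    def __init__(self, data_root):
        self.data_root = data_root

    def collect_files(self):
        # One .npy file per utterance, sorted for a stable ordering.
        return sorted(glob(join(self.data_root, "*.npy")))

    def collect_features(self, path):
        return np.load(path).astype(np.float32)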
Example #3
def test_pack_sequence():
    """Test minibatch RNN training using pack_padded_sequence."""

    X, Y = _get_small_datasets(padded=False)
    lengths = np.array([len(x) for x in X], dtype=int)[:, None]

    # We need the zero-padded version of the dataset for minibatch training
    X, Y = _get_small_datasets(padded=True)

    # Since the sequences are padded, the true frame lengths must be given explicitly.
    X = MemoryCacheDataset(X, cache_size=len(X))
    Y = MemoryCacheDataset(Y, cache_size=len(Y))

    in_dim = X[0].shape[-1]
    out_dim = Y[0].shape[-1]
    hidden_dim = 5
    model = MyRNN(in_dim, hidden_dim, out_dim, num_layers=2,
                  bidirectional=True)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    model.train()
    batch_size = 2

    dataset = PyTorchDataset(X, Y, lengths)
    loader = data_utils.DataLoader(
        dataset, batch_size=batch_size, num_workers=1, shuffle=True)

    # Test that the training loop runs without errors. The following code was
    # adapted from a practical RNN training demo.
    for idx, (x, y, lengths) in enumerate(loader):
        # Sort by lengths indices
        sorted_lengths, indices = torch.sort(lengths.view(-1), dim=0,
                                             descending=True)
        sorted_lengths = sorted_lengths.long().numpy()
        # Get sorted batch
        x, y = x[indices], y[indices]
        # Trim outputs with max length
        y = y[:, :sorted_lengths[0]]

        x = Variable(x)
        y = Variable(y)
        h, c = model.init_hidden(len(sorted_lengths))
        optimizer.zero_grad()

        y_hat = model(x, sorted_lengths, h, c)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
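MyRNN is not defined in this snippet. One possible definition that is consistent with how it is called above (init_hidden returning the initial states, forward taking the sorted lengths and those states) is sketched below; the original implementation may differ in detail.

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class MyRNN(nn.Module):
    """Bidirectional LSTM that consumes padded batches via pack_padded_sequence."""

    def __init__(self, in_dim, hidden_dim, out_dim, num_layers=1, bidirectional=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(in_dim, hidden_dim, num_layers,
                            bidirectional=bidirectional, batch_first=True)
        self.hidden2out = nn.Linear(self.num_directions * hidden_dim, out_dim)

    def init_hidden(self, batch_size):
        shape = (self.num_layers * self.num_directions, batch_size, self.hidden_dim)
        return torch.zeros(shape), torch.zeros(shape)

    def forward(self, x, lengths, h, c):
        # Pack so the LSTM skips the zero-padded frames.
        packed = pack_padded_sequence(x, lengths, batch_first=True)
        output, _ = self.lstm(packed, (h, c))
        # Unpack back to a padded (batch, max_len, features) tensor.
        output, _ = pad_packed_sequence(output, batch_first=True)
        return self.hidden2out(output)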
Example #4
    def __init__(self,
                 data_root_dir=None,
                 train_mode=False,
                 output_mode='melspec',
                 transform=None,
                 data_sel=None):

        self.wav_root_dir = data_root_dir + '/wavs/'
        self.train_mode = train_mode
        self.output_mode = output_mode
        self.transform = transform
        self.data_sel = data_sel

        self.max_len_text = MAX_LEN_TEXT
        self.max_len_melspec = MAX_LEN_MELSPEC
        self.max_len_spec = MAX_LEN_SPEC

        #        self.max_len_paired_text = MAX_LEN_PAIRED_TEXT # Max-lengths are required for z-padding
        #        self.max_len_paired_spec = MAX_LEN_PAIRED_SPEC
        #        self.max_len_paired_melspec = MAX_LEN_PAIRED_MELSPEC

        # Preparing Text:
        self.text_csv_path = data_root_dir + '/metadata.csv'
        self.reduce_punc_table = str.maketrans(
            string.ascii_uppercase, string.ascii_lowercase,
            '0123456789!#"$%&\\()*+/:;<=>?@[\\]^_`{|}~')
        self.chr2int_table = dict(
            zip(" ',-." + string.ascii_lowercase, np.arange(0, 31)))

        df = pd.read_csv(self.text_csv_path,
                         index_col=False,
                         sep='|',
                         header=None,
                         memory_map=True)  # memory_map: speed-up reading.
        nan_rows = df[df[2].isnull()].index.values
        df.iloc[nan_rows, 2] = df.iloc[nan_rows, 1]  # Fill missing normalized text with the raw text column
        df = df.drop(1, axis=1)
        df.columns = ['file_id', 'text']
        df = df.drop(OMIT_DATA_ROWS, axis=0).reset_index(drop=True)  # Omit rows with foreign-language text

        if self.train_mode is True:
            self.file_ids = df.iloc[0:N_TRAIN, 0]  # file_ids: LJ**-**** (13,000 training items)
            self.texts = df.iloc[0:N_TRAIN, 1]
        else:
            self.file_ids = df.iloc[N_TRAIN:, 0].reset_index(drop=True)  # remaining 100 test items
            self.texts = df.iloc[N_TRAIN:, 1].reset_index(drop=True)
        if self.data_sel is not None:
            self.file_ids = self.file_ids[self.data_sel].reset_index(drop=True)
            self.texts = self.texts[self.data_sel].reset_index(drop=True)

        # Preparing Audio:
        if self.train_mode is True:
            self.spec_features = MemoryCacheDataset(
                FileSourceDataset(
                    SpecSource(wav_data_root=self.wav_root_dir,
                               file_sel_range=[0, N_TRAIN],
                               output_mode=self.output_mode)),
                cache_size=len(self.file_ids))
        else:
            self.spec_features = MemoryCacheDataset(
                FileSourceDataset(
                    SpecSource(wav_data_root=self.wav_root_dir,
                               file_sel_range=[N_TRAIN, None],
                               output_mode=self.output_mode)),
                cache_size=len(self.file_ids))
        assert len(self.file_ids) == len(self.spec_features)

        #        # Pairing: Sort and divide by feature lengths, then concat small + large
        #        lengths       = np.load('mspec_length_train_13000.npy')
        #        sorted_by_len = np.argsort(lengths)
        #        n_org       = len(sorted_by_len)
        #        n_pairs     = int(n_org / 2)
        #        self.paired_items = list()
        #        for i in range(n_pairs):
        #            self.paired_items.append([sorted_by_len[i], sorted_by_len[n_org - 1 - i]])

        return None
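SpecSource, the length constants (N_TRAIN, MAX_LEN_*, OMIT_DATA_ROWS) and the enclosing dataset class are not shown in this example. As a rough, hypothetical sketch, SpecSource could be a FileDataSource that computes (mel-)spectrograms from the LJSpeech wav files; the STFT/mel parameters below are placeholders, not values from the original project.

from glob import glob
from os.path import join

import librosa
import numpy as np
from nnmnkwii.datasets import FileDataSource


class SpecSource(FileDataSource):
    """Hypothetical sketch: (mel-)spectrogram features computed from wav files."""

    def __init__(self, wav_data_root, file_sel_range=None, output_mode='melspec', sr=22050):
        self.wav_data_root = wav_data_root
        self.file_sel_range = file_sel_range
        self.output_mode = output_mode
        self.sr = sr

    def collect_files(self):
        files = sorted(glob(join(self.wav_data_root, '*.wav')))
        if self.file_sel_range is not None:
            start, stop = self.file_sel_range
            files = files[start:stop]
        return files

    def collect_features(self, path):
        wav, _ = librosa.load(path, sr=self.sr, mono=True)
        spec = np.abs(librosa.stft(wav, n_fft=1024, hop_length=256))
        if self.output_mode == 'melspec':
            # Frames as rows: (num_frames, n_mels)
            return librosa.feature.melspectrogram(S=spec ** 2, sr=self.sr, n_mels=80).T
        return spec.T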
Example #5
lambda_cycle = 10
lambda_identity = 5
start_decay = 200000
adam_betas = (0.5, 0.999)
writer = SummaryWriter()
############## HYPERPARAMETER PART #######################################

# Create output directories if they do not already exist

if not os.path.exists(validation_A_dir):
    os.mkdir(validation_A_dir)
if not os.path.exists(validation_B_dir):
    os.mkdir(validation_B_dir)
if not os.path.exists("figures"):
    os.mkdir("figures")
SF1_train_data_source = MemoryCacheDataset(
    FileSourceDataset((VCC2016DataSource(data_root, ["SF1"], training=True))))
TF2_train_data_source = MemoryCacheDataset(
    FileSourceDataset((VCC2016DataSource(data_root, ["TF2"], training=True))))
SF1_test_data_source = MemoryCacheDataset(
    FileSourceDataset((VCC2016DataSource(data_root, ["SF1"], training=False))))
TF2_test_data_source = MemoryCacheDataset(
    FileSourceDataset((VCC2016DataSource(data_root, ["TF2"], training=False))))

train_dataset = MCEPWrapper(SF1_train_data_source,
                            TF2_train_data_source,
                            mfcc_only=True)
test_dataset = MCEPWrapper(SF1_test_data_source,
                           TF2_test_data_source,
                           mfcc_only=False,
                           norm_calc=False)
test_dataset.input_meanstd = train_dataset.input_meanstd
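The last line reuses the training-set normalization statistics for the test set (norm_calc=False disables recomputing them). The snippet stops before any DataLoader is built; a possible continuation, assuming MCEPWrapper items can be batched by the default collate, might be:

from torch.utils import data as data_utils

# batch_size is a placeholder; the original training script defines its own value.
train_loader = data_utils.DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = data_utils.DataLoader(test_dataset, batch_size=1, shuffle=False)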
Example #6
def synth(file_path, domain_A, data_root, output_dir):
    sr = 16000

    model = Generator(24)

    if domain_A:
        generator_A2B = torch.load("checkpoint/generator_A2B.pt")
        model.load_state_dict(generator_A2B)
    else:
        generator_B2A = torch.load("checkpoint/generator_B2A.pt")
        model.load_state_dict(generator_B2A)

    filename_B = os.path.basename(file_path)
    SF1_train_data_source = MemoryCacheDataset(
        FileSourceDataset((VCC2016DataSource(data_root, ["SF1"],
                                             training=True))))
    TF2_train_data_source = MemoryCacheDataset(
        FileSourceDataset((VCC2016DataSource(data_root, ["TF2"],
                                             training=True))))
    SF1_test_data_source = MemoryCacheDataset(
        FileSourceDataset((VCC2016DataSource(data_root, ["SF1"],
                                             training=False))))
    TF2_test_data_source = MemoryCacheDataset(
        FileSourceDataset((VCC2016DataSource(data_root, ["TF2"],
                                             training=False))))

    train_dataset = MCEPWrapper(SF1_train_data_source,
                                TF2_train_data_source,
                                mfcc_only=True)
    test_dataset = MCEPWrapper(SF1_test_data_source,
                               TF2_test_data_source,
                               mfcc_only=False,
                               norm_calc=False)
    test_dataset.input_meanstd = train_dataset.input_meanstd
    test_dataset.output_meanstd = train_dataset.output_meanstd

    wav, _ = librosa.load(file_path, sr=sr, mono=True)
    wav_padded = wav_padding(wav, sr=sr, frame_period=5, multiple=4)
    f0, _, sp, ap = world_decompose(wav_padded, sr)

    mcep = world_encode_spectral_envelop(sp, sr)

    # Normalising MCEPs
    mean_A, std_A = train_dataset.input_meanstd
    mean_B, std_B = train_dataset.output_meanstd

    mean_f0_A = mean_A[0]
    mean_f0_B = mean_B[0]
    std_f0_A = std_A[0]
    std_f0_B = std_B[0]
    mean_mcep_A = mean_A[1:25]
    mean_mcep_B = mean_B[1:25]
    std_mcep_A = std_A[1:25]
    std_mcep_B = std_B[1:25]

    if domain_A:
        normalised_mcep_source = torch.Tensor(
            (mcep - mean_mcep_A) / std_mcep_A)
    else:
        normalised_mcep_source = torch.Tensor(
            (mcep - mean_mcep_B) / std_mcep_B)

    normalised_mcep_source = normalised_mcep_source[None, :, :]
    normalised_mcep_source = normalised_mcep_source.permute(0, 2, 1)

    normalised_mcep_target = model(normalised_mcep_source)
    normalised_mcep_target = normalised_mcep_target.permute(
        0, 2, 1).cpu().detach().numpy()[0, :, :]

    if domain_A:
        # A -> B conversion: denormalise with the target-domain (B) statistics
        mcep_target = normalised_mcep_target * std_mcep_B + mean_mcep_B
    else:
        # B -> A conversion: denormalise with the target-domain (A) statistics
        mcep_target = normalised_mcep_target * std_mcep_A + mean_mcep_A

    mcep_target = np.ascontiguousarray(mcep_target)
    # Since we decompose directly from the signal here, the non-masked array implementation is used
    f0_target = pitch_conversion(f0, mean_f0_A, std_f0_A, mean_f0_B, std_f0_B)

    sp_target = world_decode_spectral_envelop(mcep_target, sr)
    ap_target = np.ascontiguousarray(ap)

    speech_fake_A = world_speech_synthesis(f0_target,
                                           sp_target,
                                           ap_target,
                                           sr,
                                           frame_period=5)

    # Note: librosa.output was removed in librosa 0.8; with newer versions, write with soundfile.write instead
    librosa.output.write_wav(os.path.join(output_dir, filename_B),
                             speech_fake_A, sr)
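A hypothetical call to synth, converting an SF1 (domain A) utterance into the TF2 voice; the paths below are placeholders, not files from the original project.

synth("data/vcc2016_training/SF1/100001.wav",
      domain_A=True,
      data_root="data",
      output_dir="converted")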
Example #7
def get_data_loaders(data_config, collate_fn, logger):
    """Get data loaders for training and validation.

    Args:
        data_config (dict): Data configuration.
        collate_fn (callable): Collate function.
        logger (logging.Logger): Logger.

    Returns:
        dict: Data loaders.
    """
    if "filter_long_segments" not in data_config:
        logger.warning(
            "filter_long_segments is not found in the data config. Consider set it explicitly."
        )
        logger.info("Disable filtering for long segments.")
        filter_long_segments = False
    else:
        filter_long_segments = data_config.filter_long_segments

    if "filter_num_frames" not in data_config:
        logger.warning(
            "filter_num_frames is not found in the data config. Consider set it explicitly."
        )
        filter_num_frames = 6000
    else:
        filter_num_frames = data_config.filter_num_frames

    data_loaders = {}
    for phase in ["train_no_dev", "dev"]:
        in_dir = to_absolute_path(data_config[phase].in_dir)
        out_dir = to_absolute_path(data_config[phase].out_dir)
        train = phase.startswith("train")
        in_feats = FileSourceDataset(
            NpyFileSource(
                in_dir,
                logger,
                filter_long_segments=filter_long_segments,
                filter_num_frames=filter_num_frames,
            ))
        out_feats = FileSourceDataset(
            NpyFileSource(
                out_dir,
                logger,
                filter_long_segments=filter_long_segments,
                filter_num_frames=filter_num_frames,
            ))

        in_feats = MemoryCacheDataset(in_feats, cache_size=10000)
        out_feats = MemoryCacheDataset(out_feats, cache_size=10000)

        dataset = Dataset(in_feats, out_feats)
        data_loaders[phase] = data_utils.DataLoader(
            dataset,
            batch_size=data_config.batch_size,
            collate_fn=collate_fn,
            pin_memory=data_config.pin_memory,
            num_workers=data_config.num_workers,
            shuffle=train,
        )

    return data_loaders
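The attribute-style access (data_config.batch_size) together with the "in" checks and to_absolute_path suggests an OmegaConf/Hydra config. A hypothetical invocation, with placeholder paths and values and a padding collate_fn like the sketch under Example #1:

import logging

from omegaconf import OmegaConf

# Hypothetical config; the real project defines this in its Hydra YAML files.
data_config = OmegaConf.create({
    "filter_long_segments": False,
    "filter_num_frames": 6000,
    "batch_size": 8,
    "pin_memory": True,
    "num_workers": 2,
    "train_no_dev": {"in_dir": "dump/train_no_dev/in", "out_dir": "dump/train_no_dev/out"},
    "dev": {"in_dir": "dump/dev/in", "out_dir": "dump/dev/out"},
})

logger = logging.getLogger(__name__)
data_loaders = get_data_loaders(data_config, collate_fn, logger)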