def _convert_in_segments(model, coded_sp_norm, frame_size, direction):
    """Run ``model.test`` over consecutive fixed-width windows and stitch the output.

    The input is zero-padded on the right up to a multiple of ``frame_size``
    and the same number of trailing columns is removed from the result.

    Args:
        model: Object exposing ``test(inputs=..., direction=...)``.
        coded_sp_norm: 2-D array; axis 1 is the axis that gets windowed.
        frame_size: Number of columns handed to the model per call.
        direction: Passed through unchanged to ``model.test``.

    Returns:
        The per-window model outputs concatenated along axis 1.
    """
    pad = -coded_sp_norm.shape[1] % frame_size
    if pad:
        # Use the actual row count instead of a hard-coded feature dimension.
        filler = np.zeros((coded_sp_norm.shape[0], pad))
        coded_sp_norm = np.concatenate((coded_sp_norm, filler), axis=1)

    # Always evaluate at least one window, even for a degenerate empty input.
    n_segments = max(1, coded_sp_norm.shape[1] // frame_size)
    segments = [
        model.test(inputs=np.array([coded_sp_norm[:, start:start + frame_size]]),
                   direction=direction)[0]
        for start in range(0, n_segments * frame_size, frame_size)
    ]
    converted = np.concatenate(segments, axis=1)

    if pad:
        converted = converted[:, :-pad]
    return converted


def conversion(training_data_dir, model_dir, model_name, data_dir, conversion_direction, output_dir, pc):
    """Convert every audio file in ``data_dir`` and write the results to ``output_dir``.

    Args:
        training_data_dir: Directory holding ``mcep_normalization.npz`` and
            ``logf0s_normalization.npz``.
        model_dir: Directory holding the ``checkpoint`` index file and weights.
        model_name: Unused here; kept for interface compatibility.
        data_dir: Directory whose entries are all loaded as audio files.
        conversion_direction: ``'A2B'`` converts A to B; any other value is
            treated as B to A. The value is passed unchanged to ``model.test``.
        output_dir: Destination directory (created when missing).
        pc: Pitch (f0) is converted only when this compares equal to ``True``;
            otherwise the source f0 is reused.

    Raises:
        AssertionError: If the checkpoint named in ``model_dir/checkpoint``
            has no matching ``.index`` file.
    """
    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0
    frame_size = 128

    model = CycleGAN2(num_features=num_features, mode='test')

    # NOTE: when no "checkpoint" file exists the model is used as constructed.
    checkpoint_index = os.path.join(model_dir, "checkpoint")
    if os.path.exists(checkpoint_index):
        with open(checkpoint_index, "r") as f:
            all_ckpt = f.readlines()
        # The last line names the most recent checkpoint between double quotes.
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        if not os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            # Raised explicitly so the check is not stripped under ``python -O``.
            raise AssertionError("The checkpoint is not exist.")
        model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
        print("Loading pretrained model {}".format(pretrain_ckpt))

    mcep_normalization_params = np.load(os.path.join(training_data_dir, 'mcep_normalization.npz'))
    logf0s_normalization_params = np.load(os.path.join(training_data_dir, 'logf0s_normalization.npz'))

    # Pick source/target statistics once; both directions share one code path.
    a2b = conversion_direction == 'A2B'
    src, tgt = ('A', 'B') if a2b else ('B', 'A')
    mcep_mean_src = mcep_normalization_params['mean_' + src]
    mcep_std_src = mcep_normalization_params['std_' + src]
    mcep_mean_tgt = mcep_normalization_params['mean_' + tgt]
    mcep_std_tgt = mcep_normalization_params['std_' + tgt]
    # The B-to-A direction previously reused the A-to-B pitch statistics.
    logf0s_mean_src = logf0s_normalization_params['mean_' + src]
    logf0s_std_src = logf0s_normalization_params['std_' + src]
    logf0s_mean_tgt = logf0s_normalization_params['mean_' + tgt]
    logf0s_std_tgt = logf0s_normalization_params['std_' + tgt]

    os.makedirs(output_dir, exist_ok=True)

    # Snapshot the listing once so the loop is stable and not quadratic.
    file_names = os.listdir(data_dir)
    for file_idx in trange(len(file_names)):
        file = file_names[file_idx]
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)

        print("AtoB" if a2b else "BtoA")

        # Explicit comparison kept on purpose: truthy non-bool values such as
        # the string "False" must not switch pitch conversion on.
        if pc == True:  # noqa: E712
            print("pitch convert")
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_src, std_log_src=logf0s_std_src,
                                            mean_log_target=logf0s_mean_tgt, std_log_target=logf0s_std_tgt)
        else:
            print("pitch same")
            f0_converted = f0

        # Normalize with source statistics, convert, de-normalize with target statistics.
        coded_sp_norm = (coded_sp.T - mcep_mean_src) / mcep_std_src
        coded_sp_converted_norm = _convert_in_segments(
            model, coded_sp_norm, frame_size, conversion_direction)
        coded_sp_converted = coded_sp_converted_norm * mcep_std_tgt + mcep_mean_tgt

        # The transposed result is made C-contiguous before decoding.
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted, fs=sampling_rate)

        # World vocoder synthesis
        wav_transformed = world_speech_synthesis(f0=f0_converted, decoded_sp=decoded_sp_converted, ap=ap,
                                                 fs=sampling_rate, frame_period=frame_period)
        # NOTE(review): librosa.output was removed in librosa 0.8; this call
        # needs an older librosa — confirm the pinned version.
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)), wav_transformed, sampling_rate)
# Example #2
# 0
def _convert_validation_dir(model, input_dir, converted_dir, direction,
                            mcep_src, mcep_tgt, logf0_src, logf0_tgt,
                            sampling_rate, frame_period, num_mcep):
    """Convert every file in ``input_dir`` with ``model`` and write it to ``converted_dir``.

    Args:
        model: Object exposing ``test(inputs=..., direction=...)``.
        input_dir: Directory whose entries are all loaded as audio files.
        converted_dir: Existing directory that receives the converted files.
        direction: Passed unchanged to ``model.test`` (``'A2B'`` or ``'B2A'``).
        mcep_src: ``(mean, std)`` used to normalize the source features.
        mcep_tgt: ``(mean, std)`` used to de-normalize the converted features.
        logf0_src: ``(mean, std)`` of the source log-f0.
        logf0_tgt: ``(mean, std)`` of the target log-f0.
        sampling_rate: Sampling rate used for loading, analysis and writing.
        frame_period: Frame period handed to the vocoder helpers.
        num_mcep: Dimension requested from ``world_encode_spectral_envelop``.
    """
    mcep_mean_src, mcep_std_src = mcep_src
    mcep_mean_tgt, mcep_std_tgt = mcep_tgt
    logf0_mean_src, logf0_std_src = logf0_src
    logf0_mean_tgt, logf0_std_tgt = logf0_tgt

    # Snapshot the listing once so the loop is stable and not quadratic.
    file_names = os.listdir(input_dir)
    for file_idx in trange(len(file_names)):
        file_name = file_names[file_idx]
        wav, _ = librosa.load(os.path.join(input_dir, file_name),
                              sr=sampling_rate,
                              mono=True)
        wav = wav_padding(wav=wav,
                          sr=sampling_rate,
                          frame_period=frame_period,
                          multiple=4)
        f0, timeaxis, sp, ap = world_decompose(
            wav=wav, fs=sampling_rate, frame_period=frame_period)
        f0_converted = pitch_conversion(f0=f0,
                                        mean_log_src=logf0_mean_src,
                                        std_log_src=logf0_std_src,
                                        mean_log_target=logf0_mean_tgt,
                                        std_log_target=logf0_std_tgt)
        coded_sp = world_encode_spectral_envelop(sp=sp,
                                                 fs=sampling_rate,
                                                 dim=num_mcep)
        coded_sp_norm = (coded_sp.T - mcep_mean_src) / mcep_std_src
        coded_sp_converted_norm = model.test(
            inputs=np.array([coded_sp_norm]), direction=direction)[0]
        coded_sp_converted = coded_sp_converted_norm * mcep_std_tgt + mcep_mean_tgt
        # The transposed result is made C-contiguous before decoding.
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
        decoded_sp_converted = world_decode_spectral_envelop(
            coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap,
                                                 fs=sampling_rate,
                                                 frame_period=frame_period)
        # NOTE(review): librosa.output was removed in librosa 0.8; this call
        # needs an older librosa — confirm the pinned version.
        librosa.output.write_wav(
            os.path.join(converted_dir, os.path.basename(file_name)),
            wav_transformed, sampling_rate)


def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name,
          random_seed, validation_A_dir, validation_B_dir, output_dir,
          tensorboard_log_dir, MCEPs_dim, lambda_list):
    """Train a CycleGAN2 model on cached features, resuming from ``model_dir`` if possible.

    Args:
        train_A_dir: Unused here; kept for interface compatibility.
        train_B_dir: Unused here; kept for interface compatibility.
        training_data_dir: Directory holding ``A_coded_norm.pk``,
            ``B_coded_norm.pk``, ``mcep_normalization.npz`` and
            ``logf0s_normalization.npz``.
        model_dir: Directory for checkpoints (created when missing).
        model_name: Prefix of the saved checkpoint names.
        random_seed: Seed passed to ``np.random.seed``.
        validation_A_dir: Optional directory converted A to B every 500 epochs.
        validation_B_dir: Optional directory converted B to A every 500 epochs.
        output_dir: Parent of the ``converted_A`` / ``converted_B`` folders.
        tensorboard_log_dir: Passed to the model as ``log_dir``.
        MCEPs_dim: Feature dimension passed to the model and the encoder.
        lambda_list: ``[lambda_cycle, lambda_identity]``.
    """
    np.random.seed(random_seed)
    num_epochs = 2000
    mini_batch_size = 1
    base_generator_learning_rate = 0.0002
    generator_learning_rate_decay = base_generator_learning_rate / 200000
    base_discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = base_discriminator_learning_rate / 200000
    identity_loss_iterations = 10000
    decay_start_iterations = 200000
    sampling_rate = 16000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]
    lambda_identity = lambda_list[1]

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    # SECURITY: pickle.load can execute arbitrary code; only load trusted caches.
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)

    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    mcep_A = (mcep_normalization_params['mean_A'],
              mcep_normalization_params['std_A'])
    mcep_B = (mcep_normalization_params['mean_B'],
              mcep_normalization_params['std_B'])

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    logf0_A = (logf0s_normalization_params['mean_A'],
               logf0s_normalization_params['std_A'])
    logf0_B = (logf0s_normalization_params['mean_B'],
               logf0s_normalization_params['std_B'])

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        os.makedirs(validation_A_output_dir, exist_ok=True)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        os.makedirs(validation_B_output_dir, exist_ok=True)

    os.makedirs(model_dir, exist_ok=True)

    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")

    # Model define
    model = CycleGAN2(num_features=num_mcep,
                      log_dir=tensorboard_log_dir,
                      model_name=model_name)

    # Resume from the newest checkpoint listed in the "checkpoint" index file.
    epoch = 0
    checkpoint_index = os.path.join(model_dir, "checkpoint")
    if os.path.exists(checkpoint_index):
        with open(checkpoint_index, "r") as f:
            all_ckpt = f.readlines()
        # The last line names the most recent checkpoint between double quotes.
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        if os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
            # Adopt the epoch only once the weights really were restored, so
            # a missing weights file does not skip epochs with a fresh model.
            epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        else:
            print("Checkpoint {} not found, training model from 1 epoch".format(
                pretrain_ckpt))
    else:
        print("Training model from 1 epoch")

    # Epochs are 1-based; include ``num_epochs`` itself so the final epoch
    # runs and its checkpoint/validation are produced.
    for k in range(epoch + 1, num_epochs + 1):
        print('Epoch: %d' % k)

        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)

        n_samples = dataset_A.shape[0]
        n_batches = n_samples // mini_batch_size
        # Guard against a zero modulus when fewer than two samples exist.
        log_interval = max(1, n_samples // 2)
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        for i in trange(n_batches):

            # Global step counted from the current epoch ``k`` (the resume
            # epoch is constant and would freeze the schedules below).
            num_iterations = n_batches * (k - 1) + i

            if num_iterations > identity_loss_iterations:
                lambda_identity = 0

            # Linear decay written in closed form: same values as subtracting
            # one decay step per iteration, but also correct after a resume.
            decay_steps = max(0, num_iterations - decay_start_iterations)
            generator_learning_rate = max(
                0.0, base_generator_learning_rate -
                generator_learning_rate_decay * decay_steps)
            discriminator_learning_rate = max(
                0.0, base_discriminator_learning_rate -
                discriminator_learning_rate_decay * decay_steps)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, generator_loss_A2B = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            if i % log_interval == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Last model save
        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        # ------------------------------------------- validation inference ------------------------------------------- #
        if k % 500 != 0:
            continue

        if validation_A_dir is not None:
            print('Generating Validation Data B from A...')
            _convert_validation_dir(model, validation_A_dir,
                                    validation_A_output_dir, 'A2B',
                                    mcep_A, mcep_B, logf0_A, logf0_B,
                                    sampling_rate, frame_period, num_mcep)

        if validation_B_dir is not None:
            print('Generating Validation Data A from B...')
            _convert_validation_dir(model, validation_B_dir,
                                    validation_B_output_dir, 'B2A',
                                    mcep_B, mcep_A, logf0_B, logf0_A,
                                    sampling_rate, frame_period, num_mcep)

def seg_and_pad(src, n_frames):
    """Zero-pad ``src`` along axis 1 and reshape it into ``n_frames``-wide chunks.

    The padded width is the next multiple of ``n_frames`` strictly greater
    than the current width, so an input that is already an exact multiple
    still gains one full ``n_frames`` of padding. Padding is split between
    both ends, with the extra column (if any) going to the right.

    Args:
        src: 2-D array; axis 1 is padded.
        n_frames: Width of each output chunk.

    Returns:
        Array reshaped to ``(-1, hp.num_mceps, n_frames)``.
    """
    width = src.shape[1]
    # Always in 1..n_frames: a full extra chunk when width divides evenly.
    missing = n_frames - width % n_frames
    pad_before = missing // 2
    pad_after = missing - pad_before

    padded = np.pad(src,
                    ((0, 0), (pad_before, pad_after)),
                    mode='constant',
                    constant_values=0)

    # NOTE(review): a plain row-major reshape does not keep each feature row
    # aligned with its own time window when more than one chunk results —
    # confirm the caller inverts it with the matching reshape.
    return padded.reshape((-1, hp.num_mceps, n_frames))


# Build the model and restore the most recent checkpoint found in hp.weights_dir.
# NOTE(review): tf.train.latest_checkpoint returns None when no checkpoint is
# found, in which case load_weights receives None — confirm one always exists.
model = CycleGAN2()
latest = tf.train.latest_checkpoint(hp.weights_dir)
model.load_weights(latest)

print('Loading cached data...')
# Each cache unpacks to five values; going by the names these are the
# normalized coded spectra, their mean/std, and the log-f0 mean/std of one
# speaker (A = JSUT, B = target voice) — TODO confirm against the cache writer.
# SECURITY: pickle.load can execute arbitrary code; only load trusted caches.
with open('./datasets/JSUT/jsut.p', 'rb') as f:
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = pickle.load(
        f)

with open('./datasets/target_voice/target_voice.p', 'rb') as f:
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = pickle.load(
        f)

# Load the input utterance at hp.rate and run the vocoder analysis on it.
wav, _ = librosa.load('./outputs/100002.wav', sr=hp.rate)
f0, timeaxis, sp, ap = world_decompose(wav, hp.rate)
f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,