Example #1
0
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed, validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir):
    """Preprocess speech data and train a CycleGAN voice-conversion model.

    WORLD features (F0, spectral envelope, aperiodicity) are extracted from
    the wavs in ``train_A_dir`` and ``train_B_dir``, normalized, and used to
    train the CycleGAN for ``num_epochs`` epochs.  Normalization statistics
    are saved into ``model_dir`` so conversion can be reproduced at
    inference time.  Every 50 epochs the current model converts the files in
    ``validation_A_dir`` / ``validation_B_dir`` (when given) and writes the
    converted wavs under ``output_dir``.

    NOTE(review): ``tensorboard_log_dir`` is accepted but never used in this
    function — confirm whether logging was intended.
    """

    np.random.seed(random_seed)

    # Training hyperparameters.  The identity loss is dropped after 10k
    # iterations and both learning rates decay linearly after 200k
    # iterations (see the inner mini-batch loop below).
    num_epochs = 5000
    mini_batch_size = 1 # mini_batch_size = 1 is better
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24  # number of mel-cepstral coefficients per frame
    frame_period = 5.0
    n_frames = 128  # frames per randomly sampled training segment
    lambda_cycle = 10
    lambda_identity = 5

    print('Preprocessing Data...')

    start_time = time.time()

    # Load every training wav for both speakers at the target sampling rate.
    wavs_A = load_wavs(wav_dir = train_A_dir, sr = sampling_rate)
    wavs_B = load_wavs(wav_dir = train_B_dir, sr = sampling_rate)

    # WORLD analysis: F0 contours, time axes, spectral envelopes,
    # aperiodicities, and MCEP-coded spectral envelopes per utterance.
    f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(wavs = wavs_A, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)
    f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(wavs = wavs_B, fs = sampling_rate, frame_period = frame_period, coded_dim = num_mcep)

    # Per-speaker log-F0 statistics; these drive the linear pitch
    # conversion applied during validation synthesis below.
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    print('Log Pitch A')
    print('Mean: %f, Std: %f' %(log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' %(log_f0s_mean_B, log_f0s_std_B))


    # Transpose each utterance's MCEP matrix — presumably to the
    # (features, frames) layout the model consumes; note the validation
    # path below also transposes with `.T` before normalizing.
    coded_sps_A_transposed = transpose_in_list(lst = coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst = coded_sps_B)

    # Fit per-speaker mean/std and normalize the MCEPs (helper name
    # "transoform" is a typo preserved from the project's utils module).
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(coded_sps = coded_sps_A_transposed)
    print("Input data fixed.")
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(coded_sps = coded_sps_B_transposed)

    # Persist normalization statistics next to the model checkpoints so
    # standalone conversion can denormalize consistently.
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'), mean_A = log_f0s_mean_A, std_A = log_f0s_std_A, mean_B = log_f0s_mean_B, std_B = log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'), mean_A = coded_sps_A_mean, std_A = coded_sps_A_std, mean_B = coded_sps_B_mean, std_B = coded_sps_B_std)

    # Output directories for converted validation audio.
    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time

    print('Preprocessing Done.')

    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' % (time_elapsed // 3600, (time_elapsed % 3600 // 60), (time_elapsed % 60 // 1)))

    model = CycleGAN(num_features = num_mcep)

    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)
        # The string below is a disabled epoch-based schedule kept for
        # reference; the active schedule is iteration-based (further down).
        '''
        if epoch > 60:
            lambda_identity = 0
        if epoch > 1250:
            generator_learning_rate = max(0, generator_learning_rate - 0.0000002)
            discriminator_learning_rate = max(0, discriminator_learning_rate - 0.0000001)
        '''

        start_time_epoch = time.time()

        # Fresh random fixed-length segment pairs every epoch.
        dataset_A, dataset_B = sample_train_data(dataset_A = coded_sps_A_norm, dataset_B = coded_sps_B_norm, n_frames = n_frames)

        n_samples = dataset_A.shape[0]

        for i in range(n_samples // mini_batch_size):

            # Global iteration counter across all epochs.
            num_iterations = n_samples // mini_batch_size * epoch + i

            # Schedule: drop identity loss after 10k iterations, then
            # linearly decay both learning rates after 200k iterations.
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(input_A = dataset_A[start:end], input_B = dataset_B[start:end], lambda_cycle = lambda_cycle, lambda_identity = lambda_identity, generator_learning_rate = generator_learning_rate, discriminator_learning_rate = discriminator_learning_rate)

            if i % 50 == 0:
                #print('Iteration: %d, Generator Loss : %f, Discriminator Loss : %f' % (num_iterations, generator_loss, discriminator_loss))
                print('Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'.format(num_iterations, generator_learning_rate, discriminator_learning_rate, generator_loss, discriminator_loss))

        # Checkpoint after every epoch (overwrites the same filename).
        model.save(directory = model_dir, filename = model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' % (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # Every 50 epochs, convert validation set A -> B with the current
        # model: analyze with WORLD, shift pitch via log-F0 statistics,
        # normalize MCEPs with SOURCE stats, convert, denormalize with
        # TARGET stats, then resynthesize and write the wav.
        if validation_A_dir is not None:
            if epoch % 50 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr = sampling_rate, mono = True)
                    # Pad so the frame count is a multiple of 4 (model constraint).
                    wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
                    f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
                    f0_converted = pitch_conversion(f0 = f0, mean_log_src = log_f0s_mean_A, std_log_src = log_f0s_std_A, mean_log_target = log_f0s_mean_B, std_log_target = log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = 'A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    # pyworld requires C-contiguous float arrays.
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
                    wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
                    librosa.output.write_wav(os.path.join(validation_A_output_dir, os.path.basename(file)), wav_transformed, sampling_rate)

        # Mirror of the block above for validation set B -> A.
        if validation_B_dir is not None:
            if epoch % 50 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr = sampling_rate, mono = True)
                    wav = wav_padding(wav = wav, sr = sampling_rate, frame_period = frame_period, multiple = 4)
                    f0, timeaxis, sp, ap = world_decompose(wav = wav, fs = sampling_rate, frame_period = frame_period)
                    f0_converted = pitch_conversion(f0 = f0, mean_log_src = log_f0s_mean_B, std_log_src = log_f0s_std_B, mean_log_target = log_f0s_mean_A, std_log_target = log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp = sp, fs = sampling_rate, dim = num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs = np.array([coded_sp_norm]), direction = 'B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(coded_sp = coded_sp_converted, fs = sampling_rate)
                    wav_transformed = world_speech_synthesis(f0 = f0_converted, decoded_sp = decoded_sp_converted, ap = ap, fs = sampling_rate, frame_period = frame_period)
                    librosa.output.write_wav(os.path.join(validation_B_output_dir, os.path.basename(file)), wav_transformed, sampling_rate)
Example #2
0
def train(img_A_dir, img_B_dir, model_dir, model_name, random_seed,
          batch_size_maximum, validation_A_dir, validation_B_dir, output_dir,
          tensorboard_log_dir):
    """Train an image-to-image CycleGAN.

    Images are loaded once from ``img_A_dir`` / ``img_B_dir``; each epoch a
    fresh random batch of crops is sampled from them and the model is
    trained one mini-batch at a time.  The model is saved after every
    epoch, and — when validation directories are given — every validation
    image is converted (A2B and B2A) and written under ``output_dir``
    every epoch.
    """

    np.random.seed(random_seed)

    num_epochs = 1000
    mini_batch_size = 1  # mini_batch_size = 1 is better
    learning_rate = 0.0002
    input_size = [256, 256, 3]  # model input: height, width, channels
    num_filters = 64  # Tried num_filters = 8 still not good for 200 epochs

    # Output directories for converted validation images.
    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    model = CycleGAN(input_size=input_size,
                     num_filters=num_filters,
                     mode='train',
                     log_dir=tensorboard_log_dir)

    # Raw datasets are loaded once; per-epoch augmentation happens in
    # sample_train_data below (load at 286, crop to 256 — the standard
    # CycleGAN resize-and-random-crop augmentation).
    dataset_A_raw = load_data(img_dir=img_A_dir, load_size=256)
    dataset_B_raw = load_data(img_dir=img_B_dir, load_size=256)

    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(
            dataset_A_raw,
            dataset_B_raw,
            load_size=286,
            output_size=256,
            batch_size_maximum=batch_size_maximum)

        n_samples = dataset_A.shape[0]
        for i in range(n_samples // mini_batch_size):

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                learning_rate=learning_rate)

            # Periodic progress log (every 50 mini-batches).
            if i % 50 == 0:
                print(
                    'Minibatch: %d, Generator Loss : %f, Discriminator Loss : %f'
                    % (i, generator_loss, discriminator_loss))

        # Checkpoint after every epoch (overwrites the same filename).
        model.save(directory=model_dir, filename=model_name)

        # Convert validation set A -> B: resize to the model input size,
        # scale, run the generator, unscale, and restore the original
        # image dimensions before writing.
        if validation_A_dir is not None:
            for file in os.listdir(validation_A_dir):
                filepath = os.path.join(validation_A_dir, file)
                img = cv2.imread(filepath)
                # img_channel is unpacked for shape symmetry but unused.
                img_height, img_width, img_channel = img.shape
                # cv2.resize takes (width, height), hence the index swap.
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]),
                                           direction='A2B')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted,
                                           (img_width, img_height))
                cv2.imwrite(
                    os.path.join(validation_A_output_dir,
                                 os.path.basename(file)), img_converted)

        # Mirror of the block above for validation set B -> A.
        if validation_B_dir is not None:
            for file in os.listdir(validation_B_dir):
                filepath = os.path.join(validation_B_dir, file)
                img = cv2.imread(filepath)
                img_height, img_width, img_channel = img.shape
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]),
                                           direction='B2A')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted,
                                           (img_width, img_height))
                cv2.imwrite(
                    os.path.join(validation_B_output_dir,
                                 os.path.basename(file)), img_converted)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name,
          random_seed, validation_A_dir, validation_B_dir, output_dir):
    """Train a CycleGAN voice-conversion model from preprocessed features.

    Normalized MCEP features are loaded from pickles in
    ``training_data_dir`` together with the MCEP / log-F0 normalization
    statistics (.npz files) produced by a prior preprocessing step;
    ``train_A_dir`` / ``train_B_dir`` are kept for interface compatibility
    but are not read here.  Training resumes from the newest checkpoint
    found in ``model_dir`` when one exists.  Every 300 epochs the current
    model converts the wavs in ``validation_A_dir`` / ``validation_B_dir``
    (when given) and writes the results under ``output_dir``.

    Fixes vs. the previous revision:
    - the global iteration counter now uses the *current* epoch, so the
      lambda_identity / learning-rate schedules keep advancing after a
      checkpoint resume (it previously used the stale resume epoch);
    - the B->A validation loop now reads files from ``validation_B_dir``
      (it previously indexed the listing of ``validation_A_dir``);
    - the progress-log modulus is guarded against ``n_samples == 1``
      (``i % 0`` raised ZeroDivisionError);
    - directory listings are taken once per validation pass instead of
      on every loop iteration.
    """

    np.random.seed(random_seed)

    # Hyperparameters: identity loss is dropped after 10k iterations and
    # both learning rates decay linearly after 200k iterations.
    num_epochs = 2000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = 10
    lambda_identity = 5

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)

    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    # Normalization statistics saved by the preprocessing step; needed to
    # (de)normalize MCEPs and shift pitch at validation time.
    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    coded_sps_A_mean = mcep_normalization_params['mean_A']
    coded_sps_A_std = mcep_normalization_params['std_A']
    coded_sps_B_mean = mcep_normalization_params['mean_B']
    coded_sps_B_std = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    log_f0s_mean_A = logf0s_normalization_params['mean_A']
    log_f0s_std_A = logf0s_normalization_params['std_A']
    log_f0s_mean_B = logf0s_normalization_params['mean_B']
    log_f0s_std_B = logf0s_normalization_params['std_B']

    # Output directories for converted validation audio.
    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")

    model = CycleGAN(num_features=num_mcep)

    epoch = 0
    # Resume from the newest checkpoint listed in the TensorFlow
    # "checkpoint" index file.  The epoch is encoded in the checkpoint
    # filename as "<model_name>-<epoch>.ckpt".
    checkpoint_index = os.path.join(model_dir, "checkpoint")
    if os.path.exists(checkpoint_index):
        with open(checkpoint_index, "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        if os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
    else:
        print("Training model from 0 epoch")

    def _convert_directory(src_dir, dst_dir, direction, mean_log_src,
                           std_log_src, mean_log_target, std_log_target,
                           src_mean, src_std, target_mean, target_std):
        # Convert every wav in src_dir with the current model and write
        # the resynthesized audio into dst_dir.
        files = sorted(os.listdir(src_dir))
        for idx in trange(len(files)):
            file = files[idx]
            filepath = os.path.join(src_dir, file)
            wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
            # Pad so the frame count is a multiple of 4 (model constraint).
            wav = wav_padding(wav=wav,
                              sr=sampling_rate,
                              frame_period=frame_period,
                              multiple=4)
            f0, timeaxis, sp, ap = world_decompose(
                wav=wav, fs=sampling_rate, frame_period=frame_period)
            # Linear pitch conversion in the log-F0 domain.
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=mean_log_src,
                                            std_log_src=std_log_src,
                                            mean_log_target=mean_log_target,
                                            std_log_target=std_log_target)
            coded_sp = world_encode_spectral_envelop(sp=sp,
                                                     fs=sampling_rate,
                                                     dim=num_mcep)
            # Normalize with SOURCE stats, convert, denormalize with
            # TARGET stats.
            coded_sp_norm = (coded_sp.T - src_mean) / src_std
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm]), direction=direction)[0]
            coded_sp_converted = coded_sp_converted_norm * target_std + target_mean
            # pyworld requires C-contiguous arrays.
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
            decoded_sp_converted = world_decode_spectral_envelop(
                coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = world_speech_synthesis(
                f0=f0_converted,
                decoded_sp=decoded_sp_converted,
                ap=ap,
                fs=sampling_rate,
                frame_period=frame_period)
            librosa.output.write_wav(
                os.path.join(dst_dir, os.path.basename(file)),
                wav_transformed, sampling_rate)

    for k in range(epoch + 1, num_epochs):
        print('Epoch: %d' % k)

        start_time_epoch = time.time()
        pool_A, pool_B = list(coded_sps_A_norm), list(coded_sps_B_norm)
        dataset_A, dataset_B = sample_train_data(dataset_A=pool_A,
                                                 dataset_B=pool_B,
                                                 n_frames=n_frames)
        print('dataset_A', np.shape(dataset_A), 'dataset_B',
              np.shape(dataset_B))
        n_samples = dataset_A.shape[0]

        for i in trange(n_samples // mini_batch_size):
            # BUG FIX: use the current epoch "k" here — the previous code
            # used the stale resume variable "epoch", so the schedules
            # below never advanced after resuming from a checkpoint.
            num_iterations = n_samples // mini_batch_size * k + i
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate -
                    discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            # Log roughly twice per epoch; max(1, ...) guards the
            # degenerate n_samples == 1 case (modulo-by-zero).
            if i % max(1, n_samples // 2) == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Checkpoint at epoch 1 (smoke test) and every 100 epochs.
        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))

        if validation_A_dir is not None and k % 300 == 0:
            print('Generating Validation Data B from A...')
            _convert_directory(validation_A_dir, validation_A_output_dir,
                               'A2B', log_f0s_mean_A, log_f0s_std_A,
                               log_f0s_mean_B, log_f0s_std_B,
                               coded_sps_A_mean, coded_sps_A_std,
                               coded_sps_B_mean, coded_sps_B_std)

        if validation_B_dir is not None and k % 300 == 0:
            print('Generating Validation Data A from B...')
            # BUG FIX: the previous code listed validation_A_dir here,
            # converting the wrong files (and crashing when the two
            # directories had different lengths).
            _convert_directory(validation_B_dir, validation_B_output_dir,
                               'B2A', log_f0s_mean_B, log_f0s_std_B,
                               log_f0s_mean_A, log_f0s_std_A,
                               coded_sps_B_mean, coded_sps_B_std,
                               coded_sps_A_mean, coded_sps_A_std)
Example #4
0
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed,
          validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir,
          gen_model, MCEPs_dim, lambda_list, processed_data_dir):

    gen_loss_thres = 100.0
    np.random.seed(random_seed)
    num_epochs = 5000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 44000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]
    lambda_identity = lambda_list[1]

    Speaker_A_features = os.path.join(processed_data_dir, 'wav_A.npz')
    Speaker_B_features = os.path.join(processed_data_dir, 'wav_B.npz')
    start_time = time.time()
    print('lookiong for preprocessed data in:{}'.format(processed_data_dir))
    if os.path.exists(Speaker_A_features) and os.path.exists(
            Speaker_B_features):
        print('#### loading processed data #######')
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = load_speaker_features(
            Speaker_A_features)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = load_speaker_features(
            Speaker_B_features)
    else:
        print('Preprocessing Data...')
        if not os.path.exists(processed_data_dir):
            os.makedirs(processed_data_dir)

        wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)

        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
            wavs=wavs_A,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_A_features,
                 f0s=f0s_A,
                 timeaxes=timeaxes_A,
                 sps=sps_A,
                 aps=aps_A,
                 coded_sps=coded_sps_A)

        del wavs_A

        wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)

        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
            wavs=wavs_B,
            fs=sampling_rate,
            frame_period=frame_period,
            coded_dim=num_mcep)
        np.savez(Speaker_B_features,
                 f0s=f0s_B,
                 timeaxes=timeaxes_B,
                 sps=sps_B,
                 aps=aps_B,
                 coded_sps=coded_sps_B)

        del wavs_B

        print('Data preprocessing finished !')
        return

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    coded_sps_A, f0s_A = remove_radical_pitch_samples(f0s_A, coded_sps_A,
                                                      log_f0s_mean_A,
                                                      log_f0s_std_A)
    coded_sps_B, f0s_B = remove_radical_pitch_samples(f0s_B, coded_sps_B,
                                                      log_f0s_mean_B,
                                                      log_f0s_std_B)

    print('recalculating mean and std of radical cleared f0s')
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)

    print("Input data fixed.")
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_A_transposed)
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A,
             std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B,
             std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean,
             std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean,
             std_B=coded_sps_B_std)

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)

    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time
    print('Preprocessing Done.')
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' %
          (time_elapsed // 3600, (time_elapsed % 3600 // 60),
           (time_elapsed % 60 // 1)))
    # ---------------------------------------------- Data preprocessing ---------------------------------------------- #

    # Model define
    model = CycleGAN(num_features=num_mcep,
                     log_dir=tensorboard_log_dir,
                     model_name=model_name,
                     gen_model=gen_model)
    # load model
    if os.path.exists(os.path.join(model_dir,
                                   (model_name + ".index"))) == True:
        model.load(filepath=os.path.join(model_dir, model_name))

    # =================================================== Training =================================================== #
    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)

        n_samples = dataset_A.shape[0]
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        for i in tqdm.tqdm(range(n_samples // mini_batch_size)):

            num_iterations = n_samples // mini_batch_size * epoch + i

            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate -
                    discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, generator_loss_A2B = model.train\
                (input_A = dataset_A[start:end], input_B = dataset_B[start:end],
                 lambda_cycle = lambda_cycle, lambda_identity = lambda_identity,
                 generator_learning_rate = generator_learning_rate, discriminator_learning_rate = discriminator_learning_rate)
            # issue #4,
            #            model.summary()

            # Minimum AtoB loss model save
            # if gen_loss_thres > generator_loss_A2B:
            #     gen_loss_thres = generator_loss_A2B
            #     best_model_name = 'Bestmodel' + model_name
            #     model.save(directory=model_dir, filename=best_model_name)
            #     print("generator loss / generator A2B loss ", generator_loss, generator_loss_A2B)

            if i % 50 == 0:
                print(
                    'Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                    .format(num_iterations, generator_learning_rate,
                            discriminator_learning_rate, generator_loss,
                            discriminator_loss))

        # Last model save
        if epoch % 10 == 0:
            model.save(directory=model_dir, filename=model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
        # -------------------------------------------- one epoch learning -------------------------------------------- #
        # ------------------------------------------- validation inference ------------------------------------------- #
        if validation_A_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_A,
                        std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B,
                        std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
                    # break

        if validation_B_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath,
                                          sr=sampling_rate,
                                          mono=True)
                    wav = wav_padding(wav=wav,
                                      sr=sampling_rate,
                                      frame_period=frame_period,
                                      multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0,
                        mean_log_src=log_f0s_mean_B,
                        std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A,
                        std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp,
                                                             fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed -
                                     coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array(
                        [coded_sp_norm]),
                                                         direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(
                        coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted,
                        decoded_sp=decoded_sp_converted,
                        ap=ap,
                        fs=sampling_rate,
                        frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir,
                                     os.path.basename(file)), wav_transformed,
                        sampling_rate)
Example #5
0
def train(train_dir, model_dir, model_name, random_seed, \
            validation_dir, output_dir, pre_train=None, \
            lambda_cycle=0, lambda_momenta=0):
    """Train a CycleGAN that converts pitch (F0) and MFC features between
    two speakers (A <-> B).

    Reads pre-extracted features from ``train.mat`` / ``valid.mat`` in
    ``train_dir``, shuffles A and B independently so the training data is
    non-parallel, then runs mini-batch training for ``num_epochs`` epochs.
    Every 100 epochs it plots validation pitch conversions as PNGs and
    saves a model checkpoint to ``model_dir``.

    Args:
        train_dir: directory containing ``train.mat`` and ``valid.mat``.
        model_dir: directory where checkpoints are written.
        model_name: base filename for saved checkpoints.
        random_seed: numpy RNG seed (controls the non-parallel shuffles).
        validation_dir: if not None, a per-run output directory is created
            under ``output_dir``.  NOTE(review): that directory is created
            but not visibly used in this function — confirm downstream use.
        output_dir: parent directory for the per-run validation output dir.
        pre_train: optional pre-trained model passed through to ``CycleGAN``.
        lambda_cycle: weight of the cycle-consistency loss term.
        lambda_momenta: weight of the momenta regularization loss term.
    """

    np.random.seed(random_seed)

    # ---- fixed training hyperparameters ----
    num_epochs = 500
    mini_batch_size = 1

    generator_learning_rate = 0.0001
    discriminator_learning_rate = 0.0000001

    num_mcep = 23   # mel-cepstral coefficients per frame
    n_frames = 128  # frames per training sample

    # Redundant self-assignments (the parameters already bind these names);
    # kept as-is for byte compatibility.
    lambda_cycle = lambda_cycle
    lambda_momenta = lambda_momenta

    # Run tag embedding both loss weights; used for the log filename and
    # the per-run output directories.
    lc_lm = "lc_"+str(lambda_cycle) \
            +"_lm_"+str(lambda_momenta)

    logger_file = './log/' + lc_lm + '.log'

    if not os.path.exists('./log'):
        os.mkdir('./log')

    # Start each run with a fresh log file.
    if os.path.exists(logger_file):
        os.remove(logger_file)

    logging.basicConfig(filename=logger_file, \
                            level=logging.DEBUG)

    logging.info("lambda_cycle - {}".format(lambda_cycle))
    logging.info("lambda_momenta - {}".format(lambda_momenta))

    # NOTE(review): os.mkdir raises if ./generated_pitch itself is missing
    # (os.makedirs would not), and the cleanup glob removes *.png from the
    # PARENT directory rather than the lc_lm subdirectory — confirm both
    # behaviors are intended.
    if not os.path.isdir("./generated_pitch/" + lc_lm):
        os.mkdir("./generated_pitch/" + lc_lm)
    else:
        for f in glob(os.path.join("./generated_pitch/", "*.png")):
            os.remove(f)

    start_time = time.time()

    # Both the training and validation .mat files are read from train_dir
    # (validation_dir is not used for loading here).
    data_train = scio.loadmat(os.path.join(train_dir, 'train.mat'))
    data_valid = scio.loadmat(os.path.join(train_dir, 'valid.mat'))

    # F0 features get a trailing channel axis; MFC features are used as-is.
    pitch_A_train = np.expand_dims(data_train['src_f0_feat'], axis=-1)
    pitch_B_train = np.expand_dims(data_train['tar_f0_feat'], axis=-1)
    mfc_A_train = data_train['src_mfc_feat']
    mfc_B_train = data_train['tar_mfc_feat']

    pitch_A_valid = np.expand_dims(data_valid['src_f0_feat'], axis=-1)
    pitch_B_valid = np.expand_dims(data_valid['tar_f0_feat'], axis=-1)
    mfc_A_valid = data_valid['src_mfc_feat']
    mfc_B_valid = data_valid['tar_mfc_feat']

    # Shuffle to get non-parallel training data
    # indices_train is shuffled twice: A uses the first permutation and B
    # the second, so the A/B utterances are no longer aligned.
    indices_train = np.arange(0, pitch_A_train.shape[0])
    np.random.shuffle(indices_train)
    pitch_A_train = pitch_A_train[indices_train]
    mfc_A_train = mfc_A_train[indices_train]
    np.random.shuffle(indices_train)
    pitch_B_train = pitch_B_train[indices_train]
    mfc_B_train = mfc_B_train[indices_train]

    # Fixed validation sample set for the periodic plots below.
    mfc_A_valid, pitch_A_valid, \
        mfc_B_valid, pitch_B_valid = preproc.sample_data(mfc_A=mfc_A_valid, \
                                    mfc_B=mfc_B_valid, pitch_A=pitch_A_valid, \
                                    pitch_B=pitch_B_valid)

    if validation_dir is not None:
        validation_output_dir = os.path.join(output_dir, lc_lm)
        if not os.path.exists(validation_output_dir):
            os.makedirs(validation_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time

    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' % (time_elapsed // 3600, \
                                                                   (time_elapsed % 3600 // 60), \
                                                                   (time_elapsed % 60 // 1)))

    #use pre_train arg to provide trained model
    model = CycleGAN(dim_pitch=1, dim_mfc=num_mcep, \
                n_frames=n_frames, pre_train=pre_train)

    for epoch in range(1, num_epochs + 1):

        print('Epoch: %d' % epoch)
        logging.info('Epoch: %d' % epoch)

        start_time_epoch = time.time()

        # Re-sample the training pairs each epoch.
        mfc_A, pitch_A, \
            mfc_B, pitch_B = preproc.sample_data(mfc_A=mfc_A_train, \
                            mfc_B=mfc_B_train, pitch_A=pitch_A_train, \
                            pitch_B=pitch_B_train)

        n_samples = mfc_A.shape[0]

        # Per-epoch loss accumulators, averaged into the log after the loop.
        train_gen_loss = []
        train_disc_loss = []

        for i in range(n_samples // mini_batch_size):

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            # One optimization step; gen_*/mom_* outputs are unused here
            # (they are only consumed in the validation-plot branch below).
            generator_loss, discriminator_loss, \
            gen_A, gen_B, \
            mom_A, mom_B = model.train(mfc_A=mfc_A[start:end], \
                                        mfc_B=mfc_B[start:end], \
                                        pitch_A=pitch_A[start:end], \
                                        pitch_B=pitch_B[start:end], \
                                        lambda_cycle=lambda_cycle, \
                                        lambda_momenta=lambda_momenta, \
                                        generator_learning_rate=generator_learning_rate, \
                                        discriminator_learning_rate=discriminator_learning_rate)

            train_gen_loss.append(generator_loss)
            train_disc_loss.append(discriminator_loss)

        logging.info("Train Generator Loss- {}".format(
            np.mean(train_gen_loss)))
        logging.info("Train Discriminator Loss- {}".format(
            np.mean(train_disc_loss)))

        # Every 100 epochs: plot pitch conversions for each validation
        # sample (A->B on the left, B->A on the right).
        if epoch % 100 == 0:

            for i in range(mfc_A_valid.shape[0]):

                gen_A, gen_B, mom_A, mom_B \
                        = model.test_gen(mfc_A=mfc_A_valid[i:i+1], \
                                mfc_B=mfc_B_valid[i:i+1], \
                                pitch_A=pitch_A_valid[i:i+1], \
                                pitch_B=pitch_B_valid[i:i+1])

                pylab.figure(figsize=(12, 12))
                # Left panel: input A vs. generated B and its momenta.
                pylab.subplot(121)
                pylab.plot(pitch_A_valid[i].reshape(-1, ), label='Input A')
                pylab.plot(gen_B.reshape(-1, ), label='Generated B')
                pylab.plot(mom_B.reshape(-1, ), label='Generated momenta')
                pylab.legend(loc=2)

                # Right panel: input B vs. generated A and its momenta.
                pylab.subplot(122)
                pylab.plot(pitch_B_valid[i].reshape(-1, ), label='Input B')
                pylab.plot(gen_A.reshape(-1, ), label='Generated A')
                pylab.plot(mom_A.reshape(-1, ), label='Generated momenta')
                pylab.legend(loc=2)

                pylab.title('Epoch ' + str(epoch) + ' example ' + str(i + 1))
                # NOTE(review): saves into ./generated_pitch/ directly, not
                # the lc_lm subdirectory created above — confirm intended.
                pylab.savefig('./generated_pitch/' + str(epoch) + '_' +
                              str(i + 1) + '.png')
                pylab.close()

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch

        logging.info('Time Elapsed for This Epoch: %02d:%02d:%02d' % (time_elapsed_epoch // 3600, \
                (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # Checkpoint every 100 epochs, tagged with the epoch number.
        if epoch % 100 == 0:
            cur_model_name = model_name + "_" + str(epoch) + ".ckpt"
            model.save(directory=model_dir, filename=cur_model_name)