def conversion(training_data_dir, model_dir, model_name, data_dir, conversion_direction, output_dir, pc):
    """Convert every wav file in ``data_dir`` with a trained CycleGAN2 model.

    Args:
        training_data_dir: directory holding ``mcep_normalization.npz`` and
            ``logf0s_normalization.npz`` produced during preprocessing.
        model_dir: directory containing the TF checkpoint files.
        model_name: unused here; kept for interface compatibility.
        data_dir: directory of input wav files to convert.
        conversion_direction: ``'A2B'`` for A->B; any other value means B->A.
        output_dir: where converted wavs are written (created if missing).
        pc: if True, also convert pitch via log-F0 statistics matching.
    """
    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0
    frame_size = 128  # the model consumes fixed 128-frame segments

    model = CycleGAN2(num_features=num_features, mode='test')

    # Restore the newest checkpoint named in the TF "checkpoint" index file.
    ckpt_index = os.path.join(model_dir, "checkpoint")
    if os.path.exists(ckpt_index):
        with open(ckpt_index, "r") as f:
            all_ckpt = f.readlines()
        # last line looks like: all_model_checkpoint_paths: "name"
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        assert os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")), "The checkpoint is not exist."
        model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
        print("Loading pretrained model {}".format(pretrain_ckpt))

    mcep_normalization_params = np.load(os.path.join(training_data_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    # Select source/target statistics once per direction instead of
    # duplicating the whole conversion pipeline in two branches.
    if conversion_direction == 'A2B':
        print("AtoB")
        f0_mean_src, f0_std_src, f0_mean_tgt, f0_std_tgt = logf0s_mean_A, logf0s_std_A, logf0s_mean_B, logf0s_std_B
        mcep_mean_src, mcep_std_src, mcep_mean_tgt, mcep_std_tgt = mcep_mean_A, mcep_std_A, mcep_mean_B, mcep_std_B
    else:
        print("BtoA")
        # BUG FIX: the original used the A->B log-F0 statistics (src=A, tgt=B)
        # in the B->A branch as well; B->A must map B's pitch statistics onto A's.
        f0_mean_src, f0_std_src, f0_mean_tgt, f0_std_tgt = logf0s_mean_B, logf0s_std_B, logf0s_mean_A, logf0s_std_A
        mcep_mean_src, mcep_std_src, mcep_mean_tgt, mcep_std_tgt = mcep_mean_B, mcep_std_B, mcep_mean_A, mcep_std_A

    def _convert_coded_sp(coded_sp_norm):
        """Zero-pad to a multiple of frame_size, run the model segment by segment, then strip the padding."""
        remain = frame_size - coded_sp_norm.shape[1] % frame_size
        padded = coded_sp_norm.shape[1] % frame_size != 0
        if padded:
            # use num_features rather than the original hard-coded 24
            coded_sp_norm = np.concatenate((coded_sp_norm, np.zeros((num_features, remain))), axis=1)
        converted = model.test(inputs=np.array([coded_sp_norm[:, 0:frame_size]]),
                               direction=conversion_direction)[0]
        for seg in range(1, coded_sp_norm.shape[1] // frame_size):
            chunk = model.test(inputs=np.array([coded_sp_norm[:, seg * frame_size:(seg + 1) * frame_size]]),
                               direction=conversion_direction)[0]
            converted = np.concatenate((converted, chunk), axis=1)
        if padded:
            converted = converted[:, :-remain]
        return converted

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    files = os.listdir(data_dir)  # hoisted: the original re-listed the directory every iteration
    for i in trange(len(files)):
        file = files[i]
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)

        if pc:
            print("pitch convert")
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=f0_mean_src, std_log_src=f0_std_src,
                                            mean_log_target=f0_mean_tgt, std_log_target=f0_std_tgt)
        else:
            print("pitch same")
            f0_converted = f0

        # Normalize with the source domain's statistics, convert, then
        # denormalize with the target domain's statistics.
        coded_sp_norm = (coded_sp.T - mcep_mean_src) / mcep_std_src
        coded_sp_converted_norm = _convert_coded_sp(coded_sp_norm)
        coded_sp_converted = coded_sp_converted_norm * mcep_std_tgt + mcep_mean_tgt

        # Back to (time, features) layout; WORLD needs a C-contiguous array.
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted, fs=sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted, decoded_sp=decoded_sp_converted, ap=ap,
                                                 fs=sampling_rate, frame_period=frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)),
                                 wav_transformed, sampling_rate)
def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name, random_seed,
          validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir, MCEPs_dim, lambda_list):
    """Train a CycleGAN2 voice-conversion model on pre-extracted MCEP features.

    Args:
        train_A_dir / train_B_dir: unused here (features are preloaded from
            pickles); kept for interface compatibility.
        training_data_dir: directory with ``A_coded_norm.pk``, ``B_coded_norm.pk``,
            ``mcep_normalization.npz`` and ``logf0s_normalization.npz``.
        model_dir: checkpoint directory; training resumes from the newest
            checkpoint found there.
        model_name: base name used for saved checkpoints.
        random_seed: seed for numpy's RNG.
        validation_A_dir / validation_B_dir: optional wav dirs converted every
            500 epochs into ``output_dir/converted_A`` / ``converted_B``.
        output_dir: root for validation outputs.
        tensorboard_log_dir: passed to the model for summary logging.
        MCEPs_dim: number of MCEP coefficients (model feature dimension).
        lambda_list: [lambda_cycle, lambda_identity] loss weights.
    """
    np.random.seed(random_seed)

    num_epochs = 2000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]
    lambda_identity = lambda_list[1]

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)
    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    mcep_normalization_params = np.load(os.path.join(training_data_dir, 'mcep_normalization.npz'))
    coded_sps_A_mean = mcep_normalization_params['mean_A']
    coded_sps_A_std = mcep_normalization_params['std_A']
    coded_sps_B_mean = mcep_normalization_params['mean_B']
    coded_sps_B_std = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    log_f0s_mean_A = logf0s_normalization_params['mean_A']
    log_f0s_std_A = logf0s_normalization_params['std_A']
    log_f0s_mean_B = logf0s_normalization_params['mean_B']
    log_f0s_std_B = logf0s_normalization_params['std_B']

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")

    # Model define
    model = CycleGAN2(num_features=num_mcep, log_dir=tensorboard_log_dir, model_name=model_name)

    # Resume from the newest checkpoint if one exists; the epoch number is
    # encoded in the checkpoint name as "<model_name>-<epoch>.ckpt".
    epoch = 0
    ckpt_index = os.path.join(model_dir, "checkpoint")
    if os.path.exists(ckpt_index):
        with open(ckpt_index, "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        if os.path.exists(os.path.join(model_dir, pretrain_ckpt + ".index")):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
    else:
        print("Training model from 1 epoch")

    def _validate(src_dir, out_dir, direction, f0_stats, mcep_src, mcep_tgt):
        """Convert every wav in src_dir with the current model and write to out_dir."""
        f0_mean_src, f0_std_src, f0_mean_tgt, f0_std_tgt = f0_stats
        mcep_mean_src, mcep_std_src = mcep_src
        mcep_mean_tgt, mcep_std_tgt = mcep_tgt
        files = os.listdir(src_dir)
        for idx in trange(len(files)):
            file = files[idx]
            filepath = os.path.join(src_dir, file)
            wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
            # pad so the frame count is divisible by 4 (model downsampling factor)
            wav = wav_padding(wav=wav, sr=sampling_rate, frame_period=frame_period, multiple=4)
            f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate, frame_period=frame_period)
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=f0_mean_src, std_log_src=f0_std_src,
                                            mean_log_target=f0_mean_tgt, std_log_target=f0_std_tgt)
            coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_mcep)
            coded_sp_norm = (coded_sp.T - mcep_mean_src) / mcep_std_src
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]), direction=direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_tgt + mcep_mean_tgt
            coded_sp_converted = np.ascontiguousarray(coded_sp_converted.T)
            decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted, fs=sampling_rate)
            wav_transformed = world_speech_synthesis(f0=f0_converted, decoded_sp=decoded_sp_converted, ap=ap,
                                                     fs=sampling_rate, frame_period=frame_period)
            librosa.output.write_wav(os.path.join(out_dir, os.path.basename(file)),
                                     wav_transformed, sampling_rate)

    for k in range(epoch + 1, num_epochs):
        print('Epoch: %d' % k)
        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)
        n_samples = dataset_A.shape[0]

        # ------------------------------- one epoch learning ------------------------------- #
        for i in trange(n_samples // mini_batch_size):
            # BUG FIX: the original used the constant resume-point `epoch` here,
            # so the global iteration count never advanced across epochs and the
            # lambda_identity / learning-rate schedules below never triggered
            # when training from scratch. Use the current 0-based epoch index.
            num_iterations = n_samples // mini_batch_size * (k - 1) + i
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size
            generator_loss, discriminator_loss, generator_loss_A2B = model.train(
                input_A=dataset_A[start:end], input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle, lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            # log twice per epoch
            if i % (n_samples // 2) == 0:
                print('Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                      .format(num_iterations, generator_learning_rate, discriminator_learning_rate,
                              generator_loss, discriminator_loss))

        # Last model save
        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' % (
            time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60), (time_elapsed_epoch % 60 // 1)))

        # ------------------------------- validation inference ------------------------------- #
        if validation_A_dir is not None and k % 500 == 0:
            print('Generating Validation Data B from A...')
            _validate(validation_A_dir, validation_A_output_dir, 'A2B',
                      (log_f0s_mean_A, log_f0s_std_A, log_f0s_mean_B, log_f0s_std_B),
                      (coded_sps_A_mean, coded_sps_A_std),
                      (coded_sps_B_mean, coded_sps_B_std))
        if validation_B_dir is not None and k % 500 == 0:
            print('Generating Validation Data A from B...')
            _validate(validation_B_dir, validation_B_output_dir, 'B2A',
                      (log_f0s_mean_B, log_f0s_std_B, log_f0s_mean_A, log_f0s_std_A),
                      (coded_sps_B_mean, coded_sps_B_std),
                      (coded_sps_A_mean, coded_sps_A_std))
def seg_and_pad(src, n_frames): n_origin = src.shape[1] n_padded = (n_origin // n_frames + 1) * n_frames left_pad = (n_padded - n_origin) // 2 right_pad = n_padded - n_origin - left_pad src = np.pad(src, [(0, 0), (left_pad, right_pad)], 'constant', constant_values=0) src = np.reshape(src, [-1, hp.num_mceps, n_frames]) return src model = CycleGAN2() latest = tf.train.latest_checkpoint(hp.weights_dir) model.load_weights(latest) print('Loading cached data...') with open('./datasets/JSUT/jsut.p', 'rb') as f: coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std, log_f0s_mean_A, log_f0s_std_A = pickle.load( f) with open('./datasets/target_voice/target_voice.p', 'rb') as f: coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std, log_f0s_mean_B, log_f0s_std_B = pickle.load( f) wav, _ = librosa.load('./outputs/100002.wav', sr=hp.rate) f0, timeaxis, sp, ap = world_decompose(wav, hp.rate) f0_converted = pitch_conversion(f0, log_f0s_mean_A, log_f0s_std_A,