def conversion(model_filepath, img_dir, conversion_direction, output_dir):
    input_size = [256, 256, 3]
    num_filters = 64

    model = CycleGAN(input_size=input_size, num_filters=num_filters, mode='test')
    model.load(filepath=model_filepath)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(img_dir):
        filepath = os.path.join(img_dir, file)
        img = cv2.imread(filepath)
        img_height, img_width, img_channel = img.shape
        img = cv2.resize(img, (input_size[1], input_size[0]))
        img = image_scaling(imgs=img)
        img_converted = model.test(inputs=np.array([img]),
                                   direction=conversion_direction)[0]
        img_converted = image_scaling_inverse(imgs=img_converted)
        img_converted = cv2.resize(img_converted, (img_width, img_height))
        cv2.imwrite(os.path.join(output_dir, os.path.basename(file)), img_converted)
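# Usage sketch (hypothetical paths; assumes a checkpoint trained with the same
# input_size and num_filters hard-coded above):
#
#   conversion(model_filepath='model/image_cyclegan.ckpt',
#              img_dir='data/testA',
#              conversion_direction='A2B',
#              output_dir='converted_A')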
def conversion(model_dir, model_name, data_dir, conversion_direction, output_dir):
    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features, mode='test')
    model.load(filepath=os.path.join(model_dir, model_name))

    mcep_normalization_params = np.load(os.path.join(model_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(model_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(data_dir):
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        wav = wav_padding(wav=wav, sr=sampling_rate, frame_period=frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_A,
                                            std_log_src=logf0s_std_A,
                                            mean_log_target=logf0s_mean_B,
                                            std_log_target=logf0s_std_B)
            # f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                 direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_B,
                                            std_log_src=logf0s_std_B,
                                            mean_log_target=logf0s_mean_A,
                                            std_log_target=logf0s_std_A)
            # f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                 direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted,
                                                             fs=sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap, fs=sampling_rate,
                                                 frame_period=frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)),
                                 wav_transformed, sampling_rate)
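# Usage sketch (hypothetical paths; model_dir must also contain the
# 'mcep_normalization.npz' and 'logf0s_normalization.npz' files written during
# training):
#
#   conversion(model_dir='model/sf1_tm1',
#              model_name='sf1_tm1.ckpt',
#              data_dir='data/evaluation/SF1',
#              conversion_direction='A2B',
#              output_dir='converted_voices')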
def conversion(model_dir, model_name, data_dir, conversion_direction, output_dir,
               pc, generation_model):
    num_features = 32
    sampling_rate = 44000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features, mode='test', gen_model=generation_model)
    model.load(filepath=os.path.join(model_dir, model_name))

    mcep_normalization_params = np.load(os.path.join(model_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(os.path.join(model_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file in os.listdir(data_dir):
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        # wav = wav_padding(wav=wav, sr=sampling_rate, frame_period=frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)
        coded_sp_transposed = coded_sp.T
        frame_size = 128

        if conversion_direction == 'A2B':
            print("AtoB")
            # pitch
            if pc:
                print("pitch convert")
                f0_converted = pitch_conversion(f0=f0,
                                                mean_log_src=logf0s_mean_A,
                                                std_log_src=logf0s_std_A,
                                                mean_log_target=logf0s_mean_B,
                                                std_log_target=logf0s_std_B)
            else:
                print("pitch same")
                f0_converted = f0
            # normalization with the A-domain statistics
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            # zero-pad the time axis to a multiple of frame_size
            remain, padd = frame_size - coded_sp_norm.shape[1] % frame_size, False
            if coded_sp_norm.shape[1] % frame_size != 0:
                # Fixed: pad with num_features rows rather than a hard-coded 32.
                coded_sp_norm = np.concatenate(
                    (coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True
            # segment-wise inference
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm[:, 0:frame_size]]),
                direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(
                    inputs=np.array([coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                    direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate((coded_sp_converted_norm, ccat), axis=1)
            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            print("BtoA")
            if pc:
                print("pitch convert")
                # Fixed: the B2A direction must map B statistics onto A statistics;
                # this branch previously reused the A-to-B statistics.
                f0_converted = pitch_conversion(f0=f0,
                                                mean_log_src=logf0s_mean_B,
                                                std_log_src=logf0s_std_B,
                                                mean_log_target=logf0s_mean_A,
                                                std_log_target=logf0s_std_A)
            else:
                f0_converted = f0
            # normalization with the B-domain statistics
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            # zero-pad the time axis to a multiple of frame_size
            remain, padd = frame_size - coded_sp_norm.shape[1] % frame_size, False
            if coded_sp_norm.shape[1] % frame_size != 0:
                coded_sp_norm = np.concatenate(
                    (coded_sp_norm, np.zeros((num_features, remain))), axis=1)
                padd = True
            # segment-wise inference
            coded_sp_converted_norm = model.test(
                inputs=np.array([coded_sp_norm[:, 0:frame_size]]),
                direction=conversion_direction)[0]
            for i in range(1, coded_sp_norm.shape[1] // frame_size):
                ccat = model.test(
                    inputs=np.array([coded_sp_norm[:, i * frame_size:(i + 1) * frame_size]]),
                    direction=conversion_direction)[0]
                coded_sp_converted_norm = np.concatenate((coded_sp_converted_norm, ccat), axis=1)
            if padd:
                coded_sp_converted_norm = coded_sp_converted_norm[:, :-remain]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        # output translation value processing
        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted,
                                                             fs=sampling_rate)
        # WORLD vocoder synthesis
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap, fs=sampling_rate,
                                                 frame_period=frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)),
                                 wav_transformed, sampling_rate)
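# Usage sketch (hypothetical values): pc toggles log-f0 pitch conversion, and
# generation_model selects the generator architecture passed through to
# CycleGAN. The spectral envelope is zero-padded to a multiple of 128 frames
# and converted segment by segment, so long files fit a fixed-size generator.
#
#   conversion(model_dir='model/singer_A2B', model_name='singer.ckpt',
#              data_dir='data/test_A', conversion_direction='A2B',
#              output_dir='converted', pc=True, generation_model='CNN')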
class Converter():

    def __init__(self, model_dir, model_name):
        self.num_features = 24
        self.sampling_rate = 16000
        self.frame_period = 5.0

        self.model = CycleGAN(num_features=self.num_features, mode='test')
        self.model.load(filepath=os.path.join(model_dir, model_name))

        self.mcep_normalization_params = np.load(
            os.path.join(model_dir, 'mcep_normalization.npz'))
        self.mcep_mean_A = self.mcep_normalization_params['mean_A']
        self.mcep_std_A = self.mcep_normalization_params['std_A']
        self.mcep_mean_B = self.mcep_normalization_params['mean_B']
        self.mcep_std_B = self.mcep_normalization_params['std_B']

        self.logf0s_normalization_params = np.load(
            os.path.join(model_dir, 'logf0s_normalization.npz'))
        self.logf0s_mean_A = self.logf0s_normalization_params['mean_A']
        self.logf0s_std_A = self.logf0s_normalization_params['std_A']
        self.logf0s_mean_B = self.logf0s_normalization_params['mean_B']
        self.logf0s_std_B = self.logf0s_normalization_params['std_B']

    def convert_to_pcm_data(self, wav, conversion_direction='A2B'):
        wav = wav_padding(wav=wav, sr=self.sampling_rate,
                          frame_period=self.frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=self.sampling_rate,
                                               frame_period=self.frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=self.sampling_rate,
                                                 dim=self.num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_A,
                                            std_log_src=self.logf0s_std_A,
                                            mean_log_target=self.logf0s_mean_B,
                                            std_log_target=self.logf0s_std_B)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_A) / self.mcep_std_A
            coded_sp_converted_norm = self.model.test(inputs=np.array([coded_sp_norm]),
                                                      direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_B + self.mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_B,
                                            std_log_src=self.logf0s_std_B,
                                            mean_log_target=self.logf0s_mean_A,
                                            std_log_target=self.logf0s_std_A)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_B) / self.mcep_std_B
            coded_sp_converted_norm = self.model.test(inputs=np.array([coded_sp_norm]),
                                                      direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_A + self.mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted,
                                                             fs=self.sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap, fs=self.sampling_rate,
                                                 frame_period=self.frame_period)
        # For debugging model output, uncomment the following line:
        # librosa.output.write_wav('model_output.wav', wav_transformed, self.sampling_rate)

        # TODO: Perhaps ditch this. It's probably unnecessary work.
        upsampled = librosa.resample(wav_transformed, self.sampling_rate, 48000)
        pcm_data = upsampled.astype(np.float64)
        stereo_pcm_data = np.tile(pcm_data, (2, 1)).T
        return stereo_pcm_data

    def convert_pcm_to_wav(self, stereo_pcm_data):
        buf = io.BytesIO()
        scipy.io.wavfile.write(buf, 48000, stereo_pcm_data.astype(np.float32))
        return buf

    def convert(self, wav, conversion_direction='A2B'):
        stereo_pcm_data = self.convert_to_pcm_data(
            wav, conversion_direction=conversion_direction)
        return self.convert_pcm_to_wav(stereo_pcm_data)
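# Usage sketch (hypothetical paths): feed a 16 kHz mono waveform and receive
# an in-memory 48 kHz stereo WAV.
#
#   converter = Converter(model_dir='model/sf1_tm1', model_name='sf1_tm1.ckpt')
#   wav, _ = librosa.load('sample.wav', sr=16000, mono=True)
#   buf = converter.convert(wav, conversion_direction='A2B')
#   with open('converted.wav', 'wb') as f:
#       f.write(buf.getvalue())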
def conversion(training_data_dir, model_dir, model_name, data_dir,
               conversion_direction, output_dir):
    num_features = 24
    sampling_rate = 16000
    frame_period = 5.0

    model = CycleGAN(num_features=num_features, mode='test')

    if os.path.exists(os.path.join(model_dir, "checkpoint")):
        with open(os.path.join(model_dir, "checkpoint"), "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        assert os.path.exists(os.path.join(model_dir, (pretrain_ckpt + ".index"))), \
            "The checkpoint does not exist."
        model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
        print("Loading pretrained model {}".format(pretrain_ckpt))

    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    mcep_mean_A = mcep_normalization_params['mean_A']
    mcep_std_A = mcep_normalization_params['std_A']
    mcep_mean_B = mcep_normalization_params['mean_B']
    mcep_std_B = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    logf0s_mean_A = logf0s_normalization_params['mean_A']
    logf0s_std_A = logf0s_normalization_params['std_A']
    logf0s_mean_B = logf0s_normalization_params['mean_B']
    logf0s_std_B = logf0s_normalization_params['std_B']

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    files = os.listdir(data_dir)  # list the directory once, not per iteration
    for i in trange(len(files)):
        file = files[i]
        filepath = os.path.join(data_dir, file)
        wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
        wav = wav_padding(wav=wav, sr=sampling_rate, frame_period=frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=sampling_rate,
                                               frame_period=frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate, dim=num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_A,
                                            std_log_src=logf0s_std_A,
                                            mean_log_target=logf0s_mean_B,
                                            std_log_target=logf0s_std_B)
            # f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_A) / mcep_std_A
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                 direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_B + mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=logf0s_mean_B,
                                            std_log_src=logf0s_std_B,
                                            mean_log_target=logf0s_mean_A,
                                            std_log_target=logf0s_std_A)
            # f0_converted = f0
            coded_sp_norm = (coded_sp_transposed - mcep_mean_B) / mcep_std_B
            coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                 direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * mcep_std_A + mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted,
                                                             fs=sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap, fs=sampling_rate,
                                                 frame_period=frame_period)
        librosa.output.write_wav(os.path.join(output_dir, os.path.basename(file)),
                                 wav_transformed, sampling_rate)
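# Usage sketch (hypothetical paths): unlike the variants above, this version
# resolves the latest checkpoint from model_dir/checkpoint itself and reads the
# normalization statistics from training_data_dir rather than model_dir.
#
#   conversion(training_data_dir='processed_data', model_dir='model',
#              model_name='cyclegan_vc', data_dir='data/evaluation',
#              conversion_direction='A2B', output_dir='converted')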
def train(train_A_dir, train_B_dir, model_dir, model_name, random_seed,
          validation_A_dir, validation_B_dir, output_dir, tensorboard_log_dir,
          gen_model, MCEPs_dim, lambda_list, processed_data_dir):
    gen_loss_thres = 100.0
    np.random.seed(random_seed)
    num_epochs = 5000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 44000
    num_mcep = MCEPs_dim
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = lambda_list[0]
    lambda_identity = lambda_list[1]

    Speaker_A_features = os.path.join(processed_data_dir, 'wav_A.npz')
    Speaker_B_features = os.path.join(processed_data_dir, 'wav_B.npz')

    start_time = time.time()
    print('Looking for preprocessed data in: {}'.format(processed_data_dir))
    if os.path.exists(Speaker_A_features) and os.path.exists(Speaker_B_features):
        print('#### loading processed data #######')
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = load_speaker_features(
            Speaker_A_features)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = load_speaker_features(
            Speaker_B_features)
    else:
        print('Preprocessing Data...')
        if not os.path.exists(processed_data_dir):
            os.makedirs(processed_data_dir)
        wavs_A = load_wavs(wav_dir=train_A_dir, sr=sampling_rate)
        f0s_A, timeaxes_A, sps_A, aps_A, coded_sps_A = world_encode_data(
            wavs=wavs_A, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)
        np.savez(Speaker_A_features, f0s=f0s_A, timeaxes=timeaxes_A, sps=sps_A,
                 aps=aps_A, coded_sps=coded_sps_A)
        del wavs_A
        wavs_B = load_wavs(wav_dir=train_B_dir, sr=sampling_rate)
        f0s_B, timeaxes_B, sps_B, aps_B, coded_sps_B = world_encode_data(
            wavs=wavs_B, fs=sampling_rate, frame_period=frame_period, coded_dim=num_mcep)
        np.savez(Speaker_B_features, f0s=f0s_B, timeaxes=timeaxes_B, sps=sps_B,
                 aps=aps_B, coded_sps=coded_sps_B)
        del wavs_B
        print('Data preprocessing finished!')
        # Only caches features on this run; call train() again to train from
        # the cached features.
        return

    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)
    print('Log Pitch A')
    print('Mean: %f, Std: %f' % (log_f0s_mean_A, log_f0s_std_A))
    print('Log Pitch B')
    print('Mean: %f, Std: %f' % (log_f0s_mean_B, log_f0s_std_B))

    # Drop samples whose f0 deviates radically from the speaker's statistics,
    # then recompute the statistics.
    coded_sps_A, f0s_A = remove_radical_pitch_samples(f0s_A, coded_sps_A,
                                                      log_f0s_mean_A, log_f0s_std_A)
    coded_sps_B, f0s_B = remove_radical_pitch_samples(f0s_B, coded_sps_B,
                                                      log_f0s_mean_B, log_f0s_std_B)
    print('Recalculating mean and std of f0s after removing radical pitch samples')
    log_f0s_mean_A, log_f0s_std_A = logf0_statistics(f0s_A)
    log_f0s_mean_B, log_f0s_std_B = logf0_statistics(f0s_B)

    coded_sps_A_transposed = transpose_in_list(lst=coded_sps_A)
    coded_sps_B_transposed = transpose_in_list(lst=coded_sps_B)
    print("Input data fixed.")
    coded_sps_A_norm, coded_sps_A_mean, coded_sps_A_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_A_transposed)
    coded_sps_B_norm, coded_sps_B_mean, coded_sps_B_std = coded_sps_normalization_fit_transoform(
        coded_sps=coded_sps_B_transposed)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    np.savez(os.path.join(model_dir, 'logf0s_normalization.npz'),
             mean_A=log_f0s_mean_A, std_A=log_f0s_std_A,
             mean_B=log_f0s_mean_B, std_B=log_f0s_std_B)
    np.savez(os.path.join(model_dir, 'mcep_normalization.npz'),
             mean_A=coded_sps_A_mean, std_A=coded_sps_A_std,
             mean_B=coded_sps_B_mean, std_B=coded_sps_B_std)

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    end_time = time.time()
    time_elapsed = end_time - start_time
    print('Preprocessing Done.')
    print('Time Elapsed for Data Preprocessing: %02d:%02d:%02d' %
          (time_elapsed // 3600, (time_elapsed % 3600 // 60), (time_elapsed % 60 // 1)))
    # ---------------------------------------------- Data preprocessing ---------------------------------------------- #

    # Model definition
    model = CycleGAN(num_features=num_mcep, log_dir=tensorboard_log_dir,
                     model_name=model_name, gen_model=gen_model)
    # Load model
    if os.path.exists(os.path.join(model_dir, (model_name + ".index"))):
        model.load(filepath=os.path.join(model_dir, model_name))

    # =================================================== Training =================================================== #
    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)
        start_time_epoch = time.time()

        dataset_A, dataset_B = sample_train_data(dataset_A=coded_sps_A_norm,
                                                 dataset_B=coded_sps_B_norm,
                                                 n_frames=n_frames)
        n_samples = dataset_A.shape[0]

        # -------------------------------------------- one epoch learning -------------------------------------------- #
        for i in tqdm.tqdm(range(n_samples // mini_batch_size)):
            num_iterations = n_samples // mini_batch_size * epoch + i
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss, generator_loss_A2B = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            # issue #4,
            # model.summary()

            # Minimum A2B loss model save
            # if gen_loss_thres > generator_loss_A2B:
            #     gen_loss_thres = generator_loss_A2B
            #     best_model_name = 'Bestmodel' + model_name
            #     model.save(directory=model_dir, filename=best_model_name)
            # print("generator loss / generator A2B loss ", generator_loss, generator_loss_A2B)

            if i % 50 == 0:
                print('Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                      .format(num_iterations, generator_learning_rate,
                              discriminator_learning_rate, generator_loss,
                              discriminator_loss))

        # Last model save
        if epoch % 10 == 0:
            model.save(directory=model_dir, filename=model_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
        # -------------------------------------------- one epoch learning -------------------------------------------- #

        # ------------------------------------------- validation inference ------------------------------------------- #
        if validation_A_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data B from A...')
                for file in os.listdir(validation_A_dir):
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_mean_A, std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B, std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                         direction='A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir, os.path.basename(file)),
                        wav_transformed, sampling_rate)
                    # break

        if validation_B_dir is not None:
            # if epoch % 50 == 0:
            if epoch % 10 == 0:
                print('Generating Validation Data A from B...')
                for file in os.listdir(validation_B_dir):
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_mean_B, std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A, std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                         direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir, os.path.basename(file)),
                        wav_transformed, sampling_rate)
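# Usage sketch (hypothetical paths and values): lambda_list carries
# [lambda_cycle, lambda_identity]. Note that on a first run the function only
# caches WORLD features into processed_data_dir and returns, so it has to be
# invoked a second time to actually train.
#
#   train(train_A_dir='data/A', train_B_dir='data/B', model_dir='model',
#         model_name='cyclegan_vc', random_seed=0,
#         validation_A_dir='data/val_A', validation_B_dir='data/val_B',
#         output_dir='validation_output', tensorboard_log_dir='log',
#         gen_model='CNN', MCEPs_dim=32, lambda_list=[10, 5],
#         processed_data_dir='processed_data')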
def train(train_A_dir, train_B_dir, training_data_dir, model_dir, model_name,
          random_seed, validation_A_dir, validation_B_dir, output_dir):
    np.random.seed(random_seed)
    num_epochs = 2000
    mini_batch_size = 1
    generator_learning_rate = 0.0002
    generator_learning_rate_decay = generator_learning_rate / 200000
    discriminator_learning_rate = 0.0001
    discriminator_learning_rate_decay = discriminator_learning_rate / 200000
    sampling_rate = 16000
    num_mcep = 24
    frame_period = 5.0
    n_frames = 128
    lambda_cycle = 10
    lambda_identity = 5

    # ****************************************************************
    # *************************Loading DATA***************************
    # ****************************************************************
    with open(os.path.join(training_data_dir, 'A_coded_norm.pk'), "rb") as fa:
        coded_sps_A_norm = pickle.load(fa)
    with open(os.path.join(training_data_dir, 'B_coded_norm.pk'), "rb") as fb:
        coded_sps_B_norm = pickle.load(fb)

    mcep_normalization_params = np.load(
        os.path.join(training_data_dir, 'mcep_normalization.npz'))
    coded_sps_A_mean = mcep_normalization_params['mean_A']
    coded_sps_A_std = mcep_normalization_params['std_A']
    coded_sps_B_mean = mcep_normalization_params['mean_B']
    coded_sps_B_std = mcep_normalization_params['std_B']

    logf0s_normalization_params = np.load(
        os.path.join(training_data_dir, 'logf0s_normalization.npz'))
    log_f0s_mean_A = logf0s_normalization_params['mean_A']
    log_f0s_std_A = logf0s_normalization_params['std_A']
    log_f0s_mean_B = logf0s_normalization_params['mean_B']
    log_f0s_std_B = logf0s_normalization_params['std_B']

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    print("****************************************************************")
    print("*************************Start Training*************************")
    print("****************************************************************")
    model = CycleGAN(num_features=num_mcep)
    epoch = 0
    # Load the newest checkpoint, if any, and resume from its epoch.
    if os.path.exists(os.path.join(model_dir, "checkpoint")):
        with open(os.path.join(model_dir, "checkpoint"), "r") as f:
            all_ckpt = f.readlines()
        pretrain_ckpt = all_ckpt[-1].split("\n")[0].split("\"")[1]
        epoch = int(pretrain_ckpt.split("-")[1].split(".")[0])
        if os.path.exists(os.path.join(model_dir, (pretrain_ckpt + ".index"))):
            model.load(filepath=os.path.join(model_dir, pretrain_ckpt))
            print("Loading pretrained model {}".format(pretrain_ckpt))
    else:
        print("Training model from 0 epoch")

    for k in range(epoch + 1, num_epochs):
        print('Epoch: %d' % k)
        start_time_epoch = time.time()

        pool_A, pool_B = list(coded_sps_A_norm), list(coded_sps_B_norm)
        dataset_A, dataset_B = sample_train_data(dataset_A=pool_A,
                                                 dataset_B=pool_B,
                                                 n_frames=n_frames)
        print('dataset_A', np.shape(dataset_A), 'dataset_B', np.shape(dataset_B))
        n_samples = dataset_A.shape[0]

        for i in trange(n_samples // mini_batch_size):
            # Fixed: use the current epoch k, not the stale resume epoch, so
            # the iteration count keeps growing across epochs.
            num_iterations = n_samples // mini_batch_size * k + i
            if num_iterations > 10000:
                lambda_identity = 0
            if num_iterations > 200000:
                generator_learning_rate = max(
                    0, generator_learning_rate - generator_learning_rate_decay)
                discriminator_learning_rate = max(
                    0, discriminator_learning_rate - discriminator_learning_rate_decay)

            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size

            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                lambda_cycle=lambda_cycle,
                lambda_identity=lambda_identity,
                generator_learning_rate=generator_learning_rate,
                discriminator_learning_rate=discriminator_learning_rate)

            if i % (n_samples // 2) == 0:
                print('Iteration: {:07d}, Generator Learning Rate: {:.7f}, Discriminator Learning Rate: {:.7f}, Generator Loss : {:.3f}, Discriminator Loss : {:.3f}'
                      .format(num_iterations, generator_learning_rate,
                              discriminator_learning_rate, generator_loss,
                              discriminator_loss))

        if k == 1 or k % 100 == 0:
            print("Saving Epoch {}".format(k))
            ckpt_name = model_name + "-" + str(k) + ".ckpt"
            model.save(directory=model_dir, filename=ckpt_name)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))

        if validation_A_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data B from A...')
                validation_A_files = os.listdir(validation_A_dir)
                for i in trange(len(validation_A_files)):
                    file = validation_A_files[i]
                    filepath = os.path.join(validation_A_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_mean_A, std_log_src=log_f0s_std_A,
                        mean_log_target=log_f0s_mean_B, std_log_target=log_f0s_std_B)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_A_mean) / coded_sps_A_std
                    coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                         direction='A2B')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_B_std + coded_sps_B_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_A_output_dir, os.path.basename(file)),
                        wav_transformed, sampling_rate)

        if validation_B_dir is not None:
            if k % 300 == 0:
                print('Generating Validation Data A from B...')
                validation_B_files = os.listdir(validation_B_dir)
                for i in trange(len(validation_B_files)):
                    # Fixed: this loop previously indexed os.listdir(validation_A_dir).
                    file = validation_B_files[i]
                    filepath = os.path.join(validation_B_dir, file)
                    wav, _ = librosa.load(filepath, sr=sampling_rate, mono=True)
                    wav = wav_padding(wav=wav, sr=sampling_rate,
                                      frame_period=frame_period, multiple=4)
                    f0, timeaxis, sp, ap = world_decompose(
                        wav=wav, fs=sampling_rate, frame_period=frame_period)
                    f0_converted = pitch_conversion(
                        f0=f0, mean_log_src=log_f0s_mean_B, std_log_src=log_f0s_std_B,
                        mean_log_target=log_f0s_mean_A, std_log_target=log_f0s_std_A)
                    coded_sp = world_encode_spectral_envelop(sp=sp, fs=sampling_rate,
                                                             dim=num_mcep)
                    coded_sp_transposed = coded_sp.T
                    coded_sp_norm = (coded_sp_transposed - coded_sps_B_mean) / coded_sps_B_std
                    coded_sp_converted_norm = model.test(inputs=np.array([coded_sp_norm]),
                                                         direction='B2A')[0]
                    coded_sp_converted = coded_sp_converted_norm * coded_sps_A_std + coded_sps_A_mean
                    coded_sp_converted = coded_sp_converted.T
                    coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
                    decoded_sp_converted = world_decode_spectral_envelop(
                        coded_sp=coded_sp_converted, fs=sampling_rate)
                    wav_transformed = world_speech_synthesis(
                        f0=f0_converted, decoded_sp=decoded_sp_converted,
                        ap=ap, fs=sampling_rate, frame_period=frame_period)
                    librosa.output.write_wav(
                        os.path.join(validation_B_output_dir, os.path.basename(file)),
                        wav_transformed, sampling_rate)
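# Usage sketch (hypothetical paths): expects preprocessing to have already
# written A_coded_norm.pk, B_coded_norm.pk, and the two .npz statistics files
# into training_data_dir; training resumes from the newest checkpoint if one
# is listed in model_dir/checkpoint.
#
#   train(train_A_dir='data/A', train_B_dir='data/B',
#         training_data_dir='processed_data', model_dir='model',
#         model_name='cyclegan_vc', random_seed=0,
#         validation_A_dir='data/val_A', validation_B_dir='data/val_B',
#         output_dir='validation_output')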
def train(img_A_dir, img_B_dir, model_dir, model_name, random_seed,
          batch_size_maximum, validation_A_dir, validation_B_dir, output_dir,
          lambda_cycle, loss_function, tensorboard_log_dir):
    np.random.seed(random_seed)
    num_epochs = argv.epochs
    mini_batch_size = 1  # mini_batch_size = 1 is better
    learning_rate = 0.0002
    input_size = [argv.fine_size_h, argv.fine_size_w, 3]
    # num_filters = 64
    # Tried num_filters = 8, still not good for 200 epochs
    num_filters = argv.filter_number

    if validation_A_dir is not None:
        validation_A_output_dir = os.path.join(output_dir, 'converted_A')
        if not os.path.exists(validation_A_output_dir):
            os.makedirs(validation_A_output_dir)
    if validation_B_dir is not None:
        validation_B_output_dir = os.path.join(output_dir, 'converted_B')
        if not os.path.exists(validation_B_output_dir):
            os.makedirs(validation_B_output_dir)

    model = CycleGAN(input_size=input_size, num_filters=num_filters, mode='train',
                     lambda_cycle=lambda_cycle, loss_function=loss_function,
                     log_dir=tensorboard_log_dir)

    dataset_A_raw = load_data(img_dir=img_A_dir, load_size_w=argv.load_size_w,
                              load_size_h=argv.load_size_h)
    dataset_B_raw = load_data(img_dir=img_B_dir, load_size_w=argv.load_size_w,
                              load_size_h=argv.load_size_h)

    if argv.checkpoint is not None:
        print('loading model from checkpoint')
        model.load(argv.checkpoint)

    for epoch in range(num_epochs):
        print('Epoch: %d' % epoch)
        start_time_epoch = time.time()

        # dataset_A, dataset_B = sample_train_data(dataset_A_raw, dataset_B_raw, load_size=286, output_size=256, batch_size_maximum=batch_size_maximum)
        dataset_A, dataset_B = sample_train_data(
            dataset_A_raw, dataset_B_raw,
            load_size_w=argv.load_size_w, load_size_h=argv.load_size_h,
            output_size_w=argv.fine_size_w, output_size_h=argv.fine_size_h,
            batch_size_maximum=batch_size_maximum)

        n_samples = dataset_A.shape[0]
        for i in range(n_samples // mini_batch_size):
            start = i * mini_batch_size
            end = (i + 1) * mini_batch_size
            generator_loss, discriminator_loss = model.train(
                input_A=dataset_A[start:end],
                input_B=dataset_B[start:end],
                learning_rate=learning_rate)
            if i % 50 == 0:
                print('Minibatch: %d, Generator Loss : %f, Discriminator Loss : %f' %
                      (i, generator_loss, discriminator_loss))

        # model.save(directory=model_dir, filename=model_name)
        model.save(directory=model_dir, filename=model_name + '_' + str(epoch))

        if validation_A_dir is not None:
            final_output_dir = os.path.join(validation_A_output_dir, str(epoch))
            if not os.path.exists(final_output_dir):
                os.makedirs(final_output_dir)
            for file in os.listdir(validation_A_dir):
                filepath = os.path.join(validation_A_dir, file)
                img = cv2.imread(filepath)
                img_height, img_width, img_channel = img.shape
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]), direction='A2B')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted, (img_width, img_height))
                cv2.imwrite(os.path.join(final_output_dir, os.path.basename(file)),
                            img_converted)

        if validation_B_dir is not None:
            final_output_dir = os.path.join(validation_B_output_dir, str(epoch))
            if not os.path.exists(final_output_dir):
                os.makedirs(final_output_dir)
            for file in os.listdir(validation_B_dir):
                filepath = os.path.join(validation_B_dir, file)
                img = cv2.imread(filepath)
                img_height, img_width, img_channel = img.shape
                img = cv2.resize(img, (input_size[1], input_size[0]))
                img = image_scaling(imgs=img)
                img_converted = model.test(inputs=np.array([img]), direction='B2A')[0]
                img_converted = image_scaling_inverse(imgs=img_converted)
                img_converted = cv2.resize(img_converted, (img_width, img_height))
                cv2.imwrite(os.path.join(final_output_dir, os.path.basename(file)),
                            img_converted)

        end_time_epoch = time.time()
        time_elapsed_epoch = end_time_epoch - start_time_epoch
        print('Time Elapsed for This Epoch: %02d:%02d:%02d' %
              (time_elapsed_epoch // 3600, (time_elapsed_epoch % 3600 // 60),
               (time_elapsed_epoch % 60 // 1)))
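# Usage sketch (hypothetical values; a module-level argv with epochs,
# load/fine sizes, filter_number, and checkpoint fields is assumed to have
# been parsed before this function runs):
#
#   train(img_A_dir='data/trainA', img_B_dir='data/trainB', model_dir='model',
#         model_name='image_cyclegan', random_seed=0, batch_size_maximum=10,
#         validation_A_dir='data/testA', validation_B_dir='data/testB',
#         output_dir='validation_output', lambda_cycle=10,
#         loss_function='l1', tensorboard_log_dir='log')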
class Converter():

    def __init__(self, model_dir, model_name):
        self.num_features = 24
        self.sampling_rate = 16000
        self.frame_period = 5.0

        self.model = CycleGAN(num_features=self.num_features, mode='test')
        self.model.load(filepath=os.path.join(model_dir, model_name))

        # NB: Save the graph
        definition = self.model.sess.graph_def
        directory = 'saved_model_2'
        tf.train.write_graph(definition, directory, 'saved_model_2.pb', as_text=True)

        # https://github.com/tensorflow/models/issues/3530#issuecomment-395968881
        output_dir = './saved_model/'
        builder = tf.saved_model.builder.SavedModelBuilder(output_dir)
        builder.add_meta_graph_and_variables(
            self.model.sess,
            [tf.saved_model.tag_constants.SERVING],
            main_op=tf.tables_initializer(),
        )
        builder.save()
        """
        builder.add_meta_graph_and_variables(
            self.model.sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict_images': prediction_signature,
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: classification_signature,
            },
            main_op=tf.tables_initializer())
        """

        self.mcep_normalization_params = np.load(
            os.path.join(model_dir, 'mcep_normalization.npz'))
        self.mcep_mean_A = self.mcep_normalization_params['mean_A']
        self.mcep_std_A = self.mcep_normalization_params['std_A']
        self.mcep_mean_B = self.mcep_normalization_params['mean_B']
        self.mcep_std_B = self.mcep_normalization_params['std_B']

        self.logf0s_normalization_params = np.load(
            os.path.join(model_dir, 'logf0s_normalization.npz'))
        self.logf0s_mean_A = self.logf0s_normalization_params['mean_A']
        self.logf0s_std_A = self.logf0s_normalization_params['std_A']
        self.logf0s_mean_B = self.logf0s_normalization_params['mean_B']
        self.logf0s_std_B = self.logf0s_normalization_params['std_B']

    def convert(self, wav, conversion_direction='A2B'):
        wav = wav_padding(wav=wav, sr=self.sampling_rate,
                          frame_period=self.frame_period, multiple=4)
        f0, timeaxis, sp, ap = world_decompose(wav=wav, fs=self.sampling_rate,
                                               frame_period=self.frame_period)
        coded_sp = world_encode_spectral_envelop(sp=sp, fs=self.sampling_rate,
                                                 dim=self.num_features)
        coded_sp_transposed = coded_sp.T

        if conversion_direction == 'A2B':
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_A,
                                            std_log_src=self.logf0s_std_A,
                                            mean_log_target=self.logf0s_mean_B,
                                            std_log_target=self.logf0s_std_B)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_A) / self.mcep_std_A
            coded_sp_converted_norm = self.model.test(inputs=np.array([coded_sp_norm]),
                                                      direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_B + self.mcep_mean_B
        else:
            f0_converted = pitch_conversion(f0=f0,
                                            mean_log_src=self.logf0s_mean_B,
                                            std_log_src=self.logf0s_std_B,
                                            mean_log_target=self.logf0s_mean_A,
                                            std_log_target=self.logf0s_std_A)
            coded_sp_norm = (coded_sp_transposed - self.mcep_mean_B) / self.mcep_std_B
            coded_sp_converted_norm = self.model.test(inputs=np.array([coded_sp_norm]),
                                                      direction=conversion_direction)[0]
            coded_sp_converted = coded_sp_converted_norm * self.mcep_std_A + self.mcep_mean_A

        coded_sp_converted = coded_sp_converted.T
        coded_sp_converted = np.ascontiguousarray(coded_sp_converted)
        decoded_sp_converted = world_decode_spectral_envelop(coded_sp=coded_sp_converted,
                                                             fs=self.sampling_rate)
        wav_transformed = world_speech_synthesis(f0=f0_converted,
                                                 decoded_sp=decoded_sp_converted,
                                                 ap=ap, fs=self.sampling_rate,
                                                 frame_period=self.frame_period)
        # For debugging model output, uncomment the following line:
        # librosa.output.write_wav('model_output.wav', wav_transformed, self.sampling_rate)

        # TODO: Perhaps ditch this. It's probably unnecessary work.
        upsampled = librosa.resample(wav_transformed, self.sampling_rate, 48000)
        pcm_data = upsampled.astype(np.float64)
        stereo_pcm_data = np.tile(pcm_data, (2, 1)).T

        buf = io.BytesIO()
        scipy.io.wavfile.write(buf, 48000, stereo_pcm_data.astype(np.float32))
        return buf
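# Usage sketch (hypothetical paths): constructing this class also writes the
# graph as a text proto to ./saved_model_2 and a TF SavedModel to
# ./saved_model as a side effect of __init__.
#
#   converter = Converter(model_dir='model/sf1_tm1', model_name='sf1_tm1.ckpt')
#   wav, _ = librosa.load('sample.wav', sr=16000, mono=True)
#   buf = converter.convert(wav, conversion_direction='A2B')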