def __init__(self, config):

    self.config = config
    self.filenames = librosa.util.find_files(config['data']['sample_set_dir'])
    self.set = {}

    for file in self.filenames:

        filename = os.path.basename(file)
        if filename[-1] == '/':
            filename = filename[0:-1]

        audio, labels = pp.get_samples_and_labels(filename, config)

        if config['data']['type'] == 'mel':
            # Mel features: store the mel-spectrogram (time-major) together
            # with the linear spectrogram and the labels.
            spec = torch.Tensor(audio_utils.wav2spectrogram(audio)).t()
            melspec = torch.Tensor(audio_utils.wav2melspectrogram(audio)).t()
            self.set[filename] = [melspec, labels, spec]
        else:
            # WORLD features: fundamental frequency (f0), aperiodicity (ap),
            # spectral envelope (sp) and mel-cepstral coefficients (coded_sp).
            audio = np.array(audio, dtype=np.float64)
            f0, ap, sp, coded_sp = pw.cal_mcep(audio)
            coded_sp = torch.Tensor(coded_sp.T)
            self.set[filename] = [f0, ap, sp, coded_sp, labels]
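# Minimal usage sketch (not part of the original code): it assumes the class
# above is named something like SampleSet and that the config dict mirrors the
# YAML config used elsewhere in this repo; the directory path is a placeholder.
#
#   config = {'data': {'sample_set_dir': './processed_data/sample_set',
#                      'type': 'world'}}
#   dataset = SampleSet(config)
#   first_file = next(iter(dataset.set))
#   f0, ap, sp, coded_sp, labels = dataset.set[first_file]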
def _single_conversion(filename, model, one_hot_emo):
    '''
    THIS WON'T WORK RIGHT NOW, USE THE WORLD CONVERSION LOOP IN MAIN.
    Call only from the __main__ section in this module.
    Generates a sample converted into each emotion.

    (str) filename                   - name of the .wav file to be converted
    (StarGAN-emo-VC1) model          - pretrained model used to perform the conversion
    (torch.Tensor(long)) one_hot_emo - one-hot encoding of the emotion to convert to
    '''
    wav, labels = pp.get_wav_and_labels(filename, config['data']['dataset_dir'])
    wav = np.array(wav, dtype=np.double)

    f0, ap, sp, coded_sp = preprocess_world.cal_mcep(wav)
    coded_sp = coded_sp.T
    coded_sp_torch = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(device=device)

    # Run the generator conditioned on the target emotion.
    fake = model.G(coded_sp_torch, one_hot_emo.unsqueeze(0))
    fake = fake.squeeze()
    print("Sampled size = ", fake.size())

    converted_sp = fake.cpu().detach().numpy()
    converted_sp = np.array(converted_sp, dtype=np.float64)

    # Trim the aperiodicity and f0 to match the converted envelope, then drop
    # the noisy frames at each end before resynthesis.
    sample_length = converted_sp.shape[0]
    if sample_length != ap.shape[0]:
        ap = np.ascontiguousarray(ap[0:sample_length, :], dtype=np.float64)
        f0 = np.ascontiguousarray(f0[0:sample_length], dtype=np.float64)

    f0 = np.ascontiguousarray(f0[20:-20], dtype=np.float64)
    ap = np.ascontiguousarray(ap[20:-20, :], dtype=np.float64)
    converted_sp = np.ascontiguousarray(converted_sp[40:-40, :], dtype=np.float64)
    coded_sp = np.ascontiguousarray(coded_sp[20:-20, :], dtype=np.float64)

    target = np.argmax(one_hot_emo)
    out_name = filename[:-4] + str(labels[1]) + "to" + str(target) + ".wav"

    audio_utils.save_world_wav([f0, ap, sp, converted_sp],
                               model.name + '_converted', out_name)
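# Hypothetical call from the __main__ section, as the docstring suggests
# (sketch only): 'example.wav', num_emotions, emotion_index and the loaded
# 'model' are placeholders, and config/device are assumed to be defined at
# module level.
#
#   one_hot = torch.zeros(num_emotions)
#   one_hot[emotion_index] = 1
#   _single_conversion('example.wav', model, one_hot)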
    # coded_sp_temp = np.copy(coded_sp).T
    # print(coded_sp_temp.shape)
    # filename_wav = f[0:-4] + "_" + str(int(labels[0].item())) + ".wav"
    # print(coded_sp.shape)
    # it = str(args.iteration)[0:3]
    # audio_utils.save_world_wav([f0, ap, sp, coded_sp], args.out_dir + '_evalSet', filename_wav)

    ########################################
    #        WORLD CONVERSION LOOP         #
    ########################################

    for file_num, f in enumerate(filenames):

        wav, labels = pp.get_wav_and_labels(f, config['data']['dataset_dir'])
        wav = np.array(wav, dtype=np.float64)
        labels = np.array(labels)
        f0_real, ap_real, sp, coded_sp = preprocess_world.cal_mcep(wav)

        # coded_sp_temp = np.copy(coded_sp).T
        # print(coded_sp_temp.shape)
        coded_sp = coded_sp.T
        coded_sp = torch.Tensor(coded_sp).unsqueeze(0).unsqueeze(0).to(device=device)

        with torch.no_grad():
            # print(emo_targets)
            for i in range(0, emo_targets.size(0)):
                # Convert the source utterance into each target emotion in
                # turn, working on fresh copies of the WORLD features.
                f0 = np.copy(f0_real)
                ap = np.copy(ap_real)
                # coded_sp_temp_copy = np.copy(coded_sp_temp)
                # coded_sp = np.copy(coded_sp)

                f0 = audio_utils.f0_pitch_conversion(f0, (labels[0], labels[1]),