def _process_utterance(out_dir, in_dir, source_wav_name, target_wav_name, emotion_id):
    '''Preprocesses a single source/target utterance pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      in_dir: The directory containing the source and target audio files
      source_wav_name: Filename of the source audio file
      target_wav_name: Filename of the target audio file
      emotion_id: Numeric id of the target emotion

    Returns:
      An (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
      s_n_frames, t_n_frames) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    source_wav = audio.load_wav(os.path.join(in_dir, source_wav_name))
    target_wav = audio.load_wav(os.path.join(in_dir, target_wav_name))

    if hparams.rescaling:
        source_wav = source_wav / np.abs(source_wav).max() * hparams.rescaling_max
        target_wav = target_wav / np.abs(target_wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    #s_spectrogram = audio.spectrogram(source_wav).astype(np.float32)
    t_spectrogram = audio.spectrogram(target_wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    smel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    tmel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)
    s_n_frames = smel_spectrogram.shape[1]
    t_n_frames = tmel_spectrogram.shape[1]

    # Write the spectrograms to disk:
    #s_spectrogram_filename = 'source-spec-{}.npy'.format(source_wav_name)
    t_spectrogram_filename = 'target-spec-{}.npy'.format(target_wav_name.replace('.wav', ''))
    smel_filename = 'source-mel-{}.npy'.format(source_wav_name.replace('.wav', ''))
    tmel_filename = 'target-mel-{}.npy'.format(target_wav_name.replace('.wav', ''))
    #np.save(os.path.join(out_dir, s_spectrogram_filename), s_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, t_spectrogram_filename), t_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, smel_filename), smel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, tmel_filename), tmel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (emotion_id, t_spectrogram_filename, smel_filename, tmel_filename,
            s_n_frames, t_n_frames)

def load_one_person_m_utterances(speaker_dir: Path, utterances_num, extension="wav"):
    """
    Return utterances_num audio clips from one speaker; each clip has already been sliced.
    Return format:
    [
        [bsz1, n_frames, n_mels],
        [bsz2, n_frames, n_mels],
        ...
    ]
    """
    wav_path_cl = [
        wav_path for wav_path in speaker_dir.glob("*.{}".format(extension))
    ]
    wav_path_cl_samples = random.sample(
        wav_path_cl, utterances_num)  # randomly sample utterances_num audio files
    utterances_mels_cl = []
    for fp in wav_path_cl_samples:
        wav, sr = audio.load_wav(str(fp))
        bsz_n_frames_mel = slice_utterance_mel(
            wav)  # shape = [bsz, n_frames, n_mels]; bsz is not a fixed value
        utterances_mels_cl.append(bsz_n_frames_mel)
    return utterances_mels_cl

def load_multi_person_one_utterances(read_dir: Path, n_speaker, extension="wav"):
    """
    Pick n speakers and randomly take one utterance from each of them.
    :param read_dir: directory containing the speaker folders
    :param n_speaker: number of speakers
    :return:
    [
        [bsz1, n_frames, n_mels],
        [bsz2, n_frames, n_mels],
        ...
    ]
    """
    speaker_dirs = [speak_dir for speak_dir in read_dir.glob("*")]
    n_speaker_dirs = random.sample(speaker_dirs, n_speaker)
    n_speaker_utterance_mel_cl = []
    for speaker_dir in n_speaker_dirs:
        utterancs_wavs_path = [
            str(wav_path) for wav_path in Path(str(speaker_dir)).glob(
                "*.{}".format(extension))
        ]
        wav_path = random.sample(utterancs_wavs_path, 1)[0]  # randomly sample one audio file
        wav, sr = audio.load_wav(wav_path)
        bsz_n_frames_mel = slice_utterance_mel(
            wav)  # shape = [bsz, n_frames, n_mels]; bsz is not a fixed value
        n_speaker_utterance_mel_cl.append(bsz_n_frames_mel)
    return n_speaker_utterance_mel_cl

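# Usage sketch (illustrative only): the two loaders above can be combined into a
# GE2E-style batch of n speakers x m utterances. `slice_utterance_mel`, `audio`, and the
# directory layout are assumed from this module; the batching helper below is a
# hypothetical example, not the project's training code.
def build_speakers_batch_example(dataset_dir: Path, n_speaker=4, utterances_num=5):
    speaker_dirs = random.sample(
        [d for d in dataset_dir.glob("*") if d.is_dir()], n_speaker)
    batch = []
    for speaker_dir in speaker_dirs:
        # each entry is a list of [bsz, n_frames, n_mels] arrays for one speaker
        batch.append(load_one_person_m_utterances(speaker_dir, utterances_num))
    return batch
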
def gen_samples(out_dir, wav_path, n_samples):
    wav = audio.load_wav(wav_path)
    hop_size = hparams.hop_length
    seg_len = hparams.seg_len
    spec_len = hparams.spec_len

    # not sure why we have to minus 1 here ?
    wav_len = wav.shape[0] // hop_size * hop_size - 1
    wav = wav[:wav_len]
    spec = audio.spectrogram(wav)
    mel = audio.melspectrogram(wav)

    max_val = spec.shape[1] - 1 - spec_len
    if max_val < 0:
        return []
    idx = np.random.randint(0, max_val, size=(n_samples))

    d = []
    i = 0
    for offset in idx:
        i += 1
        w = wav[offset * hop_size:offset * hop_size + seg_len]
        s = spec[:, offset:offset + spec_len]
        m = mel[:, offset:offset + spec_len]
        wav_name = wav_path.split('/')[-1].split('.')[0]
        file_path = "{0}/{1}_{2:03d}.npz".format(out_dir, wav_name, i)
        np.savez(file_path, wav=w, spec=s, mel=m)
        d.append(file_path)
    return d

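# Usage sketch (illustrative, assuming a flat directory of .wav files and the same
# hparams used by gen_samples above): collect the .npz sample paths for a whole corpus.
# The glob pattern and the samples.txt manifest name are assumptions, not this project's.
def gen_samples_for_corpus_example(wav_dir, out_dir, n_samples_per_wav=10):
    import glob
    all_paths = []
    for wav_path in sorted(glob.glob("{}/*.wav".format(wav_dir))):
        all_paths.extend(gen_samples(out_dir, wav_path, n_samples_per_wav))
    with open("{}/samples.txt".format(out_dir), "w") as f:
        f.write("\n".join(all_paths))
    return all_paths
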
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'vctk-spec-%05d.npy' % index
    mel_filename = 'vctk-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return spectrogram_filename, mel_filename, n_frames, text, speaker_id

def gen_data(audio_path, full_frames):
    wav = audio.load_wav(audio_path, 16000)
    mel = audio.melspectrogram(wav)
    print(mel.shape)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        raise ValueError(
            'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again'
        )

    mel_chunks = []
    mel_idx_multiplier = 80. / fps
    i = 0
    while 1:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > len(mel[0]):
            mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
            break
        mel_chunks.append(mel[:, start_idx:start_idx + mel_step_size])
        i += 1
    print("Length of mel chunks: {}".format(len(mel_chunks)))

    full_frames = full_frames[:len(mel_chunks)]
    gen = datagen(full_frames.copy(), mel_chunks)
    return gen

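# Worked example of the chunking arithmetic above (illustrative values only): with the
# usual 80 mel frames per second, and assuming fps = 25 and mel_step_size = 16, we get
# mel_idx_multiplier = 80 / 25 = 3.2. Video frame 0 then reads mel[:, 0:16], frame 10
# reads mel[:, int(10 * 3.2):int(10 * 3.2) + 16] = mel[:, 32:48], and the last chunk is
# taken from the tail of the spectrogram once start_idx + mel_step_size would overrun it.
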
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)

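# Reference sketch of the mu-law transforms assumed above. P.mulaw / P.mulaw_quantize come
# from the preprocessing library used by this codebase; the helpers below are an
# illustrative numpy re-implementation of standard mu-law companding (quantize_channels
# of 256 corresponding to mu = 255 here), not the library calls themselves.
def mulaw_example(x, mu=255):
    # companding: float in [-1, 1] -> float in [-1, 1]
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize_example(x, mu=255):
    # companded float -> integer class in [0, mu], i.e. mu + 1 = 256 levels
    y = mulaw_example(x, mu)
    return np.clip(((y + 1) / 2 * mu + 0.5).astype(np.int64), 0, mu)
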
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nikl-single-spec-%05d.npy' % index
    mel_filename = 'nikl-single-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

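# Usage sketch: the _process_utterance variants in this file are typically driven by a
# build_from_path-style loop that writes one train.txt row per utterance. This is a
# minimal, hypothetical driver for the variant directly above, assuming a pipe-delimited
# metadata.csv of "<wav_name>|<text>" rows; the file names and layout are assumptions,
# not necessarily this project's.
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path_example(in_dir, out_dir, num_workers=4):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, text = line.strip().split('|')[:2]
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
    metadata = [future.result() for future in futures]
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for row in metadata:
            f.write('|'.join(str(x) for x in row) + '\n')
    return metadata
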
def extract_MFCC_and_text(wav_file_path, mfcc_dir):
    wav_filenames = glob.glob(wav_file_path)
    for wav_fname in wav_filenames:
        text_filename = wav_fname.replace(".WAV.wav", ".TXT")
        fullname = wav_fname.split('/')[-1]
        fname = fullname.split('.')[0]

        # Process the text: remove the first two numbers from the text file
        with open(text_filename, 'r') as file:
            sentence = file.read()
        sentence = sentence.split()[2:] + ['\n']
        sentence = ' '.join(sentence).lower()

        # Write the processed text to the mfcc directory
        text_fname = mfcc_dir + '/' + fname + '.txt'
        with open(text_fname, "w") as file:
            file.write(sentence)

        # Generate the MFCC features
        wav = audio.load_wav(wav_fname)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        mspec_fname = mfcc_dir + '/' + fname
        np.save(mspec_fname, mel_spectrogram, allow_pickle=False)  # generates features of shape: L x 80
    return

def _process_utterance(out_dir, wav_path):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    return mel_spectrogram.astype(np.float32)

def main():
    # This part handles datasets in LJSpeech format.
    a = open(meta_path, 'r').readlines()
    b = []
    i = 0
    while i < len(a):
        t = a[i][0:6]
        b.append(t)
        i += 2
    print(b[:2])
    a = b
    # a = [i.strip().split('|')[0] for i in a]

    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # Extract acoustic features
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)

            # Sanity-check that the acoustic features were extracted correctly
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # This load can occasionally fail; the cause is unclear, possibly a transient
            # change on the server.
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))
            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name), normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name), normalized_db_spec2wav(spec_feats))

            # Save the acoustic features
            mfcc_save_name = os.path.join(mfcc_dir, save_name)
            mel_save_name = os.path.join(mel_dir, save_name)
            spec_save_name = os.path.join(spec_dir, save_name)
            np.save(mfcc_save_name, mfcc_feats)
            np.save(mel_save_name, mel_feats)
            np.save(spec_save_name, spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except:
            bad_list.append(fname)
            bad_cnt += 1
        # print(cnt)
        # break

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)
    return

def main():
    # This part handles datasets in LibriSpeech format.
    for first_dir in server_common_data_list:
        for second_dir in os.listdir(os.path.join(wav_dir, first_dir)):
            second_wav_dir = os.path.join(os.path.join(wav_dir, first_dir), second_dir)
            wav_files = [os.path.join(second_wav_dir, f) for f in os.listdir(second_wav_dir)
                         if f.endswith('.wav')]
            cnt = 0
            for wav_f in wav_files:
                try:
                    wav_arr = load_wav(wav_f, sr=hparams['sample_rate'])
                    mfcc_feats = wav2mfcc_v2(wav_arr, sr=hparams['sample_rate'],
                                             n_mfcc=hparams['n_mfcc'], n_fft=hparams['n_fft'],
                                             hop_len=hparams['hop_length'], win_len=hparams['win_length'],
                                             window=hparams['window'], num_mels=hparams['num_mels'],
                                             center=hparams['center'])
                    save_name = wav_f.split('/')[-1].split('.')[0] + '.npy'
                    save_name = os.path.join(mfcc_dir, save_name)
                    np.save(save_name, mfcc_feats)
                    cnt += 1
                    print(cnt)
                except:
                    print(wav_f)
                # break
            # break
        # break
    # After extraction, the contents of the 3 folders need to be manually moved (mv) into
    # a single folder with the same 2338-folder layout as the PPGs.
    return

def process(info_dict):
    wav_path = os.path.join(hp.data_path, "Wave")
    wav_file_name = os.path.join(wav_path, info_dict["sentence_id"] + ".wav")
    wav = audio.load_wav(wav_file_name)
    mel = audio.melspectrogram(wav).T
    mel_file_path = os.path.join(hp.mel_path, info_dict["sentence_id"] + ".npy")
    np.save(mel_file_path, mel)

    phone_idx = info_dict["sentence_id"] + "|"
    for phone_duration in info_dict["alignment"]:
        phone_idx += str(phone_map[phone_duration[0]]) + " "

    duration_idx = info_dict["sentence_id"] + "|"
    length_mel = mel.shape[0]
    length_phone_list = len(info_dict["alignment"])
    cur_pointer = 0
    for frame_id in range(length_mel):
        added = False
        cur_time = hp.frame_length_ms / 2 + frame_id * hp.frame_shift_ms
        cur_time = cur_time / 1000.0
        for i in range(cur_pointer, length_phone_list):
            if cur_time >= info_dict["alignment"][i][1][0] and cur_time < info_dict["alignment"][i][1][1]:
                phone_id = phone_map[info_dict["alignment"][i][0]]
                duration_idx += str(phone_id) + " "
                cur_pointer = i
                added = True
                break
        if not added:
            phone_id = phone_map[info_dict["alignment"][cur_pointer][0]]
            duration_idx += str(phone_id) + " "

    return phone_idx[:-1], duration_idx[:-1]

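# Worked example of the frame-to-phone mapping above (illustrative numbers only, assuming
# hp.frame_length_ms = 50 and hp.frame_shift_ms = 12.5, which may differ from the real hp):
# frame 0 has its center at (50 / 2 + 0 * 12.5) / 1000 = 0.025 s, frame 10 at
# (25 + 10 * 12.5) / 1000 = 0.150 s. Each frame is labeled with the phone whose
# [start, end) interval contains that center time; frames whose center falls past the
# last aligned phone reuse the most recently matched phone (the `if not added` branch).
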
def __getitem__(self, index):
    # Read audio
    filename = self.audio_files[index]
    wav = deepaudio.load_wav(filename)
    # load in raw_audio via utils
    raw_audio, _ = utils.load_wav_to_torch(filename)
    # convert the numpy wav to a torch tensor
    audio = torch.from_numpy(wav)

    # take segment
    if audio.size(0) >= self.segment_length:
        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start:audio_start + self.segment_length]
        # update raw audio as well
        raw_audio = raw_audio[audio_start:audio_start + self.segment_length]
    else:
        audio = torch.nn.functional.pad(
            audio, (0, self.segment_length - audio.size(0)), 'constant').data
        # pad raw audio as well
        raw_audio = torch.nn.functional.pad(
            raw_audio, (0, self.segment_length - raw_audio.size(0)), 'constant').data

    # compute mel
    mel = deepaudio.melspectrogram(audio.numpy())
    # convert mel to torch
    mel = torch.from_numpy(mel)
    audio = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE, self.mu_quantization)
    return (mel, audio)

def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")
    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")

def __getitem__(self, idx):
    while 1:
        idx = random.randint(0, len(self.all_videos) - 1)
        vidname = self.all_videos[idx]
        img_names = list(glob(join(vidname, '*.jpg')))
        if len(img_names) <= 3 * syncnet_T:
            continue

        img_name = random.choice(img_names)
        wrong_img_name = random.choice(img_names)
        while wrong_img_name == img_name:
            wrong_img_name = random.choice(img_names)

        window_fnames = self.get_window(img_name)
        wrong_window_fnames = self.get_window(wrong_img_name)
        if window_fnames is None or wrong_window_fnames is None:
            continue

        window = self.read_window(window_fnames)
        if window is None:
            continue

        wrong_window = self.read_window(wrong_window_fnames)
        if wrong_window is None:
            continue

        try:
            wavpath = join(vidname, "audio.wav")
            if wavpath not in self.shared_dict:
                wav = audio.load_wav(wavpath, hparams.sample_rate)
                orig_mel = audio.melspectrogram(wav).T
                self.shared_dict[wavpath] = orig_mel
            else:
                orig_mel = self.shared_dict[wavpath]
        except Exception as e:
            continue

        mel = self.crop_audio_window(orig_mel.copy(), img_name)
        if (mel.shape[0] != syncnet_mel_step_size):
            continue

        indiv_mels = self.get_segmented_mels(orig_mel.copy(), img_name)
        if indiv_mels is None:
            continue

        window = self.prepare_window(window)
        y = window.copy()
        window[:, :, window.shape[2]//2:] = 0.

        wrong_window = self.prepare_window(wrong_window)
        x = np.concatenate([window, wrong_window], axis=0)

        x = torch.FloatTensor(x)
        mel = torch.FloatTensor(mel.T).unsqueeze(0)
        indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1)
        y = torch.FloatTensor(y)
        # print(x.shape)
        return x, indiv_mels, mel, y

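# Sketch of the crop_audio_window helper used above (not shown in this file): the usual
# approach maps the starting video frame to a mel index, assuming roughly 80 mel frames
# per second of audio and an fps value in hparams. This is an assumed, illustrative
# implementation, not necessarily the project's exact code.
def crop_audio_window_example(self, spec, start_frame):
    # start_frame is an image path like ".../123.jpg"; recover the frame index
    start_frame_num = int(start_frame.split('/')[-1].split('.')[0])
    start_idx = int(80. * (start_frame_num / float(hparams.fps)))
    end_idx = start_idx + syncnet_mel_step_size
    return spec[start_idx:end_idx, :]
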
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, f0_filename, sp_filename,
      ap_filename, world_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # WORLD parameters
    f0, sp, ap = audio.world(wav, hparams.sample_rate)
    f0 = (f0 / hparams.f0_norm).astype(np.float32)  # normalize
    sp = audio._normalize(sp).astype(np.float32)
    ap = ap.astype(np.float32)  # ap only takes values in [0, 1], so no normalization is needed
    world_frames = f0.shape[0]

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    f0_filename = 'ljspeech-f0-%05d.npy' % index
    sp_filename = 'ljspeech-sp-%05d.npy' % index
    ap_filename = 'ljspeech-ap-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, f0_filename), f0, allow_pickle=False)
    np.save(os.path.join(out_dir, sp_filename), sp, allow_pickle=False)
    np.save(os.path.join(out_dir, ap_filename), ap, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, f0_filename, sp_filename,
            ap_filename, world_frames, text)

def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    try:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    except Exception as e:
        print("Problem with :", wav_path)
        print(e)
        raise  # re-raise: the spectrograms are undefined past this point

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # mdda added START :
    wav_filename = mel_filename.replace('-mel-', '-audio-')
    #wav_samples = hparams.fft_size + (n_frames-1)*hparams.hop_size  # No : 3 extra frames added : Don't bother chomping
    np.save(os.path.join(out_dir, wav_filename), wav.astype(np.float32), allow_pickle=False)

    spectrogramraw_filename = 'ljspeech-specraw-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogramraw_filename), spectrogram_raw(wav).T, allow_pickle=False)
    # mdda added END

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def __getitem__(self, index):
    text = self.metadata.iloc[index]['text']
    wav_filename = self.metadata.iloc[index]['wav']
    audio, _ = load_wav(f'{self.path}/wavs/{wav_filename}.wav')

    if self.text_transforms:
        text = self.text_transforms(text)
    if self.audio_transforms:
        audio = self.audio_transforms(audio)

    return text, audio

def _process_utterance(out_dir, out_path, wav_path, text, stft):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel spectrogram to disk and returns a tuple to write to the
    train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      out_path: Output path whose components determine where the mel file is written
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      stft: STFT helper passed through to audio.melspectrogram

    Returns:
      A (mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999
    #stft = audio.taco_stft()

    # Trim trailing silence from the audio file.
    wav = librosa.effects.trim(wav, top_db=23, frame_length=1024, hop_length=256)[0]

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, stft).numpy().astype(np.float32)

    # Write the spectrograms to disk:
    # spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    parts = out_path.strip().split('/')
    mel_filename = parts[4] + parts[5] + parts[6]
    o_path = os.path.join(parts[0], parts[1], parts[4])
    # print(o_path)
    # mel_filename = 'nam_speech-mel-%05d.npy' % index
    # print(out_path)
    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[5])
    if (not os.path.exists(o_path)):
        os.mkdir(o_path)
    o_path = os.path.join(o_path, parts[6])
    np.save(o_path, mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    # return (spectrogram_filename, mel_filename, n_frames, text)
    return (mel_filename, n_frames, text)

def cache_spectrograms(self):
    wav_filenames = self.metadata['wav']
    spectrograms_path = f'{self.path}/spectrograms'
    if not os.path.exists(spectrograms_path):
        os.makedirs(spectrograms_path)

    print('Building Cache..')
    for name in tqdm(wav_filenames, total=len(wav_filenames)):
        audio, _ = load_wav(f'{self.path}/wavs/{name}.wav')
        S = self.audio_transforms(audio)
        np.save(f'{spectrograms_path}/{name}.npy', S)

def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    spectrogram_filename = 'spec-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, text, wav_path, speaker_id=None):
    # check whether singlespeaker_mode
    if speaker_id is None:
        return _process_utterance_single(out_dir, text, wav_path)
    # modified version of VCTK _process_utterance

    sr = hparams.sample_rate

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]

    # case if wave files across different speakers have the same naming format.
    # e.g. Recording0.wav
    spectrogram_filename = 'spec-{}-{}.npy'.format(speaker_id, wav_name)
    mel_filename = 'mel-{}-{}.npy'.format(speaker_id, wav_name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)

def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel (or WORLD) and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      phone: The phone sequence corresponding to the text

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple to write
      to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.vocoder == "world":
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate, hparams.coded_env_dim)

        world_spec = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = world_spec.shape[0]
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-world-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), world_spec, allow_pickle=False)
    else:
        # Compute the linear-scale spectrogram from the wav:
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        n_frames = spectrogram.shape[1]

        # Compute a mel-scale spectrogram from the wav:
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

        # Write the spectrograms to disk:
        spectrogram_filename = 'synpaflex-spec-%05d.npy' % index
        encoded_filename = 'synpaflex-mel-%05d.npy' % index
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, encoded_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)

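# Inverse sketch for the "world" branch above (illustrative): the coded WORLD features
# can be decoded and resynthesized with pyworld. The fft_size and frame_period values
# below are assumptions; the project may keep them in hparams under different names.
def world_resynthesize_example(world_spec, sample_rate, fft_size=1024, frame_period=5.0):
    # columns of world_spec: [f0 | coded spectral envelope | coded aperiodicity]
    f0 = np.ascontiguousarray(world_spec[:, 0]).astype(np.double)
    sp_coded = np.ascontiguousarray(world_spec[:, 1:1 + hparams.coded_env_dim]).astype(np.double)
    ap_coded = np.ascontiguousarray(world_spec[:, 1 + hparams.coded_env_dim:]).astype(np.double)
    sp = pw.decode_spectral_envelope(sp_coded, sample_rate, fft_size)
    ap = pw.decode_aperiodicity(ap_coded, sample_rate, fft_size)
    return pw.synthesize(f0, sp, ap, sample_rate, frame_period)
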
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype

def __getitem__(self, index):
    text = self.metadata.iloc[index]['text']
    filename = self.metadata.iloc[index]['wav']

    if self.text_transforms:
        text = self.text_transforms(text)

    if self.cache:
        audio = np.load(f'{self.path}/spectrograms/{filename}.npy')
        return text, audio

    audio, _ = load_wav(f'{self.path}/wavs/{filename}.wav')
    if self.audio_transforms:
        audio = self.audio_transforms(audio)

    return text, audio

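# Usage sketch for the cache path above (illustrative): cache_spectrograms() is run once,
# after which __getitem__ serves the precomputed .npy files. The dataset class name
# `SpeechDataset`, its constructor arguments, and the DataLoader settings below are
# hypothetical, not taken from this project.
from torch.utils.data import DataLoader

def collate_example(batch):
    # keep variable-length items as lists; padding would normally happen here
    texts, audios = zip(*batch)
    return list(texts), list(audios)

def make_loader_example(dataset_path):
    dataset = SpeechDataset(dataset_path, cache=True)  # hypothetical constructor
    dataset.cache_spectrograms()
    return DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_example)
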
def main():
    # This part handles datasets in LibriSpeech format.
    for first_dir in os.listdir(wav_dir):
        for second_dir in os.listdir(os.path.join(wav_dir, first_dir)):
            for third_dir in os.listdir(
                    os.path.join(os.path.join(wav_dir, first_dir), second_dir)):
                third_mfcc_dir = os.path.join(
                    os.path.join(os.path.join(mfcc_dir, first_dir), second_dir), third_dir)
                third_mel_dir = os.path.join(
                    os.path.join(os.path.join(mel_dir, first_dir), second_dir), third_dir)
                third_spec_dir = os.path.join(
                    os.path.join(os.path.join(spec_dir, first_dir), second_dir), third_dir)
                third_wav_dir = os.path.join(
                    os.path.join(os.path.join(wav_dir, first_dir), second_dir), third_dir)
                #print('Now in the '+mfcc_dir+' from '+ third_wav_dir)

                if not os.path.exists(third_mfcc_dir):
                    os.makedirs(third_mfcc_dir)

                wav_files = [
                    os.path.join(third_wav_dir, f)
                    for f in os.listdir(third_wav_dir) if f.endswith('.wav')
                ]
                print('Extracting MFCC from {} to {}...'.format(third_wav_dir, third_mfcc_dir))

                cnt = 0
                for wav_f in wav_files:
                    wav_arr = load_wav(wav_f, sr=hparams['sample_rate'])
                    mfcc_feats = wav2unnormalized_mfcc(wav_arr)
                    mel_feats = wav2normalized_db_mel(wav_arr)
                    spec_feats = wav2normalized_db_spec(wav_arr)

                    save_name = wav_f.split('/')[-1].split('.')[0] + '.npy'
                    mfcc_save_name = os.path.join(third_mfcc_dir, save_name)
                    mel_save_name = os.path.join(third_mel_dir, save_name)
                    spec_save_name = os.path.join(third_spec_dir, save_name)
                    np.save(mfcc_save_name, mfcc_feats)
                    np.save(mel_save_name, mel_feats)
                    np.save(spec_save_name, spec_feats)
                    cnt += 1
                    print(cnt)
                    # break
                # break
            # break
        # break
    # After extraction, the contents of the 3 folders need to be manually moved (mv) into
    # a single folder with the same 2338-folder layout as the PPGs.
    return

def process_video_file(vfile, args, split):
    video_stream = cv2.VideoCapture(vfile)

    frames = []
    while 1:
        still_reading, frame = video_stream.read()
        if not still_reading:
            video_stream.release()
            break
        frames.append(frame)

    mid_frames = []
    ss = 0.
    es = (ss + (window_size / 1000.))
    while int(es * fps) <= len(frames):
        mid_second = (ss + es) / 2.
        mid_frames.append(frames[int(mid_second * fps)])

        ss += (video_step_size_in_ms / 1000.)
        es = (ss + (window_size / 1000.))

    dst_subdir = path.join(vfile.split('/')[-2], vfile.split('/')[-1].split('.')[0])
    fulldir = path.join(args.final_data_root, split, dst_subdir)
    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    command = template.format(vfile, sr, wavpath)
    subprocess.call(command, shell=True)

    specpath = path.join(fulldir, 'mels.npz')

    if path.isfile(wavpath):
        wav = audio.load_wav(wavpath, sr)
        spec = audio.melspectrogram(wav)
        np.savez_compressed(specpath, spec=spec)
    else:
        return

    for i, f in enumerate(mid_frames):
        face, valid_frame = face_detect(f)
        if not valid_frame:
            continue

        resized_face = cv2.resize(face, (args.img_size, args.img_size))
        cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), resized_face)

def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)  # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    dir = os.path.dirname(save_wav_path)
    if not os.path.exists(dir):
        os.system('mkdir {} -p'.format(dir))
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)

def testing_performance(orig, synth, rate):
    # Naming wavs
    synth_ts = synth[:-4] + '_' + str(rate) + '.wav'
    librosa_synth_ts_name = synth_ts[:-4] + '_librosa.wav'

    # Load wavs to be compared
    orig = audio.load_wav(orig)
    synth = audio.load_wav(synth)
    synth_ts = audio.load_wav(synth_ts)

    # Compare results to Librosa timestretch
    librosa_synth_ts = librosa.effects.time_stretch(synth, rate)
    audio.save_wav(librosa_synth_ts, librosa_synth_ts_name)

    # Calculate the MSE measure between them
    min_len = min(len(orig), len(synth))
    model_err = ((orig[:min_len] - synth[:min_len]) ** 2).mean()

    min_len = min(len(librosa_synth_ts), len(synth_ts))
    ts_err = ((librosa_synth_ts[:min_len] - synth_ts[:min_len]) ** 2).mean()

    print('model reconstruction error {:.4f}'.format(model_err))
    print('time stretching error {:.4f}'.format(ts_err))

def get_crp(filestr, window_length=None, threshold=None, use_matlab=False):
    """ Extract crp. """
    if not use_matlab:
        return audio.load_chroma(filestr, crp=True, window_length=window_length,
                                 threshold=threshold).T

    data, info = audio.load_wav(filestr)
    commands = ["crp", data, info['fs']]
    if window_length is not None:
        commands.append(window_length)
    if threshold is not None:
        commands.append(threshold)
    return ipc.get_response(commands)

            page.update(x, state[x])
        self.aserver.write_buf(self.root_page.render(force=True))

        # Update LED state.
        # State is stored in column-major order so we have to transpose into
        # row-major order to suit /grid/led/map
        m = ([1 << self.selected_dur, 1 << self.selected_var] +
             [sum(((col >> nrow) & 1) << ncol for ncol, col in enumerate(state))
              for nrow in range(ROWS - 2)])
        self.led_map(0, 0, m)


aserver = audio.AudioServer(BPM)
aserver.start()
instruments = [audio.load_wav('samples/%d.wav' % p) for p in range(1, 8)]

app = Drilldown(aserver, instruments, monome.find_any_monome())
app.start()
app.led_all(0)

try:
    while True:
        time.sleep(5)
except KeyboardInterrupt:
    app.led_all(0)
    app.close()