import os

import numpy as np

from util import audio


def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

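# For context, a minimal sketch of the driver loop that would call the function
# above. It assumes the LJSpeech layout (a metadata.csv of '|'-separated
# 'id|text|normalized_text' rows next to a wavs/ directory); the build_from_path
# name and the sequential indexing are illustrative, not taken from this file.
def build_from_path(in_dir, out_dir):
    metadata = []
    index = 1
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            metadata.append(_process_utterance(out_dir, index, wav_path, text))
            index += 1
    # Write one '|'-separated tuple per line, mirroring the train.txt format
    # used by the preprocess() variant further down:
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
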
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]

    # Skip utterances that are longer than the model can handle:
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    # Compute the linear- and mel-scale spectrograms from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

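# _parse_labels above is not shown in this file. A plausible sketch, assuming
# the label files list one '<start_seconds> <end_seconds> <label>' segment per
# line with 'sil'/'pau' marking silence at either end; the exact file format
# and the silence label names are assumptions, not confirmed by the source.
def _parse_labels(labels_path):
    labels = []
    with open(labels_path) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 3:
                labels.append((float(parts[0]), float(parts[1]), parts[2]))
    start_offset = 0.0
    end_offset = None
    if labels and labels[0][2] in ('sil', 'pau'):
        start_offset = labels[0][1]  # speech starts where leading silence ends
    if labels and labels[-1][2] in ('sil', 'pau'):
        end_offset = labels[-1][0]   # speech ends where trailing silence begins
    return start_offset, end_offset
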
def _process_utterance(out_dir, index, fname, wav_path, text, label):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        fname: The base filename of the utterance
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        label: The emotion label for the utterance

    Returns:
        A (fname, spectrogram_filename, mel_filename, n_frames, text, label) tuple
        to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'emotion-spec-%05d.npy' % index
    mel_filename = 'emotion-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (fname, spectrogram_filename, mel_filename, n_frames, text, label)

def preprocess(data_dir, normalized_dir):
    # Ingest the text file:
    # normalize(data_dir, normalized_dir)
    wav_files = sorted(Path(data_dir).glob('**/*.wav'))
    train_tuples = []
    for wav_file in wav_files:
        # Get the transcript. LibriSpeech-style transcripts live in a
        # '<speaker>-<chapter>.trans.txt' file next to the wav files, one
        # '<utterance_id> <text>' pair per line.
        text_filepath = (wav_file.parent /
                         ('-'.join(wav_file.stem.split('-')[:-1]) + '.trans.txt'))
        text = ''
        with text_filepath.open() as text_file:
            for line in text_file:
                split_line = line.split()
                # The transcript IDs carry no '.wav' extension, so compare
                # against the stem rather than the full filename:
                if split_line and split_line[0] == wav_file.stem:
                    text = ' '.join(split_line[1:])
                    break

        # Compute and save the linear- and mel-scale spectrograms:
        wav = audio.load_wav(str(wav_file))
        spectrogram = audio.spectrogram(wav).astype(np.float32)
        spectrogram_filename = wav_file.stem + '_spectrogram'
        mel_spectrogram_filename = wav_file.stem + '_mel_spectrogram'
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        np.save(os.path.join(normalized_dir, spectrogram_filename),
                spectrogram.T, allow_pickle=False)
        np.save(os.path.join(normalized_dir, mel_spectrogram_filename),
                mel_spectrogram.T, allow_pickle=False)
        train_tuples.append((spectrogram_filename, mel_spectrogram_filename,
                             spectrogram.shape[1], text))

    print('Done.')
    with open(os.path.join(normalized_dir, 'train.txt'), 'w') as train_file:
        for train_tuple in train_tuples:
            train_file.write('|'.join([str(x) for x in train_tuple]) + '\n')

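# A hypothetical invocation of preprocess() above, assuming a LibriSpeech-style
# corpus layout; both directory names are illustrative.
if __name__ == '__main__':
    preprocess('LibriSpeech/train-clean-100', 'training')
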
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ptbr-spec-%05d.npy' % index
    mel_filename = 'ptbr-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Rescale the wav to a unified peak level across all clips:
    wav = wav / np.abs(wav).max() * 0.999

    # Trim leading and trailing silence:
    wav = audio.trim_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)

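# audio.trim_silence is defined elsewhere in the repo. A minimal sketch of what
# such a helper typically does, assuming librosa is available; the 60 dB
# threshold is an illustrative default, not a value taken from the source.
import librosa

def trim_silence(wav, top_db=60):
    # librosa.effects.trim returns the trimmed signal and the (start, end)
    # sample interval it kept; only the signal is needed here.
    trimmed, _ = librosa.effects.trim(wav, top_db=top_db)
    return trimmed
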
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nawar-spec-%05d.npy' % index
    mel_filename = 'nawar-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, index, wav_path, pinyin):
    # Load the audio and rescale it to a unified peak level:
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim leading and trailing silence:
    wav = audio.trim_silence(wav)

    # Compute the linear- and mel-scale spectrograms:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Save both spectrograms:
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, pinyin)

def _process_utterance(out_dir, index, wav_path, text, pml_cmp, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        pml_cmp: One-dimensional array containing vocoder features read from a .cmp file
        hparams: Hyperparameters, including the PML feature dimension

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, pml_filename, pml_frames, text)
        tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Write the PML vocoder features to disk:
    pml_filename = 'nick-pml-%05d.npy' % index
    pml_dimension = hparams.pml_dimension
    pml_features = pml_cmp.reshape((-1, pml_dimension))
    pml_frames = pml_features.shape[0]
    np.save(os.path.join(out_dir, pml_filename), pml_features, allow_pickle=False)

    # Remove silence from the wav
    # silence_removed = audio.remove_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Truncate the spectrograms so their lengths match the PML features:
    if n_frames > pml_frames:
        spectrogram = spectrogram[:, :pml_frames]
    if mel_frames > pml_frames:
        mel_spectrogram = mel_spectrogram[:, :pml_frames]

    # Write the spectrograms to disk:
    spectrogram_filename = 'nick-spec-%05d.npy' % index
    mel_filename = 'nick-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pml_filename, pml_frames, text)

import numpy as np

from util import audio

point_of_interest = '001'

# Read the PML vocoder features from a Nick .cmp file:
CMP = np.fromfile(
    f'/home/josh/tacotron/Nick/pml/herald_{point_of_interest}_1.cmp',
    dtype=np.float32)
# pml_features = nick_parameters.reshape()

with open(f'/home/josh/tacotron/Nick/txt/herald_{point_of_interest}_1.txt', 'r') as f:
    transcript = f.read()

# Load the audio to a numpy array:
wav = audio.load_wav(
    f'/home/josh/tacotron/Nick/wav/herald_{point_of_interest}_1.wav')

# Compute the linear-scale spectrogram from the wav:
spectrogram = audio.spectrogram(wav).astype(np.float32)

# Try to reshape the Nick PML features into a (frames x pml_dimension) matrix
# pml_dimension = 86
pml_dimension = 163
pml_features = CMP.reshape((-1, pml_dimension))

# Get the saved spectrogram
linear_target = np.load('/home/josh/tacotron/training/nick-spec-00605.npy')

print(transcript)
print('PML Feature Info', CMP.shape, CMP.size / pml_dimension, pml_features.shape)
print('Audio Info', spectrogram.shape, linear_target.shape, wav.shape)

def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyperparameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps,
          mel_frames, text)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'
              .format(wav_path))
        return None

    # M-AILABS extra silence specific
    wav = audio.trim_silence(wav)

    # Pre-emphasize
    wav = audio.preemphasis(wav)

    # Rescale wav
    # wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        # raise RuntimeError('wav has invalid value: {}'.format(wav))
        print('file {} has invalid value. skipping!'.format(wav_path))
        return None

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_frame_num:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.spectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check: both spectrograms must span the same number of frames
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad, hop_size = audio.librosa_pad_lr(wav)

    # Pad the audio signal (as librosa does internally) to avoid frame inconsistency
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * hop_size

    # Time resolution adjustment: ensure the length of the raw audio is a multiple
    # of the hop size so that we can use transposed convolution to upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

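# audio.librosa_pad_lr is repo-specific and not shown here. A sketch of what a
# three-value variant might compute; the fft_size/hop_size defaults and the
# function body are assumptions, not taken from the source. The idea is that
# librosa's centered STFT implicitly pads fft_size // 2 samples on each side,
# so padding the raw audio by the same amount keeps len(out) >= mel_frames *
# hop_size, which is exactly what the assert in the caller checks.
def librosa_pad_lr(wav, fft_size=2048, hop_size=275):
    # Return explicit left/right pad amounts plus the hop size so the caller
    # can align raw audio samples with spectrogram frames.
    pad = fft_size // 2
    return pad, pad, hop_size
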
def _process_utterance(out_dir, fid, wav_path, label_path, phn_seq, phn2idx_map,
                       frame_phn_idx, trim_silence, audio_config, f0_config):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms, phone durations, and F0
    to disk and returns a tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the features into
        fid: Utterance id
        wav_path: Path to the audio file containing the speech input
        label_path: Path to the full label file
        phn_seq: text-phn-seq from the ppp file
        phn2idx_map: Maps a phn (str) to an index (int)
        frame_phn_idx: Frame-level text-phn positions
        trim_silence: Whether to trim silence using offsets parsed from the label file
        audio_config: Spectrogram extraction parameters
        f0_config: F0 extraction parameters

    Returns:
        A (expanded phn-seq length, fid) tuple
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, audio_config.sample_rate)

    # Trim silence
    if trim_silence:
        start_offset, end_offset = _parse_labels(label_path)
        start_offset_frames = int(start_offset / (audio_config.frame_shift_ms / 1000.0))
        end_offset_frames = int(end_offset / (audio_config.frame_shift_ms / 1000.0))
    else:
        start_offset_frames = 0
        end_offset_frames = 1000000

    # Compute a mel-scale spectrogram for wavenet/wavernn
    mel_spectrogram = melspectrogram(
        wave=wav,
        sample_rate=audio_config.sample_rate,
        num_freq=audio_config.num_freq,
        num_mels=audio_config.num_mels,
        frame_length_ms=audio_config.frame_length_ms,
        frame_shift_ms=audio_config.frame_shift_ms,
        pre_emphasis=audio_config.pre_emphasis,
        fmin=audio_config.fmin,
        min_level_db=audio_config.min_level_db,
        ref_level_db=audio_config.ref_level_db).astype(np.float32)
    mel_spectrogram = mel_spectrogram.T  # -> [T, num_mels]

    # Compute the linear-scale spectrogram
    stft = spectrogram(
        wave=wav,
        sample_rate=audio_config.sample_rate,
        num_freq=audio_config.num_freq,
        frame_length_ms=audio_config.frame_length_ms,
        frame_shift_ms=audio_config.frame_shift_ms,
        pre_emphasis=audio_config.pre_emphasis,
        min_level_db=audio_config.min_level_db,
        ref_level_db=audio_config.ref_level_db).astype(np.float32)
    stft = stft.T  # -> [T, num_freq]

    # Extract a continuous F0 track and match its length to the mel spectrogram
    cont_f0 = compute_f0_from_wav(speaker_name='song_guiniang',
                                  wave_path=wav_path,
                                  f0_config=f0_config)
    if len(cont_f0) > len(mel_spectrogram):
        cont_f0 = cont_f0[:len(mel_spectrogram)]
    elif len(cont_f0) < len(mel_spectrogram):
        pad_len = len(mel_spectrogram) - len(cont_f0)
        cont_f0 = np.pad(cont_f0, (0, pad_len), mode="constant",
                         constant_values=cont_f0[-1])

    # Process the phn sequence: collapse the frame-level phone indices into
    # per-phone durations and the corresponding text-level phone sequence
    phn_seq_idx = np.asarray([phn2idx_map[i] for i in phn_seq])
    dur = np.asarray([len(list(group))
                      for (key, group) in itertools.groupby(frame_phn_idx)])
    pos_list = [key for (key, group) in itertools.groupby(frame_phn_idx)]
    txt_phnseq = np.array(phn_seq)[pos_list]

    # Cut off trailing silence
    cut_len = mel_spectrogram.shape[0] - sum(dur)
    if cut_len > 0:
        mel_spectrogram = mel_spectrogram[:-cut_len]
        stft = stft[:-cut_len]
        cont_f0 = cont_f0[:-cut_len]
    else:
        assert dur[-1] > -cut_len, "Not enough trailing silence."
        dur[-1] = dur[-1] + cut_len

    # Save the features
    with open(f"{out_dir}/{fid}.phntone.txt", 'w') as f:
        f.write(' '.join(txt_phnseq) + '\n')
    np.save(f'{out_dir}/{fid}.mel.npy', mel_spectrogram, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.stft.npy', stft, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.dur.npy', dur, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.f0.npy', cont_f0, allow_pickle=False)

    return len(frame_phn_idx), fid

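# A toy illustration of the itertools.groupby step above: consecutive runs of
# the same frame-level phone index collapse into one duration per phone. The
# frame_phn_idx values here are made up for the example.
import itertools

frame_phn_idx = [0, 0, 0, 1, 1, 2, 2, 2, 2]
dur = [len(list(group)) for (key, group) in itertools.groupby(frame_phn_idx)]
pos_list = [key for (key, group) in itertools.groupby(frame_phn_idx)]
print(dur)       # [3, 2, 4] -> frames spent on each phone
print(pos_list)  # [0, 1, 2] -> which text-level phone each run points at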