def main():
    print("trim audio begin....")
    # Renamed from `input`/`output` to avoid shadowing the builtin input()
    input_path = os.path.join("/Users/zhuribing/Documents", "p225_028.wav")
    output_path = os.path.join("/Users/zhuribing/Documents", "trim_p225_028.wav")
    wav = audio.load_wav(input_path, sr=16000)
    wav = audio.trim_silence(wav, hparams)
    audio.save_wav(wav, output_path, 16000)
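audio.trim_silence is not shown in these snippets; in the Tacotron-2-style codebases they resemble, it is typically a thin wrapper around librosa.effects.trim. A minimal sketch, assuming hparams carries trim_top_db, trim_fft_size and trim_hop_size (those field names are an assumption):

import librosa

def trim_silence(wav, hparams):
    # Drop leading/trailing audio quieter than trim_top_db dB relative to peak
    return librosa.effects.trim(wav, top_db=hparams.trim_top_db,
                                frame_length=hparams.trim_fft_size,
                                hop_length=hparams.trim_hop_size)[0]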
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wavs into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wavs exception
        print('file {} present in csv metadata is not present in wavs folder. skipping!'.format(wav_path))
        return None

    # rescale wavs
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       speaker_num, lan_num, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames,
          text, speaker_num, lan_num)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Get spectrograms from wav
    ret = audio.wav2spectrograms(wav, hparams)
    if ret is None:
        return None
    out, mel_spectrogram, linear_spectrogram, time_steps, mel_frames = ret

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(np.float32), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames,
            text, speaker_num, lan_num)
def get_second_part_wave(wav, start_time, end_time, hparams):
    # `wav` is expected to be a pydub-style AudioSegment, sliceable in milliseconds
    start_time = int(start_time * 1000)
    end_time = int(end_time * 1000)
    sentence = wav[start_time:end_time]
    # Round-trip through a temporary file to get a numpy array at the target rate
    sentence.export('temp.wav', format="wav")
    sentence = audio.load_wav('temp.wav', sr=hparams.sample_rate)
    return sentence
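A usage sketch for get_second_part_wave, assuming the segment comes from pydub (whose AudioSegment supports millisecond slicing and .export(), matching the calls above); the input filename here is hypothetical:

from pydub import AudioSegment

segment = AudioSegment.from_wav('utterance.wav')           # hypothetical input file
clip = get_second_part_wave(segment, 1.5, 3.0, hparams)    # samples for 1.5 s - 3.0 s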
def _process_wave(self, wav_file, num_frames):
    try:
        wav = audio.load_wav(wav_file, sr=audio_hparams.sample_rate)
    except FileNotFoundError:
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_file))
        return None  # without this, `wav` would be unbound below

    if audio_hparams.trim_silence:
        wav = audio.trim_silence(wav, audio_hparams)

    # Tile the waveform until it is long enough, then crop a random window
    expect_len = num_frames * audio_hparams.hop_size + audio_hparams.win_size
    if len(wav) < expect_len:
        wav = np.concatenate([wav] * int(np.ceil(expect_len / len(wav))))
    if len(wav) > expect_len:
        sp = random.randint(0, len(wav) - expect_len)
        wav = wav[sp:sp + expect_len]

    wav = audio.preemphasis(wav, audio_hparams.preemphasis, audio_hparams.preemphasize)
    if audio_hparams.rescale:
        wav = wav / np.abs(wav).max() * audio_hparams.rescaling_max
    mels = audio.melspectrogram(wav, audio_hparams).astype(np.float32).T
    return mels
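The tile-then-crop length normalization above can be isolated into a self-contained helper; a minimal sketch (the name fit_length is illustrative, not from the original module):

import math
import random

import numpy as np

def fit_length(wav, expect_len):
    """Tile `wav` until it reaches expect_len samples, then crop a random window."""
    if len(wav) < expect_len:
        wav = np.concatenate([wav] * math.ceil(expect_len / len(wav)))
    if len(wav) > expect_len:
        sp = random.randint(0, len(wav) - expect_len)
        wav = wav[sp:sp + expect_len]
    return wav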
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # [0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Zero pad so the audio length lines up with the spectrogram frames
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
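mulaw and mulaw_quantize are imported from elsewhere; a common mu-law companding sketch consistent with how they are used above (taking mu to be the channel count, e.g. hparams.quantize_channels; this is an assumed implementation, not the original one):

import numpy as np

def mulaw(x, mu=256):
    # Compand [-1, 1] audio into [-1, 1]; mu is the number of quantization channels
    m = mu - 1
    return np.sign(x) * np.log1p(m * np.abs(x)) / np.log1p(m)

def mulaw_quantize(x, mu=256):
    # Map companded values to integer bins in [0, mu)
    y = mulaw(x, mu)
    return np.floor((y + 1) / 2 * (mu - 1) + 0.5).astype(np.int16)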
def main():
    process_type = '1'
    input_path = './wav_files/sample.wav'
    output_path = input_path.split('.wav')[0] + '_enhanced_mmse' + process_type + '.wav'

    wav, sample_rate = audio.load_wav(input_path, sr=None)
    # Normalize to just below full scale before enhancement
    wav = wav / np.abs(wav).max() * 0.999
    clean_wav = speech_enhancement.mmse_stsa(wav, sample_rate=sample_rate, process_type=process_type)

    # Plot result
    plt.figure()
    plt.subplot(2, 1, 1)
    librosa.display.waveplot(wav, sr=sample_rate)
    plt.title('Noisy Time Signal')
    plt.subplot(2, 1, 2)
    librosa.display.waveplot(clean_wav, sr=sample_rate)
    plt.title('Estimated Clean Time Signal')

    plt.figure()
    plt.subplot(2, 1, 1)
    librosa.display.specshow(
        librosa.power_to_db(
            librosa.feature.melspectrogram(y=wav, sr=sample_rate, n_fft=1024, hop_length=512),
            ref=np.max),
        sr=sample_rate, x_axis='time', y_axis='linear')
    plt.title('Noisy Spectrogram')
    plt.colorbar(format='%+2.0f dB', boundaries=np.linspace(-70, 0, 10))
    plt.subplot(2, 1, 2)
    librosa.display.specshow(
        librosa.power_to_db(
            librosa.feature.melspectrogram(y=clean_wav, sr=sample_rate, n_fft=1024, hop_length=512),
            ref=np.max),
        sr=sample_rate, x_axis='time', y_axis='linear')
    plt.title('Estimated Clean Spectrogram')
    plt.colorbar(format='%+2.0f dB', boundaries=np.linspace(-70, 0, 10))
    plt.tight_layout()
    plt.show()

    # Scale to the int16 range before writing (proposed by @dsmiller)
    clean_wav *= 32767 / max(0.01, np.max(np.abs(clean_wav)))
    wavfile.write(output_path, sample_rate, clean_wav.astype(np.int16))
def _process_utterance_libri(mel_dir, label_dir, index, wav_path, label, hparams):
    """
    Preprocesses a single utterance wav/label pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - label_dir: the directory to write the label into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - label: frame-level labels for the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, label_filename)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # [-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    # Keep only the last len(label) frames so mel and label lengths match
    mel_spectrogram = mel_spectrogram[:, -len(label):]
    mel_frames = mel_spectrogram.shape[1]
    time_steps = len(out)

    # Write the spectrogram and label to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, label_filename)
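The `[:, -len(label):]` slice keeps only the trailing frames so the mel length matches the frame-level label length; a tiny self-contained check of that idiom:

import numpy as np

mel = np.zeros((80, 6))        # (n_mels, frames)
label = [0, 1, 1]              # 3 frame-level labels
assert mel[:, -len(label):].shape == (80, 3)  # the last 3 frames are kept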
def vad():
    print("trim audio begin....")
    dataset_root = Path("/Users/zhuribing/Project/AccelerateServerTest/audio")
    files = list(dataset_root.joinpath("org").glob("*"))
    print(len(files))
    cnt = 0
    for input_path in files:  # renamed from `input` to avoid shadowing the builtin
        output_path = str(input_path).replace("org", "trim", 1)
        wav = audio.load_wav(input_path, sr=16000)
        wav = audio.trim_silence(wav, hparams)
        audio.save_wav(wav, output_path, 16000)
        cnt += 1
        if cnt % 10 == 0:
            print("complete:", cnt)
def _process_utterance(feat_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the extracted features to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - feat_dir: the directory to write the features into
        - index: the numeric index to use in the feature filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (feat_file, n_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, hparams)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # feature extraction
    feature = audio.feature_extract(wav, hparams)
    n_frames = len(feature)
    if n_frames > hparams.max_frame_num or len(text) > hparams.max_text_length:
        return None

    feat_file = '{}.npy'.format(index)
    np.save(os.path.join(feat_dir, feat_file), feature, allow_pickle=False)

    # Return a tuple describing this training example
    return (feat_file, n_frames, text)
def run_eval(args, checkpoint_path, output_dir, hparams, synth):
    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    if args.reference_audio_path is not None:
        print('reference_audio:', args.reference_audio_path)
        ref_wavs = os.listdir(args.reference_audio_path)
    else:
        if hparams.use_style_encoder:
            print("*******************************")
            print("TODO: add style weights when there is no reference audio. Now we use random weights, "
                  + "which may generate unintelligible audio sometimes.")
            print("*******************************")
        else:
            # raise ValueError("You must set the reference audio if you don't want to use GSTs.")
            print("233")
        ref_wavs = []  # nothing to iterate over without reference audio

    # Set inputs batch wise
    counter = 0
    fault_ppgs = np.zeros((1, 2, 345), dtype=np.float32)
    for ref_wav in ref_wavs:
        speaker = ref_wav.split('_')[0]
        sentence = ref_wav.split('_')[1]
        # Create output path if it doesn't exist
        save_path = os.path.join(output_dir, speaker, sentence)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        counter += 1
        ref_wav_name = os.path.join(args.reference_audio_path, ref_wav)
        save_name = os.path.join(save_path, ref_wav.split('.')[0] + '.npy')
        ref_wav = load_wav(ref_wav_name, hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
        style_embedding = synth.synthesize_embedding(fault_ppgs, [reference_mel])[0]
        np.save(save_name, style_embedding)
        print(str(counter) + '/' + str(len(ref_wavs)))
def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio, sr=hparams.sample_rate)
        reference_mel = audio.melspectrogram(ref_wav, hparams).astype(np.float32).T
    else:
        # raise ValueError("Evaluation without reference audio. Please provide path to reference audio.")
        reference_mel = None
    synth.load(checkpoint_path, hparams, reference_mel=reference_mel)

    # Set inputs batch wise
    sentences = [sentences[i:i + hparams.tacotron_synthesis_batch_size]
                 for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(sentences)):
            start = time.time()
            basenames = ['batch_{:03d}_sentence_{:03d}'.format(i, j) for j in range(len(texts))]
            mel_filenames, speaker_ids = synth.synthesize(texts, basenames, eval_dir, log_dir, None,
                                                          reference_mel=reference_mel)
            for elems in zip(texts, mel_filenames, speaker_ids):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
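The list-slicing idiom above chunks a flat list into batches of at most tacotron_synthesis_batch_size items, with a possibly shorter final batch; a tiny self-contained check:

xs = list(range(5))
batch_size = 2
batches = [xs[i:i + batch_size] for i in range(0, len(xs), batch_size)]
assert batches == [[0, 1], [2, 3], [4]]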
def run_eval(args, checkpoint_path, output_dir, hparams, ppgs, speakers, Lf0s):
    eval_dir = os.path.join(output_dir, 'eval')
    log_dir = os.path.join(output_dir, 'logs-eval')

    if args.model == 'Tacotron-2':
        assert os.path.normpath(eval_dir) == os.path.normpath(args.mels_dir)

    # Create output path if it doesn't exist
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'wavs'), exist_ok=True)
    os.makedirs(os.path.join(log_dir, 'plots'), exist_ok=True)

    log(hparams_debug_string())
    synth = Synthesizer()
    synth.load(checkpoint_path, hparams, reference_mels=args.reference_audio)
    if args.reference_audio is not None:
        print('reference_audio:', args.reference_audio)
        ref_wav = load_wav(args.reference_audio.strip(), hparams.sample_rate)
        reference_mel = melspectrogram(ref_wav, hparams).astype(np.float32).T
    else:
        if hparams.use_style_encoder:
            print("*******************************")
            print("TODO: add style weights when there is no reference audio. Now we use random weights, "
                  + "which may generate unintelligible audio sometimes.")
            print("*******************************")
        else:
            # raise ValueError("You must set the reference audio if you don't want to use GSTs.")
            print("233")

    # Set inputs batch wise
    ppgs = [ppgs[i:i + hparams.tacotron_synthesis_batch_size]
            for i in range(0, len(ppgs), hparams.tacotron_synthesis_batch_size)]
    Lf0s = [Lf0s[i:i + hparams.tacotron_synthesis_batch_size]
            for i in range(0, len(Lf0s), hparams.tacotron_synthesis_batch_size)]
    if args.reference_audio is not None:
        reference_mels = [reference_mel] * len(ppgs)

    log('Starting Synthesis')
    with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
        for i, texts in enumerate(tqdm(ppgs)):
            start = time.time()
            basenames = ['batch_{}_sentence_{}'.format(i, j) for j in range(len(texts))]
            if args.reference_audio is not None:
                mel_filenames = synth.synthesize(texts, [speakers[i]], basenames, eval_dir,
                                                 log_dir, None, [reference_mels[i]], Lf0s[i])
            else:
                mel_filenames = synth.synthesize(texts, [speakers[i]], basenames, eval_dir,
                                                 log_dir, None, None, Lf0s[i])
            for elems in zip(texts, mel_filenames, [speakers[i]]):
                file.write('|'.join([str(x) for x in elems]) + '\n')
    log('synthesized mel spectrograms at {}'.format(eval_dir))
    return eval_dir
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from a given input path to given output directories

    Args:
        - hparams: hyper parameters
        - input_dirs: input directory that contains the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - label_dir: the directory to write the label into
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples describing the train examples. This should be written to train.txt
    """
    # We could use ProcessPoolExecutor to parallelize across processes; this is just
    # an optimization and it can be omitted
    futures = []
    index = 1
    for input_dir in input_dirs:
        files = find_files(os.path.join(input_dir))
        for wav_path in files:
            file_name = wav_path.split("\\")[-1]
            if int(file_name.split('.')[0]) <= 10:
                # Segment-level labels live in label.txt as [start, end, ...] millisecond pairs
                label_path = wav_path.split("\\")[0] + '/label.txt'
                with open(label_path, encoding='utf-8') as f:
                    lines = f.readlines()
                for line in lines:
                    if file_name in line:
                        labels = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',')
                        start = []
                        end = []
                        for idx in range(0, len(labels), 2):
                            start.append(int(labels[idx]))
                            end.append(int(labels[idx + 1]))
            try:
                # Load the audio as numpy array
                wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
            except FileNotFoundError:  # catch missing wav exception
                print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
                return None

            # rescale wav
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:
                wav = audio.trim_silence(wav, hparams)

            # [-1, 1]
            out = wav
            out_dtype = np.float32

            if int(file_name.split('.')[0]) <= 10:
                # Build a sample-level 0/1 speech mask from the millisecond segments
                label = np.zeros_like(out)
                for idx in range(len(start)):
                    start[idx] = int(start[idx] / 1000 * hparams.sample_rate)
                    end[idx] = int(end[idx] / 1000 * hparams.sample_rate)
                    label[start[idx]:end[idx]] = 1.
            else:
                # Frame-level labels are stored one integer per line in a .label file
                label = wav_path.split('.')[0] + '.label'
                with open(label, encoding='utf-8') as f:
                    lines = f.readlines()
                label = np.asarray([int(line.strip('\n')) for line in lines])

            # Compute the mel scale spectrogram from the wav
            mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
            mel_spectrogram = mel_spectrogram[:, -len(label):]
            mel_frames = mel_spectrogram.shape[1]

            # Ensure time resolution adjustment between audio and mel-spectrogram
            pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
            if int(file_name.split('.')[0]) <= 10:
                # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
                out = np.pad(out, (0, pad), mode='reflect')
                label = np.pad(label, (0, pad), mode='reflect')
                assert len(out) >= mel_frames * audio.get_hop_size(hparams)

                # time resolution adjustment
                # ensure length of raw audio is a multiple of hop size so that we can use
                # transposed convolution to upsample
                out = out[:mel_frames * audio.get_hop_size(hparams)]
                label = label[:mel_frames * audio.get_hop_size(hparams)]
                assert len(out) % audio.get_hop_size(hparams) == 0
                # downsample the sample-level mask to one label per frame
                label = label[::audio.get_hop_size(hparams)]
                time_steps = len(out)
            else:
                time_steps = len(out)

            # Write the spectrogram and label to disk
            mel_filename = 'mel-{}.npy'.format(index)
            label_filename = 'label-{}.npy'.format(index)
            np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
            np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)
            futures.append((wav_path, mel_filename, time_steps, mel_frames, label_filename))
            index += 1

    return [future for future in tqdm(futures)]
def _process_utterance(mel_dir, linear_dir, wav_dir, spkid, uttid, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - spkid, uttid: the speaker and utterance ids to use in the filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    sub_wav_dir = os.path.join(wav_dir, spkid)
    sub_mel_dir = os.path.join(mel_dir, spkid)
    sub_linear_dir = os.path.join(linear_dir, spkid)
    os.makedirs(sub_wav_dir, exist_ok=True)
    os.makedirs(sub_mel_dir, exist_ok=True)
    os.makedirs(sub_linear_dir, exist_ok=True)
    audio_filename = 'audio-{}.npy'.format(uttid)
    mel_filename = 'mel-{}.npy'.format(uttid)
    linear_filename = 'linear-{}.npy'.format(uttid)
    np.save(os.path.join(sub_wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(sub_mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(sub_linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
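audio.librosa_pad_lr is used throughout these snippets but never shown; a sketch consistent with how Tacotron-2-style repos usually define it (treat the exact form as an assumption). It pads the signal so its length rounds up to a whole number of hops, matching the STFT frame count. Note the call sites differ: some above unpack `(left, right)`, while others treat the return as a single right-pad scalar (an older variant returned `int(fsize // 2)`).

def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Compute (left, right) padding so len(x) rounds up to a whole number of hops."""
    assert pad_sides in (1, 2)
    pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
    if pad_sides == 1:
        return 0, pad
    return pad // 2, pad // 2 + pad % 2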
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple
    to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
        A (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)
        tuple to write to train.txt
    '''
    mel_dir = out_dir + "/mels"
    linear_dir = out_dir + "/linear"
    wav_dir = out_dir + "/audio"

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    # Compute the linear-scale spectrogram from the wav:
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk:
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
    print("debug save wav file:", os.path.join(wav_dir, audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)
def _process_utterance(mel_dir, index, wav_path, start, end, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - start, end: start and end points of speech, in samples
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, start, end)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Shift the speech boundaries by one second of samples
    start += 1 * hparams.sample_rate
    end += 1 * hparams.sample_rate

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # [-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

    # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Convert the sample-level boundaries to mel frame indices
    start = round(start / int(time_steps / mel_frames))
    end = round(end / int(time_steps / mel_frames))

    # Write the spectrogram to disk
    mel_filename = 'mel-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, start, end)
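Because time_steps has just been trimmed to mel_frames * hop_size, the quotient time_steps / mel_frames is exactly the hop size, so the final division maps sample indices to frame indices. A worked example with assumed hparams values (sample_rate 16000, hop_size 200):

sample_rate, hop_size = 16000, 200   # assumed values for illustration
start = 2 * sample_rate              # speech starts 2 s in -> sample 32000
frame = round(start / hop_size)      # sample index / samples-per-frame
assert frame == 160                  # -> mel frame 160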
def _process_utterance(lf0_dir, mgc_dir, bap_dir, cmp_dir, linear_dir, basename, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This extracts WORLD/SPTK vocoder features (lf0, mgc, bap, merged into cmp) plus a
    linear spectrogram, writes them to disk, and returns a tuple to write to the
    train.txt file.

    Args:
        - lf0_dir, mgc_dir, bap_dir, cmp_dir: directories to write the vocoder features into
        - linear_dir: the directory to write the linear spectrograms into
        - basename: the base name to use in the feature filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (cmp_filename, linear_filename, cmp_frames, text)
    """
    if hparams.trim_silence:
        tar_wavfile = wav_path[:-4] + "_trim.wav"
        print("raw wav path:%s" % wav_path)
        wav_raw, fs = sf.read(wav_path)
        wav_trim = audio.trim_silence(wav_raw, hparams)
        sf.write(tar_wavfile, wav_trim, fs)
        wav_path = tar_wavfile

    nFFTHalf, alpha, bap_dim = audio.get_config(hparams.sample_rate)
    mcsize = hparams.num_mgc - 1
    filename = basename  # os.path.basename(wav_path).split(".")[0]
    print('extract feats for %s' % wav_path)

    # extract f0, sp, ap with the WORLD analysis tool
    os.system("analysis %s %s/%s.f0 %s/%s.sp %s/%s.bapd" %
              (wav_path, lf0_dir, filename, mgc_dir, filename, bap_dir, filename))

    # interpolate f0 over unvoiced regions
    f0 = np.fromfile("%s/%s.f0" % (lf0_dir, filename), dtype=np.float64)
    continuous_f0 = interp1d(f0, kind="slinear")
    continuous_f0.tofile("%s/%s.f0c" % (lf0_dir, filename))

    # convert f0 to lf0 (log f0, with unvoiced frames marked by a magic value)
    os.system("x2x +da %s/%s.f0c > %s/%s.f0a" % (lf0_dir, filename, lf0_dir, filename))
    os.system("x2x +af %s/%s.f0a | sopr -magic 0.0 -LN -MAGIC -1.0E+10 > %s/%s.lf0" %
              (lf0_dir, filename, lf0_dir, filename))

    # convert sp to mgc (mel-generalized cepstrum)
    os.system("x2x +df %s/%s.sp | sopr -R -m 32768.0 | "
              "mcep -a %f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q 3 "
              "> %s/%s.mgc" % (mgc_dir, filename, alpha, mcsize, nFFTHalf, mgc_dir, filename))

    # convert ap to bap (band aperiodicity)
    os.system("x2x +df %s/%s.bapd > %s/%s.bap" % (bap_dir, filename, bap_dir, filename))

    # merge mgc, lf0 and bap into a single cmp stream
    os.system("merge +f -s 0 -l 1 -L %d %s/%s.mgc < %s/%s.lf0 > %s/%s.ml" %
              ((mcsize + 1), mgc_dir, filename, lf0_dir, filename, cmp_dir, filename))
    os.system("merge +f -s 0 -l %d -L %d %s/%s.ml < %s/%s.bap > %s/%s.cmp" %
              (bap_dim, (mcsize + 2), cmp_dir, filename, bap_dir, filename, cmp_dir, filename))

    # Compute the linear scale spectrogram from the wav
    wav = audio.load_wav(wav_path, hparams.sample_rate)
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    lf0 = np.fromfile("%s/%s.lf0" % (lf0_dir, filename), dtype=np.float32)
    mgc = np.fromfile("%s/%s.mgc" % (mgc_dir, filename), dtype=np.float32)
    bap = np.fromfile("%s/%s.bap" % (bap_dir, filename), dtype=np.float32)
    cmp = np.fromfile("%s/%s.cmp" % (cmp_dir, filename), dtype=np.float32)

    # cmp frames interleave mgc (mcsize+1), lf0 (1) and bap (bap_dim) coefficients
    cmp_dim = mcsize + 1 + 1 + bap_dim
    cmp_frames = cmp.shape[0] / cmp_dim

    print(lf0.shape)
    print(continuous_f0.shape)
    print(mgc.shape)
    print(bap.shape)
    print(cmp_frames)
    print(continuous_f0.dtype)
    print(mgc.dtype)
    print(bap.dtype)

    # sanity checks: every stream must describe the same number of frames
    assert (mgc.shape[0] / (mcsize + 1)) == (continuous_f0.shape[0] / 1) == (bap.shape[0] / bap_dim) == cmp_frames
    assert cmp_dim == hparams.num_mels

    # Write the merged features and the spectrogram to disk
    cmp_mat = cmp.reshape(-1, cmp_dim)
    cmp_filename = 'cmp-{}.npy'.format(basename)
    linear_filename = 'linear-{}.npy'.format(basename)
    np.save(os.path.join(cmp_dir, cmp_filename), cmp_mat, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (cmp_filename, linear_filename, cmp_frames, text)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]

    # Repeat last sample until the number of samples is divisible by the batch size (last run scenario)
    while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
        texts.append(texts[-1])
        basenames.append(basenames[-1])
        if mel_filenames is not None:
            mel_filenames.append(mel_filenames[-1])

    assert 0 == len(texts) % self._hparams.tacotron_num_gpus
    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]

        # pad targets according to each GPU max length
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
            target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
            # Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    if self.style_transfer and hparams.tacotron_style_reference_audio is not None and \
            hparams.tacotron_style_alignment is None:
        # only support one style reference audio
        if hparams.tacotron_style_reference_audio[-4:] == '.wav':
            wav = audio.load_wav(hparams.tacotron_style_reference_audio, sr=hparams.sample_rate)
            np_targets = audio.melspectrogram(wav, self._hparams).astype(np.float32).T
        else:
            np_targets = np.load(hparams.tacotron_style_reference_audio)
        target_lengths = len(np_targets)

        # copy the single reference across the whole batch
        np_targets = [np_targets for _ in range(len(texts))]
        target_lengths = [target_lengths for _ in range(len(texts))]

        # pad targets according to each GPU max length
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(device_target, self._hparams.outputs_per_step)
            target_seqs = np.concatenate((target_seqs, device_target), axis=1) if target_seqs is not None else device_target
            # Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        feed_dict[self.target_lengths] = target_lengths
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)

    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        if not self.gta:
            # Natural batch synthesis
            # Get Mel lengths for the entire batch from stop_tokens predictions
            target_lengths = self._get_output_lengths(stop_tokens)

        # Take off the batch wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (1D arrays)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens for token in gpu_token]

        # Natural batch synthesis
        # Get Mel/Linear lengths for the entire batch from stop_tokens predictions
        # target_lengths = self._get_output_lengths(stop_tokens)
        target_lengths = [9999]

        # Take off the batch wise padding
        mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        # Generate wav and read it
        wav = audio.inv_mel_spectrogram(mels.T, hparams)
        audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate)  # Find a better way

        chunk = 512
        f = wave.open('temp.wav', 'rb')
        p = pyaudio.PyAudio()
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        data = f.readframes(chunk)
        while data:
            stream.write(data)
            data = f.readframes(chunk)

        stream.stop_stream()
        stream.close()
        p.terminate()
        return

    saved_mels_paths = []
    speaker_ids = []
    for i, mel in enumerate(mels):
        # Get speaker id for global conditioning (only used with GTA generally)
        if hparams.gin_channels > 0:
            raise RuntimeError(
                'Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
            # set the rule to determine speaker id. By using the file basename maybe?
            # (basenames are inside the "basenames" variable)
            speaker_id = '<no_g>'
            # finish by appending the speaker id
            # (allows for different speakers per batch if your model is multispeaker)
            speaker_ids.append(speaker_id)
        else:
            speaker_id = '<no_g>'
            speaker_ids.append(speaker_id)

        # Write the spectrogram to disk
        # Note: outputs mel-spectrogram files and target ones have same names, just different folders
        mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])),
                           sr=hparams.sample_rate)

            # save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
                                title='{}'.format(texts[i]), split_title=True,
                                max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
                                  title='{}'.format(texts[i]), split_title=True)

            if hparams.predict_linear:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])),
                               sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                                      title='{}'.format(texts[i]), split_title=True,
                                      auto_aspect=True)

    return saved_mels_paths, speaker_ids
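_prepare_inputs and _prepare_targets are not shown here; a minimal sketch of the input-padding helper consistent with how this synthesizer uses it (pad each sequence to the batch max and return that max; the exact original details are an assumption):

import numpy as np

def _prepare_inputs(self, inputs):
    max_len = max(len(x) for x in inputs)
    # Right-pad each sequence with zeros and stack into a (batch, max_len) matrix
    return np.stack([np.pad(x, (0, max_len - len(x)), mode='constant') for x in inputs]), max_len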
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, ppgs, lf0_path, speaker, refer, hparams):
    """
    Preprocesses a single utterance wav/ppgs pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - ppgs: ppgs spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, refer_name, time_steps,
          mel_frames, linear_frames, ppgs, speaker, lf0)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, refer, time_steps, mel_frames,
            ppgs, speaker, lf0_path)
def _process_utterance(wav_dir, mel_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # mu-law encode the waveform to integer classes
    out = encode_mu_law(wav, mu=512)

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames or len(text) > hparams.max_text_length:
        return None

    # Zero pad the quantized signal
    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    r = mel_frames * audio.get_hop_size(hparams) - len(wav)
    out = np.pad(out, (0, r), mode='constant', constant_values=0.)
    assert len(out) == mel_frames * audio.get_hop_size(hparams)
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    filename = '{}.npy'.format(index)
    np.save(os.path.join(wav_dir, filename), out.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(mel_dir, filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (filename, time_steps, mel_frames, text)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
def _process_utterance_clova(audio_dir, label_dir, index, wav_path, text_path, args):
    """
    Preprocesses a single utterance wav/text_jamo pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - audio_dir: the directory to write the mel spectrograms into
        - label_dir: the directory to write the labels into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text_path: path to the csv containing the transcription for the audio
        - args: hyper parameters

    Returns:
        - A tuple: (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=args.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # rescale wav
    if args.rescale:
        wav = wav / np.abs(wav).max() * args.rescaling_max

    # M-AILABS extra silence specific
    if args.trim_silence:
        wav = audio.trim_silence(wav, args)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args))

    # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(args)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(args)]
    assert len(out) % audio.get_hop_size(args) == 0
    time_steps = len(out)

    # Look up the transcription for this wav in the csv
    with open(text_path, 'r', encoding='utf-8', newline='') as f:
        rdr = csv.reader(f)
        for x in rdr:
            if os.path.basename(wav_path) == x[0]:
                line = x[1]

    # ETRI transcription rule, then normalize to a jamo label sequence
    line = sentence_filter(line).upper()
    label_sequence = normalize(line)
    print(label_sequence)

    # Write the spectrogram and label to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.txt'.format(index)
    np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out:
        f_out.write(label_sequence)

    # Return a tuple describing this training example
    return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # [0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
# NOTE: this variant arrived truncated. The def line, the docstring opening
# and the body of the final `except` are reconstructed from context and are
# assumptions; everything after the except clause was not preserved.
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair, additionally tracking how
    many seconds of audio the silence trimming removes.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    eliminated = 0
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    try:
        # rescale wav
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max

        # M-AILABS extra silence specific
        if hparams.trim_silence:
            new_wav = audio.trim_silence(wav, hparams)
            # seconds of audio removed by trimming (note: the preserved
            # fragment only measures the difference; it never assigns
            # new_wav back to wav)
            eliminated += (len(wav) - len(new_wav)) / hparams.sample_rate
    except Exception as e:
        # reconstructed: the original except body was cut off
        print('error while preprocessing {}: {}. skipping!'.format(wav_path, e))
        return None
    # (the remainder of this variant was not preserved)
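# audio.trim_silence, used by every variant above, is typically a thin wrapper
# over librosa's energy-based trimming. A minimal sketch, assuming hparams
# carries trim_top_db / trim_fft_size / trim_hop_size (the parameter names are
# assumptions; they vary between repos):
import librosa

def trim_silence_sketch(wav, hparams):
    """Trim leading and trailing segments quieter than trim_top_db dB."""
    trimmed, _ = librosa.effects.trim(
        wav,
        top_db=hparams.trim_top_db,
        frame_length=hparams.trim_fft_size,
        hop_length=hparams.trim_hop_size)
    return trimmed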
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the pre-emphasized wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the pre-emphasized wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides)

    # Constant pad the audio signal (mirrors librosa's framing to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    npz_filename = '{}.npz'.format(index)
    r = hparams.outputs_per_step
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.

    # +2r frames of head and tail silence padding
    mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)

    # Pad the targets up to a multiple of the reduction factor r
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), 'constant', constant_values=_pad_value)
        mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), 'constant', constant_values=_pad_value)

    # Stop token targets: 0 everywhere except the final frame
    stop_token = np.concatenate(
        [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)], axis=0)

    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': out.astype(out_dtype),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))

    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text
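# dumps_msgpack is not shown in this file. One plausible implementation uses
# msgpack with msgpack_numpy, since the dict above holds numpy arrays that
# plain msgpack cannot encode; the helper name and this choice of libraries
# are assumptions about the actual code:
import msgpack
import msgpack_numpy

def dumps_msgpack_sketch(data, path):
    """Serialize a dict of numpy arrays / scalars / strings to a msgpack file."""
    with open(path, 'wb') as f:
        f.write(msgpack.packb(data, default=msgpack_numpy.encode))

def loads_msgpack_sketch(path):
    """Inverse of dumps_msgpack_sketch, for the data-loading side."""
    with open(path, 'rb') as f:
        return msgpack.unpackb(f.read(), object_hook=msgpack_numpy.decode, raw=False)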
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple
    to write to the train.txt file.

    Args:
        mel_dir: The directory to write the mel spectrograms into
        linear_dir: The directory to write the linear spectrograms into
        wav_dir: The directory to write the preprocessed audio into
        index: The numeric index to use in the spectrogram filenames
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        hparams: Hyper parameters

    Returns:
        A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
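# audio.pad_lr / num_frames (used by the variants above) follow the
# r9y9/wavenet_vocoder convention. A sketch of that pair, under the assumption
# this repo copied it unchanged:
def num_frames_sketch(length, fsize, fshift):
    """Number of spectrogram frames produced for a signal of the given length."""
    pad = fsize - fshift
    if length % fshift == 0:
        return (length + pad * 2 - fsize) // fshift + 1
    return (length + pad * 2 - fsize) // fshift + 2

def pad_lr_sketch(x, fsize, fshift):
    """Left/right padding so the padded audio covers every analysis frame."""
    m = num_frames_sketch(len(x), fsize, fshift)
    pad = fsize - fshift
    t = len(x) + 2 * pad
    r = (m - 1) * fshift + fsize - t
    return pad, pad + r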
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """Variant of _process_utterance that trims the wav with the module-level
    _trim_wav helper and extracts a speaker id from the wav filename via the
    module-level _speaker_re pattern."""
    wav = _trim_wav(audio.load_wav(wav_path, sr=hparams.sample_rate))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Speaker id comes from the filename prefix
    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)

    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, speaker_id, text)
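# _speaker_re and _trim_wav are defined elsewhere in this variant's module.
# Hypothetical definitions for a VCTK-style corpus (filenames such as
# p225_001.wav, where the prefix before the underscore is the speaker id);
# both names and the top_db threshold are purely illustrative assumptions:
import re
import librosa

_speaker_re_sketch = re.compile(r'(p\d+)_')  # 'p225_001' -> speaker 'p225'

def _trim_wav_sketch(wav, top_db=25):
    """Trim leading/trailing silence; top_db=25 is an assumed threshold."""
    trimmed, _ = librosa.effects.trim(wav, top_db=top_db)
    return trimmed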