def _get_next_example(self):
    """Gets a single example (input, mel_target, token_target, embed_target, mel_length) from disk"""
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)

    # mel_target = np.load(meta[1])
    # audio-P00001A-001-20170001P00001A0001_00.npy
    # E:/data/stcmds/SV2TTS/synthesizer/audio/audio-P00001A-001-20170001P00001A0001_00.npy
    # wav_fpath = Path(r'E:\data\stcmds\stcmds\wavs')
    parts = Path(meta[0]).parts
    name_parts = parts[-1].split('-')
    name_parts[-1] = name_parts[-1].split('_')[0] + '.wav'
    wav_fpath = Path(*parts[:3], parts[2], 'wavs', *name_parts[1:])
    wav = load_wav(wav_fpath, hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    mel_target = melspectrogram(wav, hparams).T

    # Create parallel sequences containing zeros to represent a non-finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    embed_target = np.load(meta[2])
    return input_data, mel_target, token_target, embed_target, len(mel_target)
def _get_test_groups(self):
    meta = self._test_meta[self._test_offset]
    self._test_offset += 1

    text = meta[5]
    input_data = np.asarray(text_to_sequence(text, self._cleaner_names), dtype=np.int32)

    # mel_target = np.load(meta[1])  # os.path.join(self._mel_dir, meta[1])
    parts = Path(meta[0]).parts
    name_parts = parts[-1].split('-')
    name_parts[-1] = name_parts[-1].split('_')[0] + '.wav'
    wav_fpath = Path(*parts[:3], parts[2], 'wavs', *name_parts[1:])
    wav = load_wav(wav_fpath, hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    mel_target = melspectrogram(wav, hparams).T

    # Create parallel sequences containing zeros to represent a non-finished sequence
    token_target = np.asarray([0.] * (len(mel_target) - 1))
    embed_target = np.load(meta[2])  # os.path.join(self._embed_dir, meta[2])
    return input_data, mel_target, token_target, embed_target, len(mel_target)
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
    if isinstance(fpath_or_wav, (str, Path)):
        wav = load_preprocess_wav(fpath_or_wav)
    else:
        wav = fpath_or_wav

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    return mel_spectrogram
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    ppg_fpath = out_dir.joinpath("ppgs", "ppg-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the PPG (phonetic posteriorgram)
    wav_ppg = (wav * 32767).astype(np.int16)
    if hparams.use_full_ppg:
        ppg = audio.get_ppg(wav_ppg, hparams.sample_rate,
                            hparams.hop_size / hparams.sample_rate * 1000)
    else:
        ppg = audio.get_monophone_ppg(wav_ppg, hparams.sample_rate,
                                      hparams.hop_size / hparams.sample_rate * 1000)
    ppg_frames = ppg.shape[0]

    # Sometimes the PPG can be one frame longer than the mel, so trim both to the same length
    min_frames = min(mel_frames, ppg_frames)
    mel_spectrogram = mel_spectrogram[:, :min_frames]
    ppg = ppg[:min_frames, :]

    # Write the spectrogram, ppg and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)
    np.save(ppg_fpath, ppg, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_fpath.name, mel_fpath.name, ppg_fpath.name,
            "embed-%s.npy" % basename, len(wav), min_frames, text)
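# --- Illustrative sketch (not part of the original code) ---
# The PPG extractors above take the frame shift in milliseconds, derived from the mel hop as
# hparams.hop_size / hparams.sample_rate * 1000, so the PPG frames line up with the mel frames.
# The concrete numbers below are assumed typical values (hop_size=200 samples at 16000 Hz), not
# taken from this codebase's hparams:
frame_shift_ms = 200 / 16000 * 1000   # = 12.5 ms per PPG/mel frame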
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
    """
    Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
    were fed to the synthesizer when training.
    """
    if isinstance(fpath_or_wav, (str, Path)):
        wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
    else:
        wav = fpath_or_wav

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    return mel_spectrogram
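# --- Illustrative usage sketch (not part of the original code) ---
# make_spectrogram accepts either a path or an already-loaded waveform; "sample.wav" and the
# 16 kHz rate below are placeholder assumptions.
import numpy as np

mel = make_spectrogram("sample.wav")
print(mel.shape)                             # (num_mels, n_frames), the layout used at training time

silence = np.zeros(16000, dtype=np.float32)  # one second of silence at an assumed 16 kHz
mel2 = make_spectrogram(silence)             # ndarray input skips the load/preprocess step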
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Rescale
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Denoise with LogMMSE
    # from utils import logmmse
    # wav = logmmse.denoise(wav, profile, eta=0)

    # VAD process
    from encoder.audio import trim_long_silences, normalize_volume
    wav = normalize_volume(wav, -30, increase_only=True)
    wav = trim_long_silences(wav)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        # print("too short!")
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
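# --- Illustrative sketch (not part of the original code) ---
# The FOR REFERENCE note above says librosa pads the waveform internally while the wav is saved
# unpadded, so wav length and mel frame count are only approximately related (with centered STFT
# frames, roughly len(wav) // hop_size + 1 frames). A quick check, assuming the same `audio`
# module and `hparams` used above:
check_wav = np.zeros(3 * hparams.sample_rate, dtype=np.float32)   # 3 s of silence, illustrative
check_mel = audio.melspectrogram(check_wav, hparams)
print(check_mel.shape[1], len(check_wav) // hparams.hop_size)      # frame count is not an exact multiple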
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.
    """
    wav: one wav file of one complete sentence, produced by the split_on_silences function from the alignment file
    text: text of that wav file
    out_dir: where to save the spectrogram and audio in npy format
    basename: name of the file

    skip if an existing file is found   -> required
    skip if the sentence is too short   -> not required
    compute the spectrogram             -> required
    skip if the sentence is too long    -> not required
    save the spectrogram and audio      -> required
    """
    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)  # NEED THIS
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long  # DON'T NEED THIS
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)  # NEED THIS
    np.save(wav_fpath, wav, allow_pickle=False)  # NEED THIS

    # Return a tuple describing this training example
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text
def process_audio_file(vfile, args, gpu_id):
    fulldir = vfile.replace('intervals', 'preprocessed')
    # On Windows the path separator needs fixing here; under Windows it is '\\'.
    fulldir = fulldir[:fulldir.rfind('.')]  # ignore extension

    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    specpath = path.join(fulldir, 'mels.npz')

    wav = audio.load_wav(wavpath, hp.sample_rate)
    spec = audio.melspectrogram(wav, hp)
    lspec = audio.linearspectrogram(wav, hp)
    np.savez_compressed(specpath, spec=spec, lspec=lspec)
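# --- Illustrative sketch (not part of the original code) ---
# The mel and linear spectrograms are stored together in one compressed .npz archive; a data
# loader can read them back by key (the path below is a placeholder):
data = np.load('preprocessed/some_interval/mels.npz')
spec, lspec = data['spec'], data['lspec']   # keys match the savez_compressed call above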
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams, random_uttBasename_forSpkEmbedding=None):
    '''
    random_uttBasename_forSpkEmbedding: if not None, use that utterance to generate the speaker
    embedding during synthesizer training.
    '''
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
    # - Both the audios and the mel spectrograms are saved as numpy arrays
    # - There is no processing done to the audios that will be saved to disk beyond volume
    #   normalization (in split_on_silences)
    # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This
    #   is why we re-apply it on the audio on the side of the vocoder.
    # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved
    #   without extra padding. This means that you won't have an exact relation between the length
    #   of the wav and of the mel spectrogram. See the vocoder data loader.

    # Skip existing utterances if needed
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Write the spectrogram, embed and audio to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Return a tuple describing this training example
    embed_basename = basename
    if random_uttBasename_forSpkEmbedding is not None:
        embed_basename = random_uttBasename_forSpkEmbedding
    return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % embed_basename, len(wav), mel_frames, text
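# --- Illustrative usage sketch (not part of the original code) ---
# Passing random_uttBasename_forSpkEmbedding makes the returned metadata point at another
# utterance's embedding file, so all utterances of a speaker can share one embedding during
# synthesizer training. The basenames and the surrounding wav/text/out_dir/hparams variables are
# hypothetical:
entry = process_utterance(wav, text, out_dir, "p225_002", skip_existing=False, hparams=hparams,
                          random_uttBasename_forSpkEmbedding="p225_001")
# If the utterance passes the length checks, entry[2] == "embed-p225_001.npy"
# rather than "embed-p225_002.npy".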
def process_video_file(vfile, args, gpu_id):
    video_stream = cv2.VideoCapture(vfile)

    frames = []
    while True:
        still_reading, frame = video_stream.read()
        if not still_reading:
            video_stream.release()
            break
        frames.append(frame)

    fulldir = vfile.replace('/intervals/', '/preprocessed/')
    fulldir = fulldir[:fulldir.rfind('.')]  # ignore extension
    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    specpath = path.join(fulldir, 'mels.npz')

    # Extract the audio track (the ffmpeg command string `template` is assumed to be defined at module level)
    command = template.format(vfile, hp.sample_rate, wavpath)
    subprocess.call(command, shell=True)

    wav = audio.load_wav(wavpath, hp.sample_rate)
    spec = audio.melspectrogram(wav, hp)
    lspec = audio.linearspectrogram(wav, hp)
    np.savez_compressed(specpath, spec=spec, lspec=lspec)

    batches = [frames[i:i + args.batch_size] for i in range(0, len(frames), args.batch_size)]

    i = -1
    for fb in batches:
        preds = fa[gpu_id].get_detections_for_batch(np.asarray(fb))

        for j, f in enumerate(preds):
            i += 1
            if f is None:
                continue

            cv2.imwrite(path.join(fulldir, '{}.jpg'.format(i)), f[0])
def wav2mel(wav):
    return melspectrogram(wav, hparams)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyperparameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames,
          linear_frames, text)
    """
    try:
        # Load the audio as a numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Pre-emphasize
    wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav))

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of the hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    embed_filename = 'embed-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, embed_filename, time_steps, mel_frames, text)
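# --- Illustrative sketch (not part of the original code) ---
# The padding and trimming above enforce time_steps == mel_frames * hop_size, which is what lets a
# vocoder upsample the mel spectrogram with transposed convolutions. A consistency check over the
# files written by _process_utterance (index 0 is a placeholder; wav_dir, mel_dir and hparams are
# assumed to be the same values passed to that function):
out_check = np.load(os.path.join(wav_dir, 'audio-0.npy'))
mel_check = np.load(os.path.join(mel_dir, 'mel-0.npy'))   # saved transposed: (mel_frames, num_mels)
assert len(out_check) == mel_check.shape[0] * audio.get_hop_size(hparams)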
e1 = encoder.embed_utterance(source_wav)
e1 = e1[np.newaxis, :, np.newaxis]
e1 = torch.tensor(e1)

for t in target:
    target_wav_name = os.listdir(en_path + t + "/wavs")
    embedding_tr = 0
    # Average the embeddings of 10 target utterances to get a speaker-level embedding
    for i in range(10):
        target_name = target_wav_name[i]
        target_wav_fpath = en_path + t + "/wavs" + "/" + target_name
        target_wav = encoder.preprocess_wav(target_wav_fpath)
        e2 = encoder.embed_utterance(target_wav)
        embedding_tr = embedding_tr + e2
    embedding_tr /= 10
    print(embedding_tr.shape)

    mel = audio.melspectrogram(source_wav, hparams)
    mel = pad_seq(mel.T).T
    mel = torch.from_numpy(mel[None, ...])
    embedding_tr = embedding_tr[np.newaxis, :, np.newaxis]
    embedding_tr = torch.tensor(embedding_tr)
    mel, e1, embedding_tr = mel.cuda(), e1.cuda(), embedding_tr.cuda()
    # print("mel shape:", mel.shape)
    # print("e1 shape:", e1.shape)
    # print("e2 shape:", e2.shape)

    C, X_C, X_before, X_after, _ = model(mel, e1, embedding_tr)
    mel_out = torch.tensor(X_after).clone().detach().cpu().numpy()
    # print("mel_out shape:", mel_out.shape)
    if use_wavrnn:
        wav = vocoder_wavrnn.infer_waveform(mel_out[0, 0, :, :].T)