def embed_utterance(wav, using_partials=True, return_partials=False, model=None, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split in partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance embedding is instead computed by feeding the
    entire spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """
    # Fall back to the preloaded model if none was passed in
    if model is None:
        print("Didn't find model, will use the preloaded one.")
        model = _model

    # Process the entire utterance if not using partials
    if not using_partials:
        frames = audio.wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(frames[None, ...], model)[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    frames = audio.wav_to_mel_spectrogram(wav)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch, model)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed
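# Usage sketch (illustrative, not part of the original source): embedding one utterance with the
# embed_utterance() defined above. It assumes the `audio` module and a preloaded `_model` exist
# as in the surrounding code; the file path is a placeholder.
def _example_embed_single_utterance():
    wav = audio.preprocess_wav("speaker1/utt1.flac")
    # Full-utterance embedding: L2-normalized average over partial utterances
    embed = embed_utterance(wav)
    # Also retrieve the per-partial embeddings and the wav slices they were computed from
    embed, partial_embeds, wave_slices = embed_utterance(wav, return_partials=True)
    return embed, partial_embeds, wave_slices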
def computeEmbedding(wav, **kwargs):
    '''
    Computes the embedding vector for the given wav.

    PARAMS:
        wav: the preprocessed wav for which the embedding vector will be calculated
    RETURNS:
        the embedding of the wav object
    '''
    # If the last slice extends past the length of the wav, zero-pad the wav
    wSlices, mSlices = computeSlices(len(wav), **kwargs)
    lastSliceStop = wSlices[-1].stop
    if lastSliceStop >= len(wav):
        wav = np.pad(wav, (0, lastSliceStop - len(wav)), "constant")

    # Compute the mel spectrogram of the wav
    frames = audio.wav_to_mel_spectrogram(wav)

    # Group the frames of every mel slice into a batch that will be fed to the network
    framesInBatches = np.array([frames[s] for s in mSlices])

    # Each member of partialEmbeddings is the embedding vector of one partial utterance
    partialEmbeddings = computeEmbeddingForBatch(framesInBatches)

    # The embedding of the complete utterance is the L2-normalized average of the partial embeddings
    averageEmbedding = np.mean(partialEmbeddings, axis=0)
    embed = averageEmbedding / np.linalg.norm(averageEmbedding, 2)
    return embed
def preprocess_speaker(speaker_dir: Path):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = {}
    else:
        existing_fnames = {}

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    for in_fpath in speaker_dir.glob("**/*.%s" % extension):
        # Check if the target output file already exists
        out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
        out_fname = out_fname.replace(".%s" % extension, ".npy")
        if skip_existing and out_fname in existing_fnames:
            continue

        # Load and preprocess the waveform
        try:
            wav = audio.preprocess_wav(in_fpath)
        except ValueError as e:
            # When loading VoxCeleb2, this gets raised:
            # ValueError("frames must be specified for non-seekable files")
            print(f"skipping loading of {in_fpath}, because: {str(e)}")
            continue
        if len(wav) == 0:
            continue
        print(f"Processing {in_fpath}...")

        # Create the mel spectrogram, discard those that are too short
        frames = audio.wav_to_mel_spectrogram(wav)
        if len(frames) < partials_n_frames:
            continue

        out_fpath = speaker_out_dir.joinpath(out_fname)
        np.save(out_fpath, frames)
        logger.add_sample(duration=len(wav) / sampling_rate)
        sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    sources_file.close()
def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
    """
    Computes an embedding for a single utterance. The utterance is divided in partial utterances
    and an embedding is computed for each. The complete utterance embedding is the L2-normed
    average embedding of the partial utterances.

    TODO: independent batched version of this function

    :param wav: a preprocessed utterance waveform as a numpy array of float32
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices corresponding to each partial utterance.
    :param rate: how many partial utterances should occur per second. Partial utterances must
    cover the span of the entire utterance, thus the rate should not be lower than the inverse
    of the duration of a partial utterance. By default, partial utterances are 1.6s long and
    the minimum rate is thus 0.625.
    :param min_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_coverage> of <partials_n_frames> are present, then the last
    partial utterance will be considered by zero-padding the audio. Otherwise, it will be
    discarded. If there aren't enough frames for one partial utterance, this parameter is
    ignored so that the function always returns at least one slice.
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned.
    """
    # Compute where to split the utterance into partials and pad the waveform with zeros if
    # the partial utterances cover a larger range.
    wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
    max_wave_length = wav_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials and forward them through the model
    mel = audio.wav_to_mel_spectrogram(wav)
    mels = np.array([mel[s] for s in mel_slices])
    with torch.no_grad():
        mels = torch.from_numpy(mels).to(self.device)
        partial_embeds = self(mels).cpu().numpy()

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wav_slices
    return embed
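# Usage sketch (illustrative, not part of the original source): calling the method above on an
# encoder instance. The name `encoder` stands for any model exposing embed_utterance() and
# compute_partial_slices() as defined above; no particular class name is assumed.
def _example_embed_with_encoder(encoder, wav):
    # `wav` is a preprocessed float32 waveform; the default rate is 1.3 partials per second
    embed, partial_embeds, wav_slices = encoder.embed_utterance(wav, return_partials=True)
    # If the model outputs L2-normalized embeddings, these dot products are cosine similarities
    # between each partial embedding and the utterance embedding
    similarities = partial_embeds @ embed
    return embed, similarities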
def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = {}
    else:
        existing_fnames = {}

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    audio_durs = []
    for extension in _AUDIO_EXTENSIONS:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))
            audio_durs.append(len(wav) / sampling_rate)

    sources_file.close()
    return audio_durs
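# Driver sketch (illustrative, not part of the original source): how _preprocess_speaker() above
# might be mapped over the speaker directories of a dataset. The flat dataset layout and the use
# of a multiprocessing pool are assumptions; only _preprocess_speaker() comes from the code above,
# and Path/numpy/audio are assumed to be imported at module level as elsewhere in this file.
from functools import partial
from multiprocessing.pool import Pool

def _example_preprocess_dataset(dataset_root: Path, out_dir: Path, skip_existing=False, n_processes=4):
    speaker_dirs = [d for d in dataset_root.glob("*") if d.is_dir()]
    work_fn = partial(_preprocess_speaker, datasets_root=dataset_root, out_dir=out_dir,
                      skip_existing=skip_existing)
    with Pool(n_processes) as pool:
        # Each worker returns the durations (in seconds) of the utterances it kept
        all_durs = pool.map(work_fn, speaker_dirs)
    total_hours = sum(sum(durs) for durs in all_durs) / 3600
    print("Preprocessed %.1f hours of audio" % total_hours)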
def preprocess_speaker(args):
    speaker, files = args

    # Give a name to the speaker that includes its dataset.
    # Note: speaker_dir, datasets_root, out_dir, skip_existing and logger are expected to be
    # module-level variables set by the caller.
    parts = list(speaker_dir.relative_to(datasets_root).parts)
    parts.append(speaker)
    speaker_name = "_".join(parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # There's a possibility that the preprocessing was interrupted earlier, check if
    # there already is a sources file.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = {}
    else:
        existing_fnames = {}

    # Iterate over the audio files gathered for that speaker
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    for in_fpath, out_fname in files:
        # Check if the target output file already exists
        if skip_existing and out_fname in existing_fnames:
            continue

        # Load and preprocess the waveform
        try:
            wav = audio.preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue
        except Exception:
            print('wave preprocess error')
            continue

        # Create the mel spectrogram, discard those that are too short
        frames = audio.wav_to_mel_spectrogram(wav)
        if len(frames) < partials_n_frames:
            continue

        out_fpath = speaker_out_dir.joinpath(out_fname)
        np.save(out_fpath, frames)
        logger.add_sample(duration=len(wav) / sampling_rate)
        sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    sources_file.close()
def preprocess(in_fpath, out_fpath, parent_path):
    source_text = parent_path / "_sources.txt"
    sources_file = source_text.open("w")

    # Load and preprocess the waveform, warn if it is empty
    wav = audio.preprocess_wav(in_fpath)
    if len(wav) == 0:
        print("empty audio file")

    # Create the mel spectrogram, warn if the utterance is too short
    frames = audio.wav_to_mel_spectrogram(wav)
    if len(frames) < partials_n_frames:
        print("{} < {}, number of frames is less than partials_n_frames".format(len(frames), partials_n_frames))

    np.save(out_fpath, frames)
    sources_file.write("%s,%s\n" % (out_fpath.name + '.npy', in_fpath.name))
    sources_file.close()
    return frames
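# Usage sketch (illustrative, not part of the original source): preprocessing a single file with
# preprocess() above. The paths are placeholders; the parent directory of the output file is used
# to hold the _sources.txt reference file, matching the signature above.
from pathlib import Path

def _example_preprocess_single_file():
    in_fpath = Path("raw/utt1.wav")
    out_fpath = Path("processed/utt1")
    out_fpath.parent.mkdir(parents=True, exist_ok=True)
    # np.save inside preprocess() appends the .npy suffix to out_fpath
    frames = preprocess(in_fpath, out_fpath, out_fpath.parent)
    return frames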
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split in partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance embedding is instead computed by feeding the
    entire spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to embed_frames_batch()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """
    # Process the entire utterance if not using partials
    if not using_partials:
        # Extract mel frames for the standard encoder, or raw audio patches for the fCNN
        if mel_n_channels == 40:
            frames = audio.wav_to_mel_spectrogram(wav)
        else:
            win = np.hamming(int(sampling_rate * 0.02))
            inc = int(win.shape[0] / 2)
            frames = get_frame_from_file(wav, win=win, inc=inc, sr=sampling_rate, n_channels=1, duration=None)
            frames = np.transpose(frames)
        embed = embed_frames_batch(frames[None, ...], **kwargs)[0]
        if return_partials:
            return embed, None, None
        return embed

    # Skip audio that is shorter than one partial utterance
    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((len(wav) + 1) / samples_per_frame))
    if n_frames < partials_n_frames:
        print('Audio too short! Skipping...')
        embed = None
        partial_embeds = None
        wave_slices = None
        if return_partials:
            return embed, partial_embeds, wave_slices
        return embed

    # Compute where to split the utterance into partials and pad if necessary.
    # min_pad_coverage controls whether the last, incomplete slice is kept; the default of 0.75
    # is passed explicitly here.
    wave_slices, mel_slices = compute_partial_slices(len(wav), min_pad_coverage=0.75)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    if mel_n_channels == 40:
        frames = audio.wav_to_mel_spectrogram(wav)
    else:
        win = np.hamming(int(sampling_rate * 0.02))
        inc = int(win.shape[0] / 2)
        frames = get_frame_from_file(wav, win=win, inc=inc, sr=sampling_rate, n_channels=1, duration=None)
        frames = np.transpose(frames)
    if frames.shape[0] < mel_slices[-1].stop:
        # Ensure that the number of frames matches what the mel slices expect by repeating the
        # last frames
        pad_len = mel_slices[-1].stop - frames.shape[0]
        frames = np.concatenate((frames, frames[-pad_len:]), axis=0)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch, **kwargs)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed