def embed_utterance(self, wav: np.ndarray, return_partials=False, rate=1.3, min_coverage=0.75):
    """
    Computes an embedding for a single utterance. The utterance is divided into partial
    utterances and an embedding is computed for each. The complete utterance embedding is the
    L2-normed average embedding of the partial utterances.

    TODO: independent batched version of this function

    :param wav: a preprocessed utterance waveform as a numpy array of float32
    :param return_partials: if True, the partial embeddings will also be returned along with
    the wav slices corresponding to each partial utterance.
    :param rate: how many partial utterances should occur per second. Partial utterances must
    cover the span of the entire utterance, thus the rate should not be lower than the inverse
    of the duration of a partial utterance. By default, partial utterances are 1.6s long and
    the minimum rate is thus 0.625.
    :param min_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_coverage> of <partials_n_frames> are present, then the
    last partial utterance will be considered by zero-padding the audio. Otherwise, it will be
    discarded. If there aren't enough frames for one partial utterance, this parameter is
    ignored so that the function always returns at least one slice.
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned.
    """
    # Compute where to split the utterance into partials and pad the waveform with zeros if
    # the partial utterances cover a larger range.
    wav_slices, mel_slices = self.compute_partial_slices(len(wav), rate, min_coverage)
    max_wave_length = wav_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials and forward them through the model
    mel = audio.wav_to_mel_spectrogram(wav)
    mels = np.array([mel[s] for s in mel_slices])
    with torch.no_grad():
        mels = torch.from_numpy(mels).to(self.device)
        partial_embeds = self(mels).cpu().numpy()

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wav_slices
    return embed
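# A minimal usage sketch for embed_utterance. `VoiceEncoder` is a placeholder name for the
# class that owns this method; its constructor is an assumption, not the project's API.
# `audio.preprocess_wav` is called with a file path, as in preprocess_speaker below.
#
# encoder = VoiceEncoder()
# wav = audio.preprocess_wav("path/to/utterance.wav")
# embed = encoder.embed_utterance(wav)  # shape (model_embedding_size,), L2-normed
# embed, partial_embeds, wav_slices = encoder.embed_utterance(wav, return_partials=True)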
def d_wav2spec(wav):
    # Compute the wav/mel slices and zero-pad the waveform so the last partial is covered
    wav_slices, mel_slices = compute_partial_slices(len(wav))
    max_wave_length = wav_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Compute the mel spectrogram of the full (padded) utterance and return it along with the
    # slices delimiting each partial utterance
    mel = audio.wav_to_mel_spectrogram(wav)
    return mel, mel_slices
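# Usage sketch for d_wav2spec: the returned mel and slices can be stacked into a batch of
# fixed-length partial windows, mirroring what embed_utterance does internally. This is only
# an illustration of intended use, not code from the project.
#
# mel, mel_slices = d_wav2spec(wav)
# partial_mels = np.array([mel[s] for s in mel_slices])  # (n_partials, partials_n_frames, n_mels)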
def preprocess_speaker(speaker_dir: Path):
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    # Create an output directory with that name, as well as a txt file containing a
    # reference to each source file.
    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # The preprocessing may have been interrupted earlier; check whether a sources file
    # already exists and recover the filenames that were already processed.
    if sources_fpath.exists():
        try:
            with sources_fpath.open("r") as sources_file:
                existing_fnames = {line.split(",")[0] for line in sources_file}
        except Exception:
            existing_fnames = {}
    else:
        existing_fnames = {}

    # Gather all audio files for that speaker recursively
    sources_file = sources_fpath.open("a" if skip_existing else "w")
    for in_fpath in speaker_dir.glob("**/*.%s" % extension):
        # Check if the target output file already exists
        out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
        out_fname = out_fname.replace(".%s" % extension, ".npy")
        if skip_existing and out_fname in existing_fnames:
            continue

        # Load and preprocess the waveform
        wav = audio.preprocess_wav(in_fpath)
        if len(wav) == 0:
            continue

        # Create the mel spectrogram, discard utterances that are too short
        frames = audio.wav_to_mel_spectrogram(wav)
        if len(frames) < partials_n_frames:
            print(f"{in_fpath} skipped, mel_len: {len(frames)}")
            continue

        out_fpath = speaker_out_dir.joinpath(out_fname)
        np.save(out_fpath, frames)
        logger.add_sample(duration=len(wav) / sampling_rate)
        sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    sources_file.close()
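# Hedged driver sketch: preprocess_speaker relies on enclosing-scope state (datasets_root,
# out_dir, skip_existing, extension, logger, sampling_rate). The snippet below shows one way
# it might be applied to every speaker directory; the thread pool, pool size, and the flat
# <dataset>/<speaker> layout are assumptions for illustration, not the project's own driver.

from multiprocessing.pool import ThreadPool

speaker_dirs = [d for d in datasets_root.glob("*/*") if d.is_dir()]
with ThreadPool(4) as pool:
    pool.map(preprocess_speaker, speaker_dirs)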