def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
    """Build the model input dictionary for a single utterance.

    The kind of feature produced is chosen by
    ``task.data_cfg.hub.get("input_type", "fbank80")``. Only
    ``"fbank80_w_utt_cmvn"``, ``"waveform"`` and ``"standardized_waveform"``
    are handled here; any other value (including the ``"fbank80"`` default)
    is rejected.

    Args:
        task: Object exposing ``data_cfg.hub`` with a dict-like ``get``.
        audio: Either a ``str`` that is handed to ``get_fbank`` / ``get_wav``
            (presumably an audio file path -- confirm against those helpers)
            or a ``torch.Tensor`` holding the audio samples.

    Returns:
        A dict with a ``"net_input"`` entry (``src_tokens``, ``src_lengths``
        and ``prev_output_tokens=None``) plus ``"target_lengths"`` and
        ``"speaker"``, both ``None``.

    Raises:
        ValueError: If the configured ``input_type`` is not supported.
    """
    input_type = task.data_cfg.hub.get("input_type", "fbank80")
    if input_type == "fbank80_w_utt_cmvn":
        if isinstance(audio, str):
            feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))  # T x D
        else:
            # NOTE(review): unlike the str path, no utterance-level CMVN is
            # applied here -- confirm this asymmetry is intended.
            feat = kaldi.fbank(audio, num_mel_bins=80).numpy()
        # The code below reads the time dimension from axis 1, so make sure
        # a leading batch axis exists. ``feat[None]`` works for both NumPy
        # arrays and tensors; the previous ``feat.unsqueeze(0)`` could not
        # coexist with the later ``torch.from_numpy(feat)`` for either type.
        if feat.ndim == 2:
            feat = feat[None]  # T x D -> 1 x T x D
    elif input_type in {"waveform", "standardized_waveform"}:
        if isinstance(audio, str):
            feat, sr = get_wav(audio)  # C x T
            feat, _ = convert_wav(
                feat, sr, to_sample_rate=16_000, to_mono=True
            )  # C x T -> 1 x T
        else:
            # assumes audio already has a leading batch/channel axis
            # (1 x T) -- TODO confirm with callers
            feat = audio.numpy()
    else:
        raise ValueError(f"Unknown value: input_type = {input_type}")

    # Build the length directly as an integer tensor instead of going
    # through a float32 tensor and casting.
    src_lengths = torch.tensor([feat.shape[1]], dtype=torch.long)
    # as_tensor accepts both NumPy arrays and tensors without copying.
    src_tokens = torch.as_tensor(feat)  # 1 x T (x D)
    if input_type == "standardized_waveform":
        with torch.no_grad():
            # Normalize over the entire tensor (normalized_shape == shape).
            src_tokens = F.layer_norm(src_tokens, src_tokens.shape)

    return {
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
            "prev_output_tokens": None,
        },
        "target_lengths": None,
        "speaker": None,
    }
def get_features_or_waveform_from_uncompressed_zip(
    path, byte_offset, byte_size, need_waveform=False
):
    """Load one entry of an uncompressed ZIP as features or a waveform.

    The bytes are fetched with ``read_from_uncompressed_zip(path,
    byte_offset, byte_size)`` and dispatched on their detected format:

    * NumPy data is loaded with ``np.load``.
    * FLAC/WAV data yields the first element of ``get_waveform(...)`` when
      ``need_waveform`` is true, otherwise the result of ``get_fbank(...)``.

    Raises:
        AssertionError: If ``path`` does not end with ``".zip"``.
        ValueError: If the data is neither NumPy nor FLAC/WAV.
    """
    assert path.endswith(".zip")
    raw = read_from_uncompressed_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(raw)

    if is_npy_data(raw):
        return np.load(buffer)

    if not is_flac_or_wav_data(raw):
        raise ValueError(f'Unknown file format for "{path}"')

    if need_waveform:
        return get_waveform(buffer)[0]
    return get_fbank(buffer)
def get_features_or_waveform_from_stored_zip(
    path,
    byte_offset,
    byte_size,
    need_waveform=False,
    use_sample_rate=None,
):
    """Load one entry of a stored ZIP as features or a waveform.

    The bytes are fetched with ``read_from_stored_zip(path, byte_offset,
    byte_size)`` and dispatched on their detected format:

    * NumPy data is loaded with ``np.load``.
    * Audio data recognised by ``is_sf_audio_data`` yields the first element
      of ``get_waveform(..., always_2d=False,
      output_sample_rate=use_sample_rate)`` when ``need_waveform`` is true,
      otherwise the result of ``get_fbank(...)``. ``use_sample_rate`` is
      only consulted on the waveform path.

    Raises:
        AssertionError: If ``path`` does not end with ``".zip"``.
        ValueError: If the data is in neither recognised format.
    """
    assert path.endswith(".zip")
    raw = read_from_stored_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(raw)

    if is_npy_data(raw):
        return np.load(buffer)

    if not is_sf_audio_data(raw):
        raise ValueError(f'Unknown file format for "{path}"')

    if not need_waveform:
        return get_fbank(buffer)

    waveform_and_rate = get_waveform(
        buffer, always_2d=False, output_sample_rate=use_sample_rate
    )
    return waveform_and_rate[0]
def get_features_from_npy_or_audio(path):
    """Return features for ``path``, chosen by its file extension.

    ``.npy`` files are read with ``np.load``; ``.flac`` and ``.wav`` files
    are passed to ``get_fbank``. The extension comparison is exact
    (case-sensitive).

    Raises:
        ValueError: If the extension is not ``.npy``, ``.flac`` or ``.wav``.
    """
    _, suffix = op.splitext(op.basename(path))
    if suffix not in {".npy", ".flac", ".wav"}:
        raise ValueError(f'Unsupported file format for "{path}"')

    if suffix == ".npy":
        return np.load(path)
    return get_fbank(path)
def get_features_from_npy_or_audio(path):
    """Return features for ``path``, chosen by its file suffix.

    The suffix (as reported by ``Path(path).suffix``) must be a member of
    ``FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS``. ``.npy`` files are read with
    ``np.load``; every other accepted suffix is passed to ``get_fbank``.

    Raises:
        ValueError: If the suffix is not an accepted extension.
    """
    suffix = Path(path).suffix
    if suffix in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
        loader = np.load if suffix == ".npy" else get_fbank
        return loader(path)
    raise ValueError(f'Unsupported file format for "{path}"')
def get_features_from_npy_or_audio(path):
    """Return features for ``path``, chosen by its file extension.

    The extension of the path's base name must be a member of
    ``FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS``. ``.npy`` files are read with
    ``np.load``; every other accepted extension is passed to ``get_fbank``.

    Raises:
        ValueError: If the extension is not an accepted one.
    """
    filename = op.basename(path)
    extension = op.splitext(filename)[1]

    is_supported = extension in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS
    if not is_supported:
        raise ValueError(f'Unsupported file format for "{path}"')

    if extension == ".npy":
        return np.load(path)
    return get_fbank(path)