def get_features_or_waveform(path: str, need_waveform=False):
    """Get speech features from .npy file or waveform from .wav/.flac file.
    The file may be inside an uncompressed ZIP file and is accessed via byte
    offset and length.

    Args:
        path (str): File path in the format of "<.npy/.wav/.flac path>" or
        "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return waveform instead of features.

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.

    Raises:
        FileNotFoundError: if the file part of ``path`` does not exist.
        ValueError: if ``path`` does not split on ":" into exactly one or
            three fields, or if the offset/length fields are not integers.
    """
    _path, *extra = path.split(":")
    if not op.exists(_path):
        raise FileNotFoundError(f"File not found: {_path}")

    if len(extra) == 0:
        if need_waveform:
            # get_waveform returns a (waveform, sample_rate) pair; keep only
            # the array so this branch agrees with the ZIP branch and with
            # the documented return type.
            return get_waveform(_path)[0]
        return get_features_from_npy_or_audio(_path)
    if len(extra) != 2:
        raise ValueError(f"Invalid path: {path}")

    # int() raises ValueError on a non-numeric offset or length.
    byte_offset, byte_size = (int(i) for i in extra)
    return get_features_or_waveform_from_uncompressed_zip(
        _path, byte_offset, byte_size, need_waveform=need_waveform)
Beispiel #2
0
def compute_num_frames_from_feat_or_waveform(rxfile: str) -> int:
    """Return the number of frames described by an rxfile specifier.

    Three forms of ``rxfile`` are recognised, checked in this order after
    surrounding whitespace is ignored:

    * ends with ``.ark:<digits>`` -- a feature matrix read via ``kaldi_io``;
      the frame count is the matrix's first dimension.
    * ends with ``|`` -- a shell command whose stdout is decoded as audio.
    * anything else -- a path to an audio file inspected with ``soundfile``.

    For the two audio forms the frame count is derived from the number of
    samples with ``frame_length=25.0`` and ``frame_shift=10.0`` (presumably
    milliseconds -- confirm against ``num_samples_to_num_frames``).

    Args:
        rxfile (str): feature/audio specifier in one of the forms above.

    Returns:
        int: number of frames.

    Raises:
        ImportError: if the optional backend needed for the detected form
            (``kaldi_io`` or ``soundfile``) is not available.
        Exception: if the feature matrix cannot be read.
    """
    stripped = rxfile.strip()
    if re.search(r"\.ark:\d+$", stripped) is not None:  # from feats.scp
        if not has_kaldi_io:
            raise ImportError(
                "Please install kaldi_io with: pip install kaldi_io")
        try:
            feat = kaldi_io.read_mat(rxfile)
        except Exception as err:
            # Chain explicitly so the underlying read error stays visible.
            raise Exception(
                "failed to read feature matrix {}.".format(rxfile)) from err
        assert feat is not None and isinstance(feat, np.ndarray)
        num_frames = feat.shape[0]
    elif re.search(r"\|$", stripped) is not None:  # from a command
        # Slice the *stripped* string: the pipe was detected after stripping,
        # so slicing the raw string would drop trailing whitespace instead of
        # the "|" and leave the pipe in the command.
        # NOTE(review): the command runs through the shell; only use with
        # trusted rxfile entries.
        source = BytesIO(run(stripped[:-1], shell=True, stdout=PIPE).stdout)
        waveform, sample_rate = get_waveform(source, always_2d=True)
        num_frames = num_samples_to_num_frames(waveform.shape[1],
                                               sample_rate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    else:  # from a raw waveform file
        if not has_soundfile:
            raise ImportError(
                "Please install soundfile with: pip install soundfile")
        info = soundfile.info(rxfile)
        num_frames = num_samples_to_num_frames(info.frames,
                                               info.samplerate,
                                               frame_length=25.0,
                                               frame_shift=10.0)
    return num_frames
def get_features_or_waveform(path: str,
                             need_waveform=False,
                             use_sample_rate=None):
    """Load speech features or a waveform from a file or a ZIP member.

    ``path`` is parsed with ``parse_path`` into a file path plus a list of
    slice fields. With no slice fields the file is read directly; with two
    fields they are used as byte offset and byte length inside a ZIP file.

    Args:
        path (str): File path in the format of "<.npy/.wav/.flac path>" or
        "<zip path>:<byte offset>:<byte length>".
        need_waveform (bool): return waveform instead of features.
        use_sample_rate (int): change sample rate for the input wave file

    Returns:
        features_or_waveform (numpy.ndarray): speech features or waveform.

    Raises:
        ValueError: if the parsed path has neither zero nor two slice fields.
    """
    file_path, byte_range = parse_path(path)
    n_fields = len(byte_range)

    # ZIP member addressed by (offset, length).
    if n_fields == 2:
        offset, size = byte_range[0], byte_range[1]
        return get_features_or_waveform_from_stored_zip(
            file_path,
            offset,
            size,
            need_waveform=need_waveform,
            use_sample_rate=use_sample_rate,
        )

    if n_fields != 0:
        raise ValueError(f"Invalid path: {path}")

    # Plain file on disk.
    if not need_waveform:
        return get_features_from_npy_or_audio(file_path)
    loaded = get_waveform(file_path,
                          always_2d=False,
                          output_sample_rate=use_sample_rate)
    # get_waveform yields a pair; only its first element is returned.
    return loaded[0]
Beispiel #4
0
 def __getitem__(
     self, n: int
 ) -> Tuple[torch.Tensor, int, str, str, str, str]:
     """Return the n-th example with its audio loaded as a tensor.

     The record stored in ``self.data[n]`` supplies the audio path, the
     start offset and the number of frames to read, plus the metadata that
     is returned unchanged.

     Returns:
         tuple: (waveform tensor, sr, src_utt, tgt_utt, spk_id, utt_id).
     """
     (audio_path, start_frame, num_frames, sample_rate,
      source_text, target_text, speaker, utterance) = self.data[n]
     # The sample rate reported by get_waveform is discarded; the stored
     # one from the record is returned instead.
     samples, _reported_rate = get_waveform(
         audio_path, frames=num_frames, start=start_frame
     )
     return (
         torch.from_numpy(samples),
         sample_rate,
         source_text,
         target_text,
         speaker,
         utterance,
     )
def get_features_or_waveform_from_uncompressed_zip(
    path, byte_offset, byte_size, need_waveform=False
):
    """Read one member of an uncompressed ZIP file and decode it.

    Args:
        path: path to the ZIP archive; must end with ".zip".
        byte_offset: byte offset passed to ``read_from_uncompressed_zip``.
        byte_size: byte length passed to ``read_from_uncompressed_zip``.
        need_waveform (bool): for audio data, return the waveform instead of
            filterbank features.

    Returns:
        The array loaded with ``np.load`` for .npy data; for FLAC/WAV data
        either the first element of ``get_waveform``'s result or the output
        of ``get_fbank``.

    Raises:
        ValueError: if the bytes are neither .npy nor FLAC/WAV data.
    """
    assert path.endswith(".zip")
    raw = read_from_uncompressed_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(raw)

    if is_npy_data(raw):
        return np.load(buffer)
    if not is_flac_or_wav_data(raw):
        raise ValueError(f'Unknown file format for "{path}"')

    if need_waveform:
        return get_waveform(buffer)[0]
    return get_fbank(buffer)
Beispiel #6
0
    def read_audio(self, path, ref_len=None):
        """Read a waveform stored inside an uncompressed ZIP file.

        Args:
            path: string of the form "<zip path>:<byte offset>:<byte length>".
            ref_len: optional expected number of samples; a warning is
                logged when the loaded length differs from it by more
                than 160.

        Returns:
            The 1-D waveform. A 2-D result from ``get_waveform`` is reduced
            by averaging over its last axis.
        """
        path, *byte_range = path.split(":")
        assert len(byte_range) == 2
        assert path.endswith(".zip")

        offset = int(byte_range[0])
        length = int(byte_range[1])
        buffer = io.BytesIO(read_from_uncompressed_zip(path, offset, length))

        wav, sr = get_waveform(buffer)
        # The audio must already be at the sample rate the task is configured for.
        assert sr == self.task.cfg.sample_rate, sr

        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim

        if ref_len is not None:
            if abs(ref_len - len(wav)) > 160:
                logging.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav
def process(
    line: str, n_bins: int = 80, feature_type: str = "fbank"
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Compute per-dimension feature statistics for one input line.

    The line is split on its first run of whitespace; the first field is
    ignored and the second is the audio source. A source ending in "|" is
    run as a shell command whose stdout is decoded as audio; anything else
    is passed to ``get_waveform`` as-is.

    Args:
        line: "<key> <audio path or command ending in |>".
        n_bins: forwarded to ``get_torchaudio_fbank_or_mfcc``.
        feature_type: forwarded to ``get_torchaudio_fbank_or_mfcc``.

    Returns:
        (sum over frames, variance over frames times the frame count,
        frame count), where the frame count is the feature matrix's first
        dimension.
    """
    _key, spec = line.rstrip().split(None, 1)

    is_command = re.search(r"\|$", spec) is not None
    # NOTE(review): the command runs through the shell; only feed trusted input.
    source = (
        BytesIO(run(spec[:-1], shell=True, stdout=PIPE).stdout)
        if is_command
        else spec
    )

    waveform, sample_rate = get_waveform(
        source, normalization=False, always_2d=True
    )
    feat = get_torchaudio_fbank_or_mfcc(
        waveform, sample_rate, n_bins=n_bins, feature_type=feature_type
    )

    num_frames = feat.shape[0]
    total = feat.sum(axis=0)
    unnormalized_var = np.var(feat, axis=0) * num_frames
    return total, unnormalized_var, num_frames
 def _get_features(self, i):
     """Load (or compute) the feature matrix for the i-th entry.

     For ``input_format == "feat"`` the matrix is read with ``kaldi_io``.
     Otherwise the audio is obtained either from a shell command
     (``input_format == "command"``) or from the entry itself, converted to
     fbank/MFCC features, and passed through ``self.feature_transforms``
     when that is set. SpecAugment is applied afterwards whenever a
     non-empty ``self.specaugment_config`` is present.
     """
     if self.input_format != "feat":
         entry = self.rxfiles[i]
         # A "command" entry has its last character dropped before being
         # run through the shell; its stdout is the audio.
         source = (
             BytesIO(run(entry[:-1], shell=True, stdout=PIPE).stdout)
             if self.input_format == "command"
             else entry
         )
         waveform, sample_rate = get_waveform(
             source, normalization=False, always_2d=True
         )
         feat = get_torchaudio_fbank_or_mfcc(
             waveform,
             sample_rate,
             n_bins=self.feat_dim,
             feature_type=self.feature_type,
         )
         if self.feature_transforms is not None:
             feat = self.feature_transforms(feat)
     else:
         feat = kaldi_io.read_mat(self.rxfiles[i])

     has_specaug = (
         self.specaugment_config is not None
         and self.specaugment_config != ""
     )
     if has_specaug:
         # Seeded by (seed, epoch, index).
         with data_utils.numpy_seed(self.seed, self.epoch, i):
             # NOTE(review): eval() executes the config string as Python;
             # it must never come from an untrusted source.
             feat = specaug(feat, **eval(self.specaugment_config))
     return feat
Beispiel #9
0
def load_dataset_raw_to_waveforms(
    file_name,
    dataset_size=None,
    need_waveform=True,
    sample_rate=16000,
    read_using_soundfile=False,
):
    """Load raw dataset from w2v tsv file. Optionally get waveforms

    The first line of the file is the root directory; each following line
    contributes its first tab-separated column, joined onto that root.

    Args:
        file_name: path to the tsv manifest.
        dataset_size: if truthy, keep only the first ``dataset_size`` entries.
        need_waveform: when False, return the list of audio paths only.
        sample_rate: output sample rate requested from ``get_waveform``
            (not used when reading with soundfile).
        read_using_soundfile: read audio with ``sf.read`` instead of
            ``get_waveform``.

    Returns:
        A list of paths when ``need_waveform`` is False; a list of float
        tensors viewed as (1, -1) when reading with soundfile; otherwise a
        list of dicts with "id" and "net_input" ("src_tokens",
        "src_lengths").

    Raises:
        Exception: if a path parses into any slice fields (unsupported).
    """
    with open(file_name, "r") as fp:
        rows = fp.readlines()

    # Only touch the header when there is one; an empty file gives no paths.
    root_dir = rows[0].strip() if rows else None
    audio_paths = [
        os.path.join(root_dir, row.strip().split("\t")[0]) for row in rows[1:]
    ]

    if dataset_size:
        audio_paths = audio_paths[:dataset_size]

    if not need_waveform:
        return audio_paths

    if read_using_soundfile:
        tensors = []
        for audio_path in audio_paths:
            samples = sf.read(audio_path)[0]
            if samples.ndim == 2:
                # Average over the last axis of a 2-D read.
                samples = samples.mean(-1)
            tensors.append(torch.from_numpy(samples).float().view(1, -1))
        return tensors

    batches = []
    for index, audio_path in enumerate(audio_paths):
        file_path, byte_range = parse_path(audio_path)
        if len(byte_range) != 0:
            raise Exception("Currently unsupported data format")
        feat = get_waveform(
            file_path, always_2d=True, output_sample_rate=sample_rate
        )[0]
        net_input = {
            "src_tokens": torch.tensor(feat),
            "src_lengths": torch.tensor([feat.shape[1]]),
        }
        batches.append({"id": index, "net_input": net_input})
    return batches
def get_features_or_waveform_from_stored_zip(
    path,
    byte_offset,
    byte_size,
    need_waveform=False,
    use_sample_rate=None,
):
    """Read one member of a stored ZIP file and decode it.

    Args:
        path: path to the ZIP archive; must end with ".zip".
        byte_offset: byte offset passed to ``read_from_stored_zip``.
        byte_size: byte length passed to ``read_from_stored_zip``.
        need_waveform (bool): for audio data, return the waveform instead of
            filterbank features.
        use_sample_rate: forwarded to ``get_waveform`` as
            ``output_sample_rate``; only used when ``need_waveform`` is True.

    Returns:
        The array loaded with ``np.load`` for .npy data; for audio data
        either the first element of ``get_waveform``'s result or the output
        of ``get_fbank``.

    Raises:
        ValueError: if the bytes are neither .npy data nor recognised audio.
    """
    assert path.endswith(".zip")
    raw = read_from_stored_zip(path, byte_offset, byte_size)
    buffer = io.BytesIO(raw)

    if is_npy_data(raw):
        return np.load(buffer)
    if not is_sf_audio_data(raw):
        raise ValueError(f'Unknown file format for "{path}"')

    if not need_waveform:
        return get_fbank(buffer)
    decoded = get_waveform(
        buffer, always_2d=False, output_sample_rate=use_sample_rate
    )
    return decoded[0]