def perturb_speed(self, factor: float, sampling_rate: int, affix_id: bool = True) -> 'SupervisionSegment':
    """
    Return a ``SupervisionSegment`` that has time boundaries matching the
    recording/cut perturbed with the same factor.

    :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
    :param sampling_rate: The sampling rate is necessary to accurately perturb the start
        and duration (going through the sample counts).
    :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
        by affixing it with "_sp{factor}".
    :return: a modified copy of the current ``SupervisionSegment``.
    """
    # Go through sample counts so the perturbed boundaries stay consistent
    # with the perturbed recording.
    start_sample = compute_num_samples(self.start, sampling_rate)
    num_samples = compute_num_samples(self.duration, sampling_rate)
    new_start = perturb_num_samples(start_sample, factor) / sampling_rate
    new_duration = perturb_num_samples(num_samples, factor) / sampling_rate
    return fastcopy(
        self,
        id=f'{self.id}_sp{factor}' if affix_id else self.id,
        recording_id=f'{self.recording_id}_sp{factor}' if affix_id else self.recording_id,
        start=new_start,
        duration=new_duration,
        alignment={
            ali_type: [
                item.perturb_speed(factor=factor, sampling_rate=sampling_rate)
                for item in ali
            ]
            for ali_type, ali in self.alignment.items()
        } if self.alignment else None
    )
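# A minimal usage sketch with hypothetical IDs and timings, assuming the
# standard lhotse ``SupervisionSegment`` constructor:
from lhotse import SupervisionSegment

seg = SupervisionSegment(id="utt-001", recording_id="rec-001", start=1.0, duration=2.5)
sp = seg.perturb_speed(factor=1.1, sampling_rate=16000)
# sp.id == "utt-001_sp1.1"; sp.start and sp.duration shrink roughly by 1/1.1.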
def read_audio(
    path_or_fd: Union[Pathlike, FileObject],
    offset: Seconds = 0.0,
    duration: Optional[Seconds] = None,
    force_audioread: Optional[bool] = False,
) -> Tuple[np.ndarray, int]:
    try:
        if force_audioread:
            # Deliberately jump to the audioread fallback below.
            raise Exception("Forced audioread backend.")
        import soundfile as sf
        with sf.SoundFile(path_or_fd) as sf_desc:
            sampling_rate = sf_desc.samplerate
            if offset > 0:
                # Seek to the start of the target read.
                sf_desc.seek(compute_num_samples(offset, sampling_rate))
            if duration is not None:
                frame_duration = compute_num_samples(duration, sampling_rate)
            else:
                frame_duration = -1
            # Load the target number of frames, and transpose to match librosa form.
            return sf_desc.read(frames=frame_duration, dtype=np.float32, always_2d=False).T, sampling_rate
    except Exception:
        # Fall back to audioread when soundfile cannot decode the input
        # (or when force_audioread was requested).
        return _audioread_load(path_or_fd, offset=offset, duration=duration)
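# Hypothetical call, assuming "example.wav" exists on disk: read 2.0 s of
# audio starting at 0.5 s. Passing force_audioread=True skips soundfile and
# decodes with audioread directly.
samples, sampling_rate = read_audio("example.wav", offset=0.5, duration=2.0)
# samples is a float32 np.ndarray in librosa-style layout.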
def reverse_timestamps(
    self,
    offset: Seconds,
    duration: Optional[Seconds],
    sampling_rate: int
) -> Tuple[Seconds, Optional[Seconds]]:
    """
    This method helps estimate the original offset and duration for a recording
    before resampling was applied. We need this estimate to know how much audio
    to actually load from disk during the call to ``load_audio()``.

    In case of resampling, the timestamps might change slightly when using
    non-trivial pairs of sampling rates, e.g. 16kHz -> 22.05kHz, because the
    number of samples in the resampled audio might actually correspond to an
    incrementally larger/smaller duration. E.g. at 16kHz, 235636 samples correspond
    to a 14.72725s duration; after resampling to 22.05kHz, it is 324736 samples,
    which correspond to a 14.727256235827664s duration.
    """
    old_num_samples = compute_num_samples(offset, self.source_sampling_rate, rounding=ROUND_HALF_UP)
    old_offset = old_num_samples / self.source_sampling_rate
    if duration is not None:
        old_num_samples = compute_num_samples(duration, self.source_sampling_rate, rounding=ROUND_HALF_UP)
        old_duration = old_num_samples / self.source_sampling_rate
    else:
        old_duration = None
    return old_offset, old_duration
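# The docstring's example worked out: 235636 samples at 16 kHz last
# 235636 / 16000 = 14.72725 s; resampled to 22.05 kHz that becomes
# round(235636 * 22050 / 16000) = 324736 samples, i.e.
# 324736 / 22050 = 14.727256235827664 s -- a slightly different duration.
assert round(235636 * 22050 / 16000) == 324736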
def reverse_timestamps(
    self,
    offset: Seconds,
    duration: Optional[Seconds],
    sampling_rate: int,
) -> Tuple[Seconds, Optional[Seconds]]:
    """
    This method helps estimate the original offset and duration for a recording
    before tempo perturbation was applied. We need this estimate to know how much
    audio to actually load from disk during the call to ``load_audio()``.
    """
    start_sample = compute_num_samples(offset, sampling_rate)
    num_samples = (
        compute_num_samples(duration, sampling_rate)
        if duration is not None
        else None
    )
    start_sample = perturb_num_samples(start_sample, 1 / self.factor)
    num_samples = (
        perturb_num_samples(num_samples, 1 / self.factor)
        if num_samples is not None
        else None
    )
    return (
        start_sample / sampling_rate,
        num_samples / sampling_rate if num_samples is not None else None,
    )
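# Round-trip sketch using lhotse's helpers (values chosen for illustration):
# an 11.0 s span sped up 1.1x becomes ~10.0 s, and applying the reciprocal
# factor, as this method does, recovers the on-disk extent.
from lhotse.utils import compute_num_samples, perturb_num_samples

sr, factor = 16000, 1.1
fwd = perturb_num_samples(compute_num_samples(11.0, sr), factor) / sr      # 10.0
rev = perturb_num_samples(compute_num_samples(fwd, sr), 1 / factor) / sr  # 11.0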
def perturb_speed(self, factor: float, sampling_rate: int) -> 'AlignmentItem':
    """
    Return an ``AlignmentItem`` that has time boundaries matching the
    recording/cut perturbed with the same factor.
    See :meth:`SupervisionSegment.perturb_speed` for details.
    """
    start_sample = compute_num_samples(self.start, sampling_rate)
    num_samples = compute_num_samples(self.duration, sampling_rate)
    new_start = perturb_num_samples(start_sample, factor) / sampling_rate
    new_duration = perturb_num_samples(num_samples, factor) / sampling_rate
    return AlignmentItem(self.symbol, new_start, new_duration)
def read_audio(path: Pathlike, offset: Seconds, duration: Seconds) -> Tuple[np.ndarray, int]:
    import soundfile as sf
    with sf.SoundFile(path) as sf_desc:
        sampling_rate = sf_desc.samplerate
        if offset:
            # Seek to the start of the target read.
            sf_desc.seek(compute_num_samples(offset, sampling_rate))
        if duration is not None:
            frame_duration = compute_num_samples(duration, sampling_rate)
        else:
            frame_duration = -1
        # Load the target number of frames, and transpose to match librosa form.
        return sf_desc.read(frames=frame_duration, dtype=np.float32, always_2d=False).T, sampling_rate
def assert_and_maybe_fix_num_samples(
    audio: np.ndarray,
    offset: Seconds,
    duration: Optional[Seconds],
    recording: Recording
) -> np.ndarray:
    # When resampling between high sampling rates (48k -> 44.1k),
    # it is difficult to estimate how sox will perform rounding;
    # we will just add/remove one sample to be consistent with
    # what we have estimated.
    # This effect is exacerbated by chaining multiple augmentations together.
    expected_num_samples = compute_num_samples(
        duration=duration if duration is not None else recording.duration - offset,
        sampling_rate=recording.sampling_rate)
    diff = expected_num_samples - audio.shape[1]
    if diff == 0:
        return audio  # this is the normal, expected condition
    allowed_diff = int(ceil(AUGMENTATION_DURATION_TOLERANCE * recording.sampling_rate))
    if 0 < diff <= allowed_diff:
        # note the extra colon in -diff:, which preserves the shape
        audio = np.append(audio, audio[:, -diff:], axis=1)
        return audio
    elif -allowed_diff <= diff < 0:
        audio = audio[:, :diff]
        return audio
    else:
        raise ValueError(
            "The number of declared samples in the recording diverged from the one obtained "
            f"when loading audio (offset={offset}, duration={duration}). "
            "This could be an internal Lhotse error or a faulty transform implementation. "
            "Please report this issue in Lhotse and show the "
            f"following: diff={diff}, audio.shape={audio.shape}, recording={recording}"
        )
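# Illustration only: AUGMENTATION_DURATION_TOLERANCE is a lhotse constant
# whose exact value is assumed here to be 0.025 s. At 16 kHz the function
# would then silently duplicate or trim up to ceil(0.025 * 16000) = 400
# samples before raising a ValueError.
from math import ceil

assumed_tolerance, sr = 0.025, 16000
allowed_diff = int(ceil(assumed_tolerance * sr))  # 400 samples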
def resample(self, sampling_rate: int) -> 'Recording':
    """
    Return a new ``Recording`` that will be lazily resampled while loading audio.
    :param sampling_rate: The new sampling rate.
    :return: A resampled ``Recording``.
    """
    resampling = [
        Resample(
            source_sampling_rate=self.sampling_rate,
            target_sampling_rate=sampling_rate,
        ).to_dict()
    ]
    new_num_samples = compute_num_samples(self.duration, sampling_rate, rounding=ROUND_HALF_UP)
    # Duration might need an adjustment when doing a non-trivial resampling
    # (e.g. 16000 -> 22050), where the resulting number of samples cannot
    # correspond to the old duration exactly.
    new_duration = new_num_samples / sampling_rate
    return fastcopy(
        self,
        duration=new_duration,
        num_samples=new_num_samples,
        sampling_rate=sampling_rate,
        transforms=(self.transforms or []) + resampling,
    )
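# Usage sketch, assuming ``recording`` is an existing 16 kHz Recording:
# the transform is only recorded here; audio is converted on load_audio().
rec_22k = recording.resample(22050)
assert rec_22k.sampling_rate == 22050
samples = rec_22k.load_audio()  # resampling happens at this point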
def test_num_frames(feature_set, feature_level):
    sr = 8000
    duration = 12.059
    config = OpenSmileConfig(
        feature_set=feature_set,
        feature_level=feature_level,
        sampling_rate=sr,
        resample=True,
    )
    feature_extractor = OpenSmileExtractor(config=config)

    num_frames = compute_num_frames(duration, feature_extractor.frame_shift, sr)
    num_samples = compute_num_samples(duration, sr)
    signal = np.random.rand(1, num_samples)
    y = feature_extractor.extract(signal, sr)
    assert np.shape(y)[0] == num_frames
def memmap_raw_audio(wav_scp, f_memmapped, utt_list, dtype=np.float32, sampling_rate=16000, do_normalize=True):
    '''
    Maps the wav.scp file from Kaldi to a memory-mapped numpy object.
    This allows for fast i/o when creating windowed minibatches from
    slices of training data.

    input args: wav_scp, f_memmapped
    output:
        utt_lens = {'utt_n': # utt_n frames, ...}
        offsets = {'utt_n': utt_n offset in memory mapped numpy file}
        data_shape = (#frames, feature_dimension)
    '''
    import os
    import subprocess

    dataset = os.path.dirname(wav_scp)
    print(dataset)
    # Generate the reco2dur file if it does not exist yet.
    if not os.path.exists(os.path.join(dataset, 'reco2dur')):
        p = subprocess.Popen(['./utils/data/get_reco2dur.sh', dataset], stdout=subprocess.PIPE)
        out = p.communicate()
    # Import lhotse and install it if not available.
    try:
        from lhotse import kaldi, CutSet
    except ImportError:
        from pip._internal import main as pip
        pip(['install', 'lhotse'])
        from lhotse import kaldi, CutSet
    from lhotse.utils import compute_num_samples

    data = kaldi.load_kaldi_data_dir(dataset, sampling_rate)
    cuts = CutSet.from_manifests(data[0], data[1])
    dim = 1
    # First pass: collect per-utterance lengths to size the memmap.
    utt_lens = {}
    for cut in cuts:
        sr = cut.recording.sampling_rate
        for sup in cut.supervisions:
            if sup.id not in utt_list:
                continue
            utt_lens[sup.id.encode()] = compute_num_samples(sup.duration, sr)
    data_shape = (sum(utt_lens.values()), dim)
    f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape)
    # Second pass: write each utterance's samples at its offset.
    offsets = {}
    offset = 0
    for cut in cuts:
        x_ = cut.recording.load_audio().T
        # Mean and variance normalize
        if do_normalize:
            x = (x_ - x_.mean()) / x_.std()
        else:
            x = x_
        sr = cut.recording.sampling_rate
        for i, supervision in enumerate(cut.supervisions):
            k = supervision.id
            print('Utterance ', i, ' : ', k, ' : ', sr)
            start, dur = supervision.start, supervision.duration
            if k not in utt_list:
                continue
            start_sample = compute_num_samples(start, sr)
            end_sample = start_sample + utt_lens[k.encode()]
            m = x[start_sample:end_sample]
            offsets[k.encode()] = offset
            utt_lens[k.encode()] = m.shape[0]
            new_offset = offset + utt_lens[k.encode()]
            f[offset:new_offset, :] = m
            offset = new_offset
    print()
    del f
    return utt_lens, offsets, data_shape
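# Hypothetical invocation (paths and utterance IDs are made up for
# illustration): memory-map the audio of two utterances from a Kaldi dir.
utt_lens, offsets, data_shape = memmap_raw_audio(
    "data/train/wav.scp", "exp/train_audio.mmap", utt_list=["utt1", "utt2"]
)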
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    for recording_id, path_or_cmd in recordings.items():
        duration = get_duration(path_or_cmd)
        durations[recording_id] = duration

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=segment_id,
                recording_id=recording_id,
                start=float(start),
                duration=float(end) - float(start),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=speakers[segment_id],
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(
                    type='kaldiio',
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id if supervision_set is not None else utt_id,
                    channels=0,
                )
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
def _expected_num_samples(self, offset: Seconds, duration: Optional[Seconds]) -> int:
    if offset == 0 and duration is None:
        return self.num_samples
    duration = duration if duration is not None else self.duration - offset
    return compute_num_samples(duration, sampling_rate=self.sampling_rate)
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to manifests in a stream to
    minimize the CPU RAM usage. If you want to convert this data to a
    :class:`~lhotse.CutSet` without using excessive memory, we suggest to call it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings_all.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions_all.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(
        recs_path,
    ) as rec_writer, SupervisionSet.open_writer(
        sups_path,
    ) as sup_writer:
        for item in tqdm(
            # Note: People's Speech manifest.json is really a JSONL.
            load_jsonl(corpus_dir / "manifest.json"),
            desc="Converting People's Speech manifest.json to Lhotse manifests",
        ):
            for duration_ms, text, audio_path in zip(*item["training_data"].values()):
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we'll just skip the item
                    # and report the number of missing items later.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(duration, audio_info.samplerate),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )
                    validate_recordings_and_supervisions(recordings=r, supervisions=s)
                    rec_writer.write(r)
                    sup_writer.write(s)
                except Exception as e:
                    # If some files are missing (e.g. somebody is working on a subset
                    # of 30,000 hours), we won't interrupt processing; we will only
                    # do so for violated assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, we found {exist} "
            f"existing audio files, out of which {err} had errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(
                    f"scp:{feats_scp}"
                )
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must be not None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
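# Usage sketch mirroring the newer signature above, and the call already made
# in memmap_raw_audio earlier ("data/train" is a hypothetical Kaldi directory
# containing wav.scp and, optionally, segments/text/utt2spk/feats.scp):
from lhotse import kaldi

recordings, supervisions, features = kaldi.load_kaldi_data_dir(
    "data/train", sampling_rate=16000, frame_shift=0.01, num_jobs=4
)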