def test_cut_pairs_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_source_frames=1000,
        )
        sampled_src_cuts = []
        sampled_tgt_cuts = []
        for src_batch, tgt_batch in sampler:
            # Invariant 0: the order of source and target cut IDs is preserved within each batch.
            assert list(src_batch.ids) == list(tgt_batch.ids)
            sampled_src_cuts.extend(src_batch)
            sampled_tgt_cuts.extend(tgt_batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_src_cuts) == len(cut_set)
        assert len(sampled_tgt_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_src_cuts)) == len(sampled_src_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_src_cuts] != [c.id for c in lazy_cuts]
def test_single_cut_sampler_lazy_shuffle(sampler_cls):
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)

        sampler = sampler_cls(
            lazy_cuts,
            shuffle=True,
            # Set an effective batch size of 10 cuts, as all have a duration of 1s.
            # This way we're testing that it works okay when returning multiple batches in
            # a full epoch.
            max_duration=10.0,
        )
        sampled_cuts = []
        for batch in sampler:
            sampled_cuts.extend(batch)

        # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet.
        assert len(sampled_cuts) == len(cut_set)
        # Invariant 2: the items are not duplicated.
        assert len(set(c.id for c in sampled_cuts)) == len(sampled_cuts)
        # Invariant 3: the items are shuffled.
        assert [c.id for c in sampled_cuts] != [c.id for c in lazy_cuts]
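# --- Hedged usage sketch (not part of the tests above) ---
# Shows how a lazily-read CutSet and a sampler like the one exercised above are
# typically plugged into a PyTorch DataLoader: the sampler yields whole CutSet
# batches, so the DataLoader is used with batch_size=None. The dataset class,
# its no-argument constructor, and the manifest path are assumptions and may
# differ between Lhotse versions.
from torch.utils.data import DataLoader

from lhotse import CutSet
from lhotse.dataset import K2SpeechRecognitionDataset
from lhotse.dataset.sampling import SingleCutSampler

cuts = CutSet.from_jsonl_lazy("data/cuts_train.jsonl.gz")  # hypothetical path
sampler = SingleCutSampler(cuts, shuffle=True, max_duration=10.0)
dataset = K2SpeechRecognitionDataset()  # assumed no-arg constructor
dloader = DataLoader(dataset, sampler=sampler, batch_size=None, num_workers=2)

for batch in dloader:
    pass  # each batch was assembled from cuts totalling at most max_duration seconds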
def test_bucketing_sampler_raises_value_error_on_lazy_cuts_input():
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=2)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)
        with pytest.raises(ValueError):
            sampler = BucketingSampler(
                lazy_cuts,
                max_duration=10.0,
            )
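# --- Hedged sketch (assumption, not part of the test suite) ---
# BucketingSampler rejects lazily-read cuts, presumably because bucketing needs to
# inspect all cut durations up front; reading the same manifest eagerly sidesteps
# the ValueError raised above. The path below is hypothetical.
from lhotse import CutSet
from lhotse.dataset.sampling import BucketingSampler

eager_cuts = CutSet.from_file("data/cuts.jsonl")  # read fully into memory
sampler = BucketingSampler(eager_cuts, max_duration=10.0)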
def train_cuts(self) -> CutSet:
    logging.info("About to get train cuts")
    path = (
        self.args.feature_dir
        / f"gigaspeech_cuts_{self.args.subset}{get_context_suffix(self.args)}.jsonl.gz"
    )
    if self.args.subset in ["L", "XL"]:
        # The "L" and "XL" partitions are large enough that we have to read their manifests
        # lazily; the CutSet holds a file handle and reads the items sequentially on-the-fly,
        # to avoid wasting memory and time pre-reading everything. Some operations on a lazy
        # CutSet won't work, e.g. shuffling (or they would have to read everything into memory
        # in the process). We expect that lazily read manifests are pre-shuffled, otherwise
        # you might experience issues with convergence.
        cuts_train = CutSet.from_jsonl_lazy(path)
    else:
        # For other subsets, just read everything into memory.
        cuts_train = CutSet.from_file(path)
    return cuts_train
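# --- Hedged sketch of the eager vs. lazy trade-off referred to above ---
# Only from_file / from_jsonl_lazy come from the code above; the path is illustrative.
from lhotse import CutSet

path = "exp/data/gigaspeech_cuts_XL.jsonl.gz"  # hypothetical manifest path

eager_cuts = CutSet.from_file(path)       # materializes every cut in memory up front
lazy_cuts = CutSet.from_jsonl_lazy(path)  # keeps a file handle and yields cuts on demand

# Both variants can be iterated; operations that need a full pass over the data
# (e.g. shuffling) are only practical on the eagerly loaded set.
for cut in lazy_cuts:
    pass  # process cuts one at a time without holding the whole manifest in memory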
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]

    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
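# --- Hedged invocation sketch (paths are hypothetical) ---
# Mirrors the signature of prepare_gigaspeech() above: it writes gzipped JSONL
# manifests into output_dir and returns them keyed by subset name, opened lazily.
manifests = prepare_gigaspeech(
    corpus_dir="/data/GigaSpeech",
    output_dir="data/manifests",
    dataset_parts=["DEV", "TEST"],
    num_jobs=4,
)
dev_cuts = manifests["DEV"]["cuts"]  # a lazily-read CutSet backed by the written file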