Code example #1
def test_cut_with_temporal_array_move_to_memory_large_offset():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    cut.start = 10.0
    cut.duration = 1.5

    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as w:
        arr = np.array(
            np.arange(
                compute_num_frames(cut.duration,
                                   frame_shift=0.01,
                                   sampling_rate=16000)))
        cut.custom_array = w.store_array(
            key="dummy-key",
            value=arr,
            frame_shift=0.01,
            temporal_dim=0,
            start=cut.start,
        )

        cut_mem = cut.move_to_memory()
        arr_mem = cut_mem.load_custom_array()

        assert arr.dtype == arr_mem.dtype
        np.testing.assert_equal(arr, arr_mem)

        arr_trunc = cut.truncate(duration=0.5).load_custom_array()
        arr_mem_trunc = cut_mem.truncate(duration=0.5).load_custom_array()

        assert arr_trunc.dtype == arr_mem_trunc.dtype
        np.testing.assert_equal(arr_trunc, arr_mem_trunc)
Code example #2
def test_cutset_from_webdataset_sharded_pipe():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"pipe:gzip -c > {dir_path}/shard-%06d.tar.gz"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=2)

        # disabling shard shuffling for testing purposes here
        cuts_ds = CutSet.from_webdataset(
            "pipe:gunzip -c " + dir_path + "/shard-{000000..000004}.tar.gz",
            shuffle_shards=False,
        )

        assert list(cuts.ids) == list(cuts_ds.ids)

        for c, cds in zip(cuts, cuts_ds):
            np.testing.assert_equal(c.load_audio(), cds.load_audio())
            np.testing.assert_almost_equal(
                c.load_features(), cds.load_features(), decimal=2
            )
Code example #3
def test_webdataset_sampler_epoch_increment():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json").repeat(10)

    with TemporaryDirectory() as dir_path:
        tar_pattern = f"{dir_path}/shard-%06d.tar"
        export_to_webdataset(cuts, output_path=tar_pattern, shard_size=1)

        cuts_ds = CutSet.from_webdataset(
            [str(p) for p in Path(dir_path).glob("*.tar")], shuffle_shards=True
        )
        sampler = DynamicCutSampler(cuts_ds, max_cuts=1)
        dloader = DataLoader(
            IterableDatasetWrapper(DummyDataset(), sampler, auto_increment_epoch=True),
            batch_size=None,
            num_workers=1,
            persistent_workers=True,
        )

        epoch_batches = {}
        for epoch in [0, 1]:
            batches = []
            for batch in dloader:
                for cut in batch:
                    batches.append(cut)
            epoch_batches[epoch] = CutSet.from_cuts(batches)

        # Both epochs have the same cut IDs.
        assert sorted(epoch_batches[0].ids) == sorted(epoch_batches[1].ids)
        # Both epochs have different cut order (shards were re-shuffled).
        assert list(epoch_batches[0].ids) != list(epoch_batches[1].ids)
Code example #4
def extract_cuts(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())
    cuts = cuts.compute_and_store_features(
        extractor=feature_extractor,
        storage_path=storage_path,
        num_jobs=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
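
For reference, a minimal sketch of the same workflow driven directly from the lhotse Python API; the paths below are placeholders, and LilcomHdf5Writer merely stands in for whatever get_writer(storage_type) would resolve to:

from pathlib import Path

from lhotse import CutSet, Fbank, LilcomHdf5Writer

# Placeholder paths; any cut manifest and output location will do.
cuts = CutSet.from_file("data/cuts.jsonl.gz")
cuts = cuts.compute_and_store_features(
    extractor=Fbank(),              # default extractor when no YAML config is given
    storage_path="data/feats",      # where the feature arrays are written
    num_jobs=4,
    storage_type=LilcomHdf5Writer,  # stands in for get_writer(storage_type)
)
Path("data/cuts_with_feats.jsonl.gz").parent.mkdir(parents=True, exist_ok=True)
cuts.to_file("data/cuts_with_feats.jsonl.gz")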
Code example #5
def test_cut_move_to_memory_load_custom_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    cut.custom_array = Array("irrelevant", "irrelevant", "irrelevant", [10])

    cut_mem = cut.move_to_memory(load_custom=False)

    assert cut.custom_array == cut_mem.custom_array  # nothing was copied
Code example #6
def test_cut_move_to_memory_load_features_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    assert cut.has_features

    cut_mem = cut.move_to_memory(load_features=False)

    assert cut.features == cut_mem.features  # nothing was copied
Code example #7
def test_cut_move_to_memory_load_audio_false():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    assert cut.has_recording

    cut_mem = cut.move_to_memory(load_audio=False)

    assert cut.recording == cut_mem.recording  # nothing was copied
Code example #8
def test_cut_with_features_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]

    arr = cut.load_features()
    assert arr is not None

    cut_mem = cut.move_to_memory()
    arr_mem = cut_mem.load_features()

    np.testing.assert_almost_equal(arr, arr_mem, decimal=2)
Code example #9
def test_cut_with_array_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as w:
        arr = np.array([0, 1, 2, 3])
        cut.custom_array = w.store_array(key="dummy-key", value=arr)

        cut_mem = cut.move_to_memory()
        arr_mem = cut_mem.load_custom_array()

        assert arr.dtype == arr_mem.dtype
        np.testing.assert_equal(arr, arr_mem)
Code example #10
def test_features_move_to_memory():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    feats = cut.features
    assert feats is not None

    arr = feats.load()

    feats_mem = feats.move_to_memory()

    arr_mem = feats_mem.load()

    np.testing.assert_equal(arr, arr_mem)
Code example #11
def prepare_data(total_cuts: int, root: Pathlike) -> Tuple[int, List[str]]:
    """
    Loads a cutset with 1 cut, repeats it a few times, and stores shards
    in tmp dir with 1 cut per shard for easy testing arithmetic.
    """
    cuts = CutSet.from_file("test/fixtures/libri/cuts_no_feats.json").repeat(
        total_cuts)
    Path(root).mkdir(exist_ok=True)
    n_shards = export_to_webdataset(cuts,
                                    f"{root}/shard-%06d.tar",
                                    shard_size=1,
                                    audio_format="wav",
                                    verbose=False)
    return n_shards, sorted(cuts.ids)
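
A minimal sketch of how a test might consume this helper with pytest's tmp_path fixture; the test name and assertions below are illustrative, not taken from the source:

from pathlib import Path

from lhotse import CutSet


def test_shards_roundtrip(tmp_path):
    # One cut per shard, so the shard count should equal the requested cut count.
    n_shards, expected_ids = prepare_data(total_cuts=10, root=tmp_path)
    assert n_shards == 10

    shard_paths = sorted(str(p) for p in Path(tmp_path).glob("shard-*.tar"))
    cuts_ds = CutSet.from_webdataset(shard_paths, shuffle_shards=False)
    assert sorted(cuts_ds.ids) == expected_ids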
Code example #12
def train_cuts(self) -> CutSet:
    logging.info("About to get train cuts")
    path = (
        self.args.feature_dir /
        f"gigaspeech_cuts_{self.args.subset}{get_context_suffix(self.args)}.jsonl.gz"
    )
    if self.args.subset in ["L", "XL"]:
        # The "L" and "XL" partitions are large enough that we have to read their manifests lazily;
        # the "CutSet" holds a file handle and reads the items sequentially on the fly, avoiding
        # wasting memory and time on pre-reading everything. Some operations on such a "CutSet"
        # won't work, e.g. shuffling (or they would have to read everything into memory first).
        # We expect lazily read manifests to be pre-shuffled, otherwise you may run into
        # convergence issues.
        cuts_train = CutSet.from_jsonl_lazy(path)
    else:
        # For other subsets, just read everything into memory.
        cuts_train = CutSet.from_file(path)
    return cuts_train
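
A minimal sketch of the eager vs. lazy distinction outside the class, using a placeholder manifest path:

from lhotse import CutSet

path = "data/gigaspeech_cuts_XL.jsonl.gz"  # placeholder path

# Eager: parses the whole manifest up front and keeps every cut in memory,
# so operations like shuffle() are cheap afterwards.
cuts_eager = CutSet.from_file(path).shuffle()

# Lazy: keeps a file handle and yields cuts one by one while iterating;
# anything requiring full materialization (e.g. shuffling) should be done
# beforehand, e.g. by pre-shuffling the manifest on disk.
cuts_lazy = CutSet.from_jsonl_lazy(path)
for cut in cuts_lazy:
    pass  # cuts are streamed sequentially from the file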
Code example #13
def test_export_to_webdataset():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with NamedTemporaryFile(suffix=".tar") as f:
        export_to_webdataset(cuts, output_path=f.name)
        f.flush()

        ds = webdataset.WebDataset(f.name)

        dicts = (pickle.loads(data["data"]) for data in ds)

        cuts_ds = CutSet.from_dicts(dicts)

    assert list(cuts.ids) == list(cuts_ds.ids)
Code example #14
def extract_cuts_batch(
    cutset: Pathlike,
    output_cutset: Pathlike,
    storage_path: Pathlike,
    feature_manifest: Optional[Pathlike],
    storage_type: str,
    num_jobs: int,
    batch_duration: Seconds,
):
    """
    Extract features for cuts in a given CUTSET manifest.
    The features are stored in STORAGE_PATH, and the output manifest
    with features is stored in OUTPUT_CUTSET.

    This version enables CUDA acceleration for feature extractors
    that support it (e.g., kaldifeat extractors).

    \b
    Example usage of kaldifeat fbank with CUDA:

        $ pip install kaldifeat  # note: ensure it's compiled with CUDA

        $ lhotse feat write-default-config -f kaldifeat-fbank feat.yml

        $ sed 's/device: cpu/device: cuda/' feat.yml > feat-cuda.yml

        $ lhotse feat extract-cuts-batch -f feat-cuda.yml cuts.jsonl cuts_with_feats.jsonl feats.h5
    """
    from lhotse import CutSet

    cuts: CutSet = CutSet.from_file(cutset)
    feature_extractor = (FeatureExtractor.from_yaml(feature_manifest)
                         if feature_manifest is not None else Fbank())
    cuts = cuts.compute_and_store_features_batch(
        extractor=feature_extractor,
        storage_path=storage_path,
        batch_duration=batch_duration,
        num_workers=num_jobs,
        storage_type=get_writer(storage_type),
    )
    Path(output_cutset).parent.mkdir(parents=True, exist_ok=True)
    cuts.to_file(output_cutset)
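
For reference, a minimal Python-level sketch of the same batch workflow, loading the extractor from a YAML config as in the CLI example above; the paths and batch duration are placeholders:

from pathlib import Path

from lhotse import CutSet, LilcomHdf5Writer
from lhotse.features import FeatureExtractor

extractor = FeatureExtractor.from_yaml("feat-cuda.yml")  # e.g. a kaldifeat fbank with device: cuda
cuts = CutSet.from_file("cuts.jsonl")
cuts = cuts.compute_and_store_features_batch(
    extractor=extractor,
    storage_path="feats.h5",
    batch_duration=600.0,  # seconds of audio processed per batch on the GPU
    num_workers=4,
    storage_type=LilcomHdf5Writer,
)
Path("cuts_with_feats.jsonl").parent.mkdir(parents=True, exist_ok=True)
cuts.to_file("cuts_with_feats.jsonl")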
Code example #15
def test_cutset_from_webdataset():
    cuts = CutSet.from_file("test/fixtures/libri/cuts.json")
    cut = cuts[0]
    cuts = []
    for i in range(10):
        cuts.append(fastcopy(cut, id=cut.id + "-" + str(i)))
    cuts = CutSet.from_cuts(cuts)

    with NamedTemporaryFile(suffix=".tar") as f:
        export_to_webdataset(cuts, output_path=f.name)
        f.flush()

        cuts_ds = CutSet.from_webdataset(f.name)

        assert list(cuts.ids) == list(cuts_ds.ids)

        for c, cds in zip(cuts, cuts_ds):
            np.testing.assert_equal(c.load_audio(), cds.load_audio())
            np.testing.assert_almost_equal(
                c.load_features(), cds.load_features(), decimal=2
            )
Code example #16
def cuts():
    return CutSet.from_file("test/fixtures/libri/cuts.json")
Code example #17
File: ctc_train.py  Project: underdogliu/snowfall
def main():
    fix_random_seed(42)

    start_epoch = 0
    num_epochs = 8

    exp_dir = "exp-lstm-adam-ctc-musan"
    setup_logger("{}/log/log-train".format(exp_dir))
    tb_writer = SummaryWriter(log_dir=f"{exp_dir}/tensorboard")

    # load L, G, symbol_table
    lang_dir = Path("data/lang_nosp")
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / "phones.txt")
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / "words.txt")

    logging.info("Loading L.fst")
    if (lang_dir / "Linv.pt").exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / "Linv.pt"))
    else:
        with open(lang_dir / "L.fst.txt") as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / "Linv.pt")

    graph_compiler = CtcTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)

    # load dataset
    feature_dir = Path("exp/data")
    logging.info("About to get train cuts")
    cuts_train = CutSet.from_file(feature_dir / "gigaspeech_cuts_S.jsonl.gz")
    logging.info("About to get dev cuts")
    cuts_dev = CutSet.from_file(
        feature_dir / "gigaspeech_cuts_DEV.jsonl.gz").subset(first=1000)
    logging.info("About to get Musan cuts")
    cuts_musan = CutSet.from_json(feature_dir / "cuts_musan.json.gz")

    logging.info("About to create train dataset")
    train = K2SpeechRecognitionDataset(
        cuts_train,
        cut_transforms=[
            CutConcatenate(),
            CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20)),
        ],
    )
    train_sampler = SingleCutSampler(
        cuts_train,
        max_frames=90000,
        shuffle=True,
    )
    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(train,
                                           sampler=train_sampler,
                                           batch_size=None,
                                           num_workers=4)
    logging.info("About to create dev dataset")
    validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(cuts_dev, max_frames=90000)
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error("No GPU detected!")
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device("cuda", device_id)
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
    )

    model.to(device)
    describe(model)

    learning_rate = 1e-3
    optimizer = optim.AdamW(model.parameters(),
                            lr=learning_rate,
                            weight_decay=5e-4)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, "best_model.pt")
    best_epoch_info_filename = os.path.join(exp_dir, "best-epoch-info")
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  "epoch-{}.pt".format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer)
        best_objf = ckpt["objf"]
        best_valid_objf = ckpt["valid_objf"]
        global_batch_idx_train = ckpt["global_batch_idx_train"]
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = 1e-3
        # curr_learning_rate = learning_rate * pow(0.4, epoch)
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = curr_learning_rate

        tb_writer.add_scalar("learning_rate", curr_learning_rate, epoch)

        logging.info("epoch {}, learning rate {}".format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )
        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(
                filename=best_model_path,
                model=model,
                epoch=epoch,
                optimizer=None,
                scheduler=None,
                learning_rate=curr_learning_rate,
                objf=objf,
                valid_objf=valid_objf,
                global_batch_idx_train=global_batch_idx_train,
            )
            save_training_info(
                filename=best_epoch_info_filename,
                model_path=best_model_path,
                current_epoch=epoch,
                learning_rate=curr_learning_rate,
                objf=best_objf,
                best_objf=best_objf,
                valid_objf=valid_objf,
                best_valid_objf=best_valid_objf,
                best_epoch=best_epoch,
            )

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, "epoch-{}.pt".format(epoch))
        save_checkpoint(
            filename=model_path,
            model=model,
            optimizer=optimizer,
            scheduler=None,
            epoch=epoch,
            learning_rate=curr_learning_rate,
            objf=objf,
            valid_objf=valid_objf,
            global_batch_idx_train=global_batch_idx_train,
        )
        epoch_info_filename = os.path.join(exp_dir,
                                           "epoch-{}-info".format(epoch))
        save_training_info(
            filename=epoch_info_filename,
            model_path=model_path,
            current_epoch=epoch,
            learning_rate=curr_learning_rate,
            objf=objf,
            best_objf=best_objf,
            valid_objf=valid_objf,
            best_valid_objf=best_valid_objf,
            best_epoch=best_epoch,
        )

    logging.warning("Done")
Code example #18
def main():
    exp_dir = Path("exp-lstm-adam-ctc-musan")
    setup_logger("{}/log/log-decode".format(exp_dir), log_level="debug")

    # load L, G, symbol_table
    lang_dir = Path("data/lang_nosp")
    symbol_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / "phones.txt")
    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    if not os.path.exists(lang_dir / "HLG.pt"):
        print("Loading L_disambig.fst.txt")
        with open(lang_dir / "L_disambig.fst.txt") as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        print("Loading G.fst.txt")
        with open(lang_dir / "G.fst.txt") as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(
            phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(
            L=L,
            G=G,
            H=ctc_topo,
            labels_disambig_id_start=first_phone_disambig_id,
            aux_labels_disambig_id_start=first_word_disambig_id,
        )
        torch.save(HLG.as_dict(), lang_dir / "HLG.pt")
    else:
        print("Loading pre-compiled HLG")
        d = torch.load(lang_dir / "HLG.pt")
        HLG = k2.Fsa.from_dict(d)

    # load dataset
    feature_dir = Path("exp/data")
    print("About to get test cuts")
    cuts_test = CutSet.from_file(feature_dir / "gigaspeech_cuts_TEST.jsonl.gz")

    print("About to create test dataset")
    test = K2SpeechRecognitionDataset(cuts_test)
    sampler = SingleCutSampler(cuts_test, max_frames=100000)
    print("About to create test dataloader")
    test_dl = torch.utils.data.DataLoader(test,
                                          batch_size=None,
                                          sampler=sampler,
                                          num_workers=1)

    #  if not torch.cuda.is_available():
    #  logging.error('No GPU detected!')
    #  sys.exit(-1)

    print("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device("cuda")
    model = TdnnLstm1b(
        num_features=80,
        num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
        subsampling_factor=4,
    )

    checkpoint = os.path.join(exp_dir, "epoch-7.pt")
    load_checkpoint(checkpoint, model)
    model.to(device)
    model.eval()

    print("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)
    print("About to decode")
    results = decode(dataloader=test_dl,
                     model=model,
                     device=device,
                     HLG=HLG,
                     symbols=symbol_table)
    s = ""
    for ref, hyp in results:
        s += f"ref={ref}\n"
        s += f"hyp={hyp}\n"
    logging.info(s)
    # compute WER
    dists = [edit_distance(r, h) for r, h in results]
    errors = {
        key: sum(dist[key] for dist in dists)
        for key in ["sub", "ins", "del", "total"]
    }
    total_words = sum(len(ref) for ref, _ in results)
    # Print Kaldi-like message:
    # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
    logging.info(
        f'%WER {errors["total"] / total_words:.2%} '
        f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
    )
Code example #19
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping feature extraction."
                )
            else:
                # Note this step makes the recipe different than LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping cutting into sub-segments."
                )
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note this step makes the recipe different than LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(
                    f"About to split {partition} raw cuts into smaller chunks."
                )
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=None
                    if args.context_window <= 0.0 else args.context_window,
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing the manifests, we want to pre-shuffle them, as the sampler
                    # won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    #       `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    #       data augmentation and feature computation for long recordings yet.
                    #       Therefore, we sacrifice some storage for the ability to precompute
                    #       features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=
                        f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether it needs to
                # load the raw cuts from disk or not.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(
                    recordings=combine(
                        part["recordings"] for part in musan_manifests.values()
                    )
                )
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f"{output_dir}/feats_musan",
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer,
                )
            )
            musan_cuts.to_file(musan_cuts_path)