Example #1
    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        test_loaders = []
        if not is_list:
            cuts = [cuts]

        for cuts_test in cuts:
            logging.debug("About to create test dataset")
            test = K2SpeechRecognitionDataset(
                input_strategy=(
                    OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                    if self.args.on_the_fly_feats else PrecomputedFeatures()),
                return_cuts=True,
            )
            sampler = SingleCutSampler(cuts_test,
                                       max_duration=self.args.max_duration)
            logging.debug("About to create test dataloader")
            test_dl = DataLoader(test,
                                 batch_size=None,
                                 sampler=sampler,
                                 num_workers=1)
            test_loaders.append(test_dl)

        if is_list:
            return test_loaders
        else:
            return test_loaders[0]
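
A minimal usage sketch for the method above. This is hypothetical: `dm` stands in for an instance of the datamodule class, and the batch layout follows K2SpeechRecognitionDataset's output dict.

# Hypothetical usage; `dm` is an instance of the datamodule class above.
loaders = dm.test_dataloaders()
if not isinstance(loaders, list):
    loaders = [loaders]
for dl in loaders:
    batch = next(iter(dl))
    feats = batch["inputs"]        # float tensor: (num_cuts, num_frames, 80)
    sups = batch["supervisions"]   # dict with "text", "start_frame", "num_frames", "cut"
    print(feats.shape, len(sups["text"]))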
Example #2
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        ))
    sampler = SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        # The precomputed and on-the-fly features are different due to mixing in time/fbank domains
        # and lilcom compression.
        assert norm_diff < 0.01 * norm_pc

        # Check that the supervision boundaries are the same.
        assert (batch_pc["supervisions"]["start_frame"] ==
                batch_otf["supervisions"]["start_frame"]).all()
        assert (batch_pc["supervisions"]["num_frames"] ==
                batch_otf["supervisions"]["num_frames"]).all()
Example #3
    def valid_dataloaders(self) -> DataLoader:
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            # Drop the precomputed features once; they are recomputed on the fly.
            cuts_valid = cuts_valid.drop_features()
            validate = K2SpeechRecognitionDataset(
                cuts_valid,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))))
        else:
            validate = K2SpeechRecognitionDataset(cuts_valid)
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=True,
        )
        return valid_dl
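
These datamodule methods read several `self.args` attributes that are defined elsewhere. A minimal sketch of the corresponding argparse flags, with names inferred from the attribute accesses and purely illustrative defaults:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--on-the-fly-feats', action='store_true')
parser.add_argument('--max-duration', type=float, default=200.0,
                    help='Max total speech duration (seconds) per batch.')
args = parser.parse_args([])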
Example #4
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))

    output_dir = Path('exp/data')
    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(
                    recordings=combine(part['recordings']
                                       for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
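
`locate_corpus` is not defined in any of these snippets; a plausible sketch, assuming it simply returns the first candidate directory that exists:

from pathlib import Path

def locate_corpus(*corpus_dirs: Path) -> Path:
    # Return the first existing directory among the candidates.
    for d in corpus_dirs:
        if d.is_dir():
            return d
    raise FileNotFoundError(f'Corpus not found in any of: {corpus_dirs}')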
Example #5
def test_extract_and_store_features(cut):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = cut.compute_and_store_features(extractor=extractor,
                                                        storage=storage)
        arr = cut_with_feats.load_features()
    assert arr.shape[0] == 100
    assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
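
The `cut` fixture is defined elsewhere in the test suite; a stand-in sketch that satisfies the assertions above (1 s of 8 kHz audio yields 100 frames at the default 10 ms frame shift):

import numpy as np
import pytest
import soundfile as sf
from lhotse import MonoCut, Recording

@pytest.fixture
def cut(tmp_path):
    wav = tmp_path / 'tone.wav'
    samples = 0.1 * np.sin(2 * np.pi * 440 * np.arange(8000) / 8000)
    sf.write(wav, samples.astype(np.float32), samplerate=8000)
    return MonoCut(id='cut', start=0.0, duration=1.0, channel=0,
                   recording=Recording.from_file(wav))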
Example #6
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    mixed_cut = cut.append(cut)
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = mixed_cut.compute_and_store_features(
            extractor=extractor, storage=storage, mix_eagerly=mix_eagerly)
        arr = cut_with_feats.load_features()
    assert arr.shape[0] == 200
    assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
Example #7
    def _with_features(
        self, cut: MonoCut, frame_shift: Seconds, sampling_rate: int
    ) -> MonoCut:
        d = TemporaryDirectory()
        self.dirs.append(d)
        extractor = Fbank(
            config=FbankConfig(sampling_rate=sampling_rate, frame_shift=frame_shift)
        )
        with LilcomHdf5Writer(d.name) as storage:
            return cut.compute_and_store_features(extractor, storage=storage)
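
Each `TemporaryDirectory` is appended to `self.dirs` so the on-disk feature files outlive this call. A matching teardown might look like this (a sketch; the rest of the class is not shown):

    def cleanup(self):
        # Delete the temporary feature directories created by _with_features().
        for d in self.dirs:
            d.cleanup()
        self.dirs.clear()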
Example #8
def test_feature_set_builder(storage_fn):
    recordings: RecordingSet = RecordingSet.from_json(
        "test/fixtures/audio.json")
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with storage_fn() as storage:
        builder = FeatureSetBuilder(
            feature_extractor=extractor,
            storage=storage,
        )
        feature_set = builder.process_and_store_recordings(
            recordings=recordings)

    assert len(feature_set) == 6

    feature_infos = list(feature_set)

    # Assert the properties shared by all features
    for features in feature_infos:
        # assert that fbank is the default feature type
        assert features.type == "kaldi-fbank"
        # assert that duration is always a multiple of frame_shift
        assert features.num_frames == round(features.duration /
                                            features.frame_shift)
        # assert that num_features is preserved
        assert features.num_features == builder.feature_extractor.config.num_filters
        # assert that the storage type metadata matches
        assert features.storage_type == storage.name
        # assert that the metadata is consistent with the data shapes
        arr = features.load()
        assert arr.shape[0] == features.num_frames
        assert arr.shape[1] == features.num_features
        # assert that the stored features are the same as the "freshly extracted" features
        recording = recordings[features.recording_id]
        expected = extractor.extract(
            samples=recording.load_audio(channels=features.channels),
            sampling_rate=recording.sampling_rate,
        )
        np.testing.assert_almost_equal(arr, expected, decimal=2)

    # Assert the properties for recordings of duration 0.5 seconds
    for features in feature_infos[:2]:
        assert features.num_frames == 50
        assert features.duration == 0.5

    # Assert the properties for recordings of duration 1.0 seconds
    for features in feature_infos[2:]:
        assert features.num_frames == 100
        assert features.duration == 1.0
Example #9
    def valid_dataloaders(self) -> DataLoader:
        self.validate_args()
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(duration_factor=self.args.duration_factor,
                               gap=self.args.gap)
            ] + transforms

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(
                    FbankConfig(num_mel_bins=80)),
                                                num_workers=8),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        # valid_dl = DataLoader(
        #    validate,
        #    sampler=valid_sampler,
        #    batch_size=None,
        #    num_workers=8,
        #    persistent_workers=True,
        # )
        valid_dl = LhotseDataLoader(
            validate,
            sampler=valid_sampler,
            num_workers=2,
        )
        return valid_dl
Example #10
    def __init__(
        self,
        lang_dir: Pathlike,
        scripted_model_path: Optional[Pathlike] = None,
        model_dir: Optional[Pathlike] = None,
        average_epochs: Sequence[int] = (7, 8, 9),
        device: Union[str, torch.device] = 'cpu',
        sampling_rate: int = 16000,
    ):
        if isinstance(device, str):
            self.device = torch.device(device)
        else:
            self.device = device

        self.sampling_rate = sampling_rate
        self.extractor = Fbank(FbankConfig(num_mel_bins=80))
        self.lexicon = Lexicon(lang_dir)
        phone_ids = self.lexicon.phone_symbols()
        self.P = create_bigram_phone_lm(phone_ids)

        if model_dir is not None:
            # Read model from regular checkpoints, assume it's a Conformer
            self.model = Conformer(num_features=80,
                                   num_classes=len(phone_ids) + 1,
                                   num_decoder_layers=0)
            self.P.scores = torch.zeros_like(self.P.scores)
            self.model.P_scores = torch.nn.Parameter(self.P.scores.clone(),
                                                     requires_grad=False)
            average_checkpoint(
                filenames=[model_dir / f'epoch-{n}.pt' for n in average_epochs],
                model=self.model)
        elif scripted_model_path is not None:
            # Read model from a serialized TorchScript module, no assumptions needed
            self.model = torch.jit.load(scripted_model_path)
        else:
            raise ValueError(
                "One of scripted_model_path or model_dir needs to be provided."
            )

        # Freeze the params by default.
        for p in self.model.parameters():
            p.requires_grad_(False)
        self.compiler = MmiTrainingGraphCompiler(lexicon=self.lexicon,
                                                 device=self.device)
        self.HLG = k2.Fsa.from_dict(torch.load(lang_dir / 'HLG.pt')).to(
            self.device)
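
A hypothetical instantiation of this class; `MmiAsrInference` is a placeholder name (the snippet does not show the class name), and the paths are illustrative:

from pathlib import Path

# Hypothetical usage; `MmiAsrInference` is a placeholder for the class above.
asr = MmiAsrInference(
    lang_dir=Path('data/lang_nosp'),
    model_dir=Path('exp-conformer-mmi'),
    average_epochs=(7, 8, 9),
    device='cuda',
)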
Example #11
def main():
    args = get_parser().parse_args()
    dataset_parts = ('devtest', 'test', 'train')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/mnt/corpora/LDC2006S37/data'),
    )

    output_dir = Path('exp/data')
    print('Heroico manifest preparation:')
    transcripts_dir = corpus_dir / 'transcripts'
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in heroico_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            heroico_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
Example #12
def main():
    args = get_parser().parse_args()

    corpus_dir = locate_corpus(
        Path("/export/corpora5/AMI/amicorpus"),
    )
    annotations_dir = Path("/export/c07/draj")

    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")

    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in ami_manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests["recordings"],
                supervisions=manifests["supervisions"],
            ).cut_into_windows(duration=5)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else min(80, len(cut_set)),
                executor=ex,
                storage_type=LilcomHdf5Writer,
            ).pad(duration=5.0)
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
Example #13
def test_on_the_fly_feature_extraction_unsupervised_dataset(libri_cut_set):
    ref_dataset = UnsupervisedDataset()
    tested_dataset = DynamicUnsupervisedDataset(
        feature_extractor=Fbank(FbankConfig(num_mel_bins=40)))
    out = ref_dataset[libri_cut_set]
    ref_feats = out["features"]
    tested_feats = tested_dataset[libri_cut_set]
    # Note: comparison to 1 decimal fails.
    #       I'm assuming this is due to lilcom's compression.
    #       Pytest outputs looks like the following:
    # E       Mismatched elements: 4 / 23000 (0.0174%)
    # E       Max absolute difference: 0.46469784
    # E       Max relative difference: 0.6171043
    # E        x: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    # E        y: array([[-11.5, -11.4,  -9.9, ...,  -5.5,  -6.5,  -7.4],
    # E              [-13.2, -11.2,  -9.6, ...,  -5.6,  -6.5,  -7.6],
    # E              [-12. , -10.1, -10.1, ...,  -5.8,  -7. ,  -7.8],...
    np.testing.assert_array_almost_equal(ref_feats, tested_feats, decimal=0)
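
For reference, numpy's tolerance criterion for `assert_array_almost_equal` is abs(actual - desired) < 1.5 * 10**(-decimal) elementwise, so decimal=0 tolerates differences up to 1.5, while decimal=1 only up to 0.15 (which the 0.46 max absolute difference quoted above would fail). A quick check:

import numpy as np

np.testing.assert_array_almost_equal([1.0], [1.4], decimal=0)  # passes: 0.4 < 1.5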
Example #14
def test_extract_and_store_features_from_cut_set(cut_set, executor, num_jobs,
                                                 storage_type, mix_eagerly):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir:
        cut_set_with_feats = cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=tmpdir,
            num_jobs=num_jobs,
            mix_eagerly=mix_eagerly,
            executor=executor() if executor else None,
            storage_type=storage_type,
        ).sort_by_duration()  # sort by duration to ensure the same order of cuts

        # The same number of cuts
        assert len(cut_set_with_feats) == 2

        for orig_cut, feat_cut in zip(cut_set, cut_set_with_feats):
            # The ID is retained
            assert orig_cut.id == feat_cut.id
            # Features were attached
            assert feat_cut.has_features
            # Recording is retained unless mixing a MixedCut eagerly
            should_have_recording = not (mix_eagerly
                                         and isinstance(orig_cut, MixedCut))
            assert feat_cut.has_recording == should_have_recording

        cuts = list(cut_set_with_feats)

        arr = cuts[0].load_features()
        assert arr.shape[0] == 300
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)

        arr = cuts[1].load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)
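
The `cut_set` fixture comes from the test suite; a stand-in consistent with the assertions above (one 3 s cut yielding 300 frames and one 1 s cut yielding 100 frames, reusing the `cut` fixture sketched after Example #5):

import pytest
from lhotse import CutSet

@pytest.fixture
def cut_set(cut):
    # A 3 s MixedCut (300 frames) plus the original 1 s MonoCut (100 frames).
    return CutSet.from_cuts([cut.append(cut).append(cut), cut])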
Example #15
def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_filters=40,
                           sampling_rate=8000)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(MfccConfig(sampling_rate=8000)), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)

    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        # The snippet was truncated here; the call is completed below with
        # assumed arguments following lhotse's FeatureMixer API.
        mixer = FeatureMixer(
            feature_extractor=feature_extractor,
            base_feats=f1,
            frame_shift=feature_extractor.frame_shift,
        )
        mixer.add_to_mix(f2, snr=10)
Example #16
    def train_dataloaders(self) -> DataLoader:
        logging.info("About to get train cuts")
        cuts_train = self.train_cuts()

        logging.info("About to get Musan cuts")
        cuts_musan = load_manifest(self.args.feature_dir /
                                   'cuts_musan.json.gz')

        logging.info("About to create train dataset")
        transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
        if self.args.concatenate_cuts:
            logging.info(
                f'Using cut concatenation with duration factor '
                f'{self.args.duration_factor} and gap {self.args.gap}.')
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
            transforms = [
                CutConcatenate(duration_factor=self.args.duration_factor,
                               gap=self.args.gap)
            ] + transforms

        input_transforms = [
            SpecAugment(num_frame_masks=2,
                        features_mask_size=27,
                        num_feature_masks=2,
                        frames_mask_size=100)
        ]

        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_transforms=input_transforms,
            return_cuts=True,
        )

        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
            # # Add on-the-fly speed perturbation; since originally it would have increased epoch
            # # size by 3, we will apply prob 2/3 and use 3x more epochs.
            # # Speed perturbation probably should come first before concatenation,
            # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
            # Drop feats to be on the safe side.
            cuts_train = cuts_train.drop_features()
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=input_transforms,
                return_cuts=True,
            )

        if self.args.bucketing_sampler:
            logging.info('Using BucketingSampler.')
            train_sampler = BucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets)
        else:
            logging.info('Using SingleCutSampler.')
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=4,
            persistent_workers=True,
        )
        return train_dl
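
As with the test loaders, a short hypothetical sketch of consuming the training dataloader built above (`dm` stands in for an instance of the datamodule class; the batch layout is K2SpeechRecognitionDataset's output dict):

# Hypothetical usage; `dm` is an instance of the datamodule class above.
train_dl = dm.train_dataloaders()
batch = next(iter(train_dl))
feats = batch["inputs"]                # SpecAugment has already been applied
texts = batch["supervisions"]["text"]  # transcripts for each cut in the batch
print(feats.shape, len(texts))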
Example #17
def main():
    args = get_parser().parse_args()

    model_type = args.model_type
    epoch = args.epoch
    max_duration = args.max_duration
    avg = args.avg
    att_rate = args.att_rate

    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-decode'.format(exp_dir), log_level='debug')

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')

    phone_ids = get_phone_symbols(phone_symbol_table)
    phone_ids_with_blank = [0] + phone_ids
    ctc_topo = k2.arc_sort(build_ctc_topo(phone_ids_with_blank))

    logging.debug("About to load model")
    # Note: Use "export CUDA_VISIBLE_DEVICES=N" to setup device id to N
    # device = torch.device('cuda', 1)
    device = torch.device('cuda')

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    if avg == 1:
        checkpoint = os.path.join(exp_dir, 'epoch-' + str(epoch - 1) + '.pt')
        load_checkpoint(checkpoint, model)
    else:
        checkpoints = [
            os.path.join(exp_dir, 'epoch-' + str(avg_epoch) + '.pt')
            for avg_epoch in range(epoch - avg, epoch)
        ]
        average_checkpoint(checkpoints, model)

    model.to(device)
    model.eval()

    if not os.path.exists(lang_dir / 'HLG.pt'):
        logging.debug("Loading L_disambig.fst.txt")
        with open(lang_dir / 'L_disambig.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
        logging.debug("Loading G.fst.txt")
        with open(lang_dir / 'G.fst.txt') as f:
            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
        first_phone_disambig_id = find_first_disambig_symbol(
            phone_symbol_table)
        first_word_disambig_id = find_first_disambig_symbol(symbol_table)
        HLG = compile_HLG(L=L,
                          G=G,
                          H=ctc_topo,
                          labels_disambig_id_start=first_phone_disambig_id,
                          aux_labels_disambig_id_start=first_word_disambig_id)
        torch.save(HLG.as_dict(), lang_dir / 'HLG.pt')
    else:
        logging.debug("Loading pre-compiled HLG")
        d = torch.load(lang_dir / 'HLG.pt')
        HLG = k2.Fsa.from_dict(d)

    logging.debug("convert HLG to device")
    HLG = HLG.to(device)
    HLG.aux_labels = k2.ragged.remove_values_eq(HLG.aux_labels, 0)
    HLG.requires_grad_(False)

    # load dataset
    feature_dir = Path('exp/data')
    test_sets = ['test-clean', 'test-other']
    for test_set in test_sets:
        logging.info(f'* DECODING: {test_set}')

        logging.debug("About to get test cuts")
        cuts_test = load_manifest(feature_dir / f'cuts_{test_set}.json.gz')
        logging.debug("About to create test dataset")
        from lhotse.dataset.input_strategies import OnTheFlyFeatures
        from lhotse import Fbank, FbankConfig
        test = K2SpeechRecognitionDataset(
            cuts_test,
            input_strategy=OnTheFlyFeatures(Fbank(
                FbankConfig(num_mel_bins=80))))
        sampler = SingleCutSampler(cuts_test, max_duration=max_duration)
        logging.debug("About to create test dataloader")
        test_dl = torch.utils.data.DataLoader(test,
                                              batch_size=None,
                                              sampler=sampler,
                                              num_workers=1)

        logging.debug("About to decode")
        results = decode(dataloader=test_dl,
                         model=model,
                         device=device,
                         HLG=HLG,
                         symbols=symbol_table)

        recog_path = exp_dir / f'recogs-{test_set}.txt'
        store_transcripts(path=recog_path, texts=results)
        logging.info(f'The transcripts are stored in {recog_path}')
        # compute WER
        dists = [edit_distance(r, h) for r, h in results]
        errors = {
            key: sum(dist[key] for dist in dists)
            for key in ['sub', 'ins', 'del', 'total']
        }
        total_words = sum(len(ref) for ref, _ in results)
        # Print Kaldi-like message:
        # %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ]
        logging.info(
            f'[{test_set}] %WER {errors["total"] / total_words:.2%} '
            f'[{errors["total"]} / {total_words}, {errors["ins"]} ins, {errors["del"]} del, {errors["sub"]} sub ]'
        )
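
A worked instance of the WER arithmetic above, plugging in the numbers from the Kaldi-style example in the comment:

errors = {'ins': 695, 'del': 427, 'sub': 3337}
errors['total'] = sum(errors.values())              # 4459
total_words = 54402
print(f"%WER {errors['total'] / total_words:.2%}")  # -> %WER 8.20%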
Example #18
    def _with_features(self, cut: Cut, frame_shift: Seconds) -> Cut:
        d = TemporaryDirectory()
        self.dirs.append(d)
        extractor = Fbank(config=FbankConfig(frame_shift=frame_shift))
        with LilcomFilesWriter(d.name) as storage:
            return cut.compute_and_store_features(extractor, storage=storage)
Example #19
def main():
    args = get_parser().parse_args()
    if args.full_libri:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100', 'train-clean-360',
                         'train-other-500')
    else:
        dataset_parts = ('dev-clean', 'dev-other', 'test-clean', 'test-other',
                         'train-clean-100')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        ), Path('/root/fangjun/data/librispeech/LibriSpeech'),
        Path('/export/common/data/corpora/ASR/openslr/SLR12/LibriSpeech'))
    musan_dir = locate_corpus(
        Path('/export/corpora5/JHU/musan'),
        Path('/export/common/data/corpora/MUSAN/musan'),
        Path('/root/fangjun/data/musan'),
    )

    output_dir = Path('exp/data')
    print('LibriSpeech manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(
                    recordings=combine(part['recordings']
                                       for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f'{output_dir}/feats_musan',
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer))
            musan_cuts.to_json(musan_cuts_path)
Example #20
def test_add_feature_sets():
    expected = DummyManifest(FeatureSet, begin_id=0, end_id=10)
    feature_set_1 = DummyManifest(FeatureSet, begin_id=0, end_id=5)
    feature_set_2 = DummyManifest(FeatureSet, begin_id=5, end_id=10)
    combined = feature_set_1 + feature_set_2
    assert combined == expected


@pytest.mark.parametrize(
    ["feature_extractor", "decimal", "exception_expectation"],
    [
        (Fbank(FbankConfig(num_mel_bins=40)), 0, does_not_raise()),
        (Spectrogram(), -1, does_not_raise()),
        (Mfcc(), None, raises(ValueError)),
    ],
)
def test_mixer(feature_extractor, decimal, exception_expectation):
    # Treat it more like a test of "it runs" rather than "it works"
    sr = 8000
    t = np.linspace(0, 1, 8000, dtype=np.float32)
    x1 = np.sin(440.0 * t).reshape(1, -1)
    x2 = np.sin(55.0 * t).reshape(1, -1)

    f1 = feature_extractor.extract(x1, sr)
    f2 = feature_extractor.extract(x2, sr)
    with exception_expectation:
        # The snippet was truncated here; the call is completed below with
        # assumed arguments following lhotse's FeatureMixer API.
        mixer = FeatureMixer(
            feature_extractor=feature_extractor,
            base_feats=f1,
            frame_shift=feature_extractor.frame_shift,
        )
        mixer.add_to_mix(f2, snr=10)
Example #21
def test_extract_features(cut):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    arr = cut.compute_features(extractor=extractor)
    assert arr.shape[0] == 100
    assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
Example #22
def main():
    args = get_parser().parse_args()
    dataset_parts = [args.subset, "DEV", "TEST"]

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path("/export/corpora5/gigaspeech"),
        Path("/exp/pzelasko/gigaspeech"),
    )
    musan_dir = locate_corpus(
        Path("/export/corpora5/JHU/musan"),
        Path("/export/common/data/corpora/MUSAN/musan"),
        Path("/root/fangjun/data/musan"),
    )

    output_dir = Path("exp/data")
    print("GigaSpeech manifest preparation:")
    gigaspeech_manifests = prepare_gigaspeech(
        corpus_dir=corpus_dir,
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        num_jobs=args.num_jobs,
    )

    print("Musan manifest preparation:")
    musan_cuts_path = output_dir / "cuts_musan.json.gz"
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=("music", "speech", "noise"))

    ctx_suffix = get_context_suffix(args)

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in gigaspeech_manifests.items():
            raw_cuts_path = output_dir / f"gigaspeech_cuts_{partition}_raw.jsonl.gz"
            cuts_path = (output_dir /
                         f"gigaspeech_cuts_{partition}{ctx_suffix}.jsonl.gz")

            if raw_cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping feature extraction."
                )
            else:
                # Note this step makes the recipe different from LibriSpeech:
                # We must filter out some utterances and remove punctuation to be consistent with Kaldi.
                print("Filtering OOV utterances from supervisions")
                manifests["supervisions"] = manifests["supervisions"].filter(
                    has_no_oov)
                print("Normalizing text in", partition)
                for sup in manifests["supervisions"]:
                    sup.text = normalize_text(sup.text)

                # Create long-recording cut manifests.
                print("Processing", partition)
                cut_set = CutSet.from_manifests(
                    recordings=manifests["recordings"],
                    supervisions=manifests["supervisions"],
                )

                # Run data augmentation that needs to be done in the time domain.
                if partition not in ["DEV", "TEST"]:
                    cut_set = (cut_set + cut_set.perturb_speed(0.9) +
                               cut_set.perturb_speed(1.1))

                cut_set.to_file(raw_cuts_path)

            if cuts_path.is_file():
                print(
                    f"{partition} already exists - skipping cutting into sub-segments."
                )
            else:
                try:
                    # If we skipped initializing `cut_set` because it exists on disk, we'll load it.
                    # This helps us avoid re-computing the features for different variants of
                    # context windows.
                    cut_set
                except NameError:
                    print(f"Reading {partition} raw cuts from disk.")
                    cut_set = CutSet.from_file(raw_cuts_path)
                # Note this step makes the recipe different from LibriSpeech:
                # Since recordings are long, the initial CutSet has very long cuts with plenty of supervisions.
                # We cut these into smaller chunks centered around each supervision, possibly adding acoustic
                # context.
                print(
                    f"About to split {partition} raw cuts into smaller chunks."
                )
                cut_set = cut_set.trim_to_supervisions(
                    keep_overlapping=False,
                    min_duration=(None if args.context_window <= 0.0
                                  else args.context_window),
                    context_direction=args.context_direction,
                )
                if partition in ["L", "XL"]:
                    # Before storing the manifests, we pre-shuffle them, as the
                    # sampler won't be able to do it later in an efficient manner.
                    cut_set = cut_set.shuffle()

                if args.precomputed_features:
                    # Extract the features after cutting large recordings into smaller cuts.
                    # Note: we support very efficient "chunked" feature reads with the argument
                    #       `storage_type=ChunkedLilcomHdf5Writer`, but we don't support efficient
                    #       data augmentation and feature computation for long recordings yet.
                    #       Therefore, we sacrifice some storage for the ability to precompute
                    #       features on shorter chunks, without memory blow-ups.
                    cut_set = cut_set.compute_and_store_features(
                        extractor=extractor,
                        storage_path=f"{output_dir}/feats_gigaspeech_{partition}",
                        # when an executor is specified, make more partitions
                        num_jobs=args.num_jobs if ex is None else 80,
                        executor=ex,
                    )

                cut_set.to_file(cuts_path)

                # Remove cut_set so the next iteration can correctly infer whether it needs to
                # load the raw cuts from disk or not.
                del cut_set

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print("Extracting features for Musan")
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = (
                CutSet.from_manifests(
                    recordings=combine(part["recordings"]
                                       for part in musan_manifests.values()))
                .cut_into_windows(10.0)
                .filter(lambda c: c.duration > 5)
                .compute_and_store_features(
                    extractor=extractor,
                    storage_path=f"{output_dir}/feats_musan",
                    num_jobs=args.num_jobs if ex is None else 80,
                    executor=ex,
                    storage_type=LilcomHdf5Writer,
                ))
            musan_cuts.to_file(musan_cuts_path)
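
`get_context_suffix` is not shown in the snippet; a plausible sketch, inferred purely from how the suffix and the `context_window`/`context_direction` arguments are used above:

def get_context_suffix(args) -> str:
    # Encode the acoustic-context settings into the cuts filename (assumption).
    if args.context_window is None or args.context_window <= 0.0:
        return ""
    return f"_{args.context_direction}{args.context_window}"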