Example #1
0
def test_extract_and_store_features_from_cut_set(cut_set, executor,
                                                 mix_eagerly):
    """Compute and store features for a whole cut set and inspect the result.

    Checks that the cut count and IDs are kept, that features are attached,
    that the recording is kept unless a ``MixedCut`` was mixed eagerly, and
    that the loaded feature matrices have the expected shapes.
    """
    fbank = Fbank()
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        # Use ``no_executor()`` when no executor factory was supplied.
        executor_ctx = executor() if executor is not None else no_executor()
        with executor_ctx as ex:
            cuts_with_feats = cut_set.compute_and_store_features(
                extractor=fbank,
                storage=storage,
                mix_eagerly=mix_eagerly,
                executor=ex,
            )

        # No cut was lost or added.
        assert len(cuts_with_feats) == 2

        for original, processed in zip(cut_set, cuts_with_feats):
            # IDs survive feature extraction.
            assert original.id == processed.id
            # Every output cut carries features.
            assert processed.has_features
            # Only eager mixing of a MixedCut is expected to drop the recording.
            eagerly_mixed = mix_eagerly and isinstance(original, MixedCut)
            assert processed.has_recording == (not eagerly_mixed)

        cuts = list(cuts_with_feats)

        # Expected number of frames for the first and second cut, in order.
        for cut, expected_frames in zip(cuts, (100, 300)):
            feats = cut.load_features()
            assert feats.shape[0] == expected_frames
            assert feats.shape[1] == fbank.feature_dim(cuts[0].sampling_rate)
def test_extract_and_store_features(cut):
    """Store features for a single cut and check the shape of what is loaded back."""
    fbank = Fbank()
    with TemporaryDirectory() as out_dir:
        featured_cut = cut.compute_and_store_features(
            extractor=fbank,
            output_dir=out_dir,
        )
        # Load while the temporary directory still exists.
        feats = featured_cut.load_features()
    assert feats.shape[0] == 100
    assert feats.shape[1] == fbank.feature_dim(cut.sampling_rate)
Example #3
0
def test_extract_and_store_features_from_cut_set(cut_set, executor, num_jobs,
                                                 storage_type, mix_eagerly):
    """Compute and store features for a cut set through ``storage_path``.

    The resulting cut set is sorted by duration before the checks so that the
    order of the cuts is the same on every run.
    """
    fbank = Fbank()
    with TemporaryDirectory() as tmpdir:
        unsorted_cuts = cut_set.compute_and_store_features(
            extractor=fbank,
            storage_path=tmpdir,
            num_jobs=num_jobs,
            mix_eagerly=mix_eagerly,
            executor=executor() if executor else None,
            storage_type=storage_type,
        )
        # Sort by duration to ensure the same order of cuts.
        cuts_with_feats = unsorted_cuts.sort_by_duration()

        # No cut was lost or added.
        assert len(cuts_with_feats) == 2

        for original, processed in zip(cut_set, cuts_with_feats):
            # IDs survive feature extraction.
            assert original.id == processed.id
            # Every output cut carries features.
            assert processed.has_features
            # Only eager mixing of a MixedCut is expected to drop the recording.
            eagerly_mixed = mix_eagerly and isinstance(original, MixedCut)
            assert processed.has_recording == (not eagerly_mixed)

        cuts = list(cuts_with_feats)

        # Expected number of frames for the first and second cut, in order.
        for cut, expected_frames in zip(cuts, (300, 100)):
            feats = cut.load_features()
            assert feats.shape[0] == expected_frames
            assert feats.shape[1] == fbank.feature_dim(cuts[0].sampling_rate)
Example #4
0
def main():
    """Prepare Aishell and Musan manifests, then extract and store Fbank features.

    For every Aishell partition whose ``cuts_<partition>.json.gz`` file does not
    exist yet in ``exp/data``, a cut set is built (with 0.9x/1.1x speed
    perturbation for partitions whose name contains ``train``), features are
    computed and the cuts are saved. Musan is then cut into 10-second windows,
    filtered to those longer than 5 seconds, and processed the same way.

    Exits with status 1 when none of the known corpus directories exists.
    ``num_jobs`` is read from the enclosing module scope.
    """
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    # The last existing candidate wins.
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        # When an executor is specified, make more partitions (80); otherwise
        # use the configured number of jobs. The Aishell branch previously had
        # this condition inverted relative to the Musan branch.
        jobs = num_jobs if ex is None else 80
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                # Augment training data with speed-perturbed copies.
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                num_jobs=jobs,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(recordings=combine(
                part['recordings']
                for part in musan_manifests.values())).cut_into_windows(
                    10.0).filter(
                        lambda c: c.duration > 5).compute_and_store_features(
                            extractor=Fbank(),
                            storage_path=f'{output_dir}/feats_musan',
                            num_jobs=jobs,
                            executor=ex,
                            storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
Example #5
0
def test_extract_and_store_features(cut):
    """Store features for a single cut with an extractor configured for 8 kHz."""
    fbank = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        featured_cut = cut.compute_and_store_features(
            extractor=fbank,
            storage=storage,
        )
        # Load while the temporary directory still exists.
        feats = featured_cut.load_features()
    assert feats.shape[0] == 100
    assert feats.shape[1] == fbank.feature_dim(cut.sampling_rate)
Example #6
0
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    """Store features for a cut appended to itself and check the loaded shape."""
    doubled_cut = cut.append(cut)
    fbank = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        featured_cut = doubled_cut.compute_and_store_features(
            extractor=fbank,
            storage=storage,
            mix_eagerly=mix_eagerly,
        )
        # Load while the temporary directory still exists.
        feats = featured_cut.load_features()
    # The test expects twice the 100 frames asserted for a single cut.
    assert feats.shape[0] == 200
    assert feats.shape[1] == fbank.feature_dim(doubled_cut.sampling_rate)
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    """Store features for a cut appended to itself, writing to ``output_dir``."""
    doubled_cut = cut.append(cut)
    fbank = Fbank()
    with TemporaryDirectory() as out_dir:
        featured_cut = doubled_cut.compute_and_store_features(
            extractor=fbank, output_dir=out_dir, mix_eagerly=mix_eagerly)
        # Load while the temporary directory still exists.
        feats = featured_cut.load_features()
    assert feats.shape[0] == 200
    assert feats.shape[1] == fbank.feature_dim(doubled_cut.sampling_rate)
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
    k2_cut_set, ):
    """Compare on-the-fly features with and without randomized smoothing.

    For every single-cut batch, the summed difference between the smoothed
    and the plain inputs must be positive.
    """
    plain_strategy = OnTheFlyFeatures(extractor=Fbank())
    dataset = K2SpeechRecognitionDataset(input_strategy=plain_strategy)

    # Use p=1.0 to ensure that smoothing is applied in this test.
    smoothing = RandomizedSmoothing(sigma=0.5, p=1.0)
    smoothed_strategy = OnTheFlyFeatures(
        extractor=Fbank(),
        wave_transforms=[smoothing],
    )
    rs_dataset = K2SpeechRecognitionDataset(input_strategy=smoothed_strategy)

    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        plain_inputs = dataset[cut_ids]["inputs"]
        smoothed_inputs = rs_dataset[cut_ids]["inputs"]
        # Additive noise should cause the energies to go up
        assert (smoothed_inputs - plain_inputs).sum() > 0
Example #9
0
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    """Check that on-the-fly features stay close to the precomputed ones.

    Also checks that the supervision frame boundaries are identical for both
    input strategies.
    """
    precomputed_dataset = K2SpeechRecognitionDataset()
    otf_strategy = OnTheFlyFeatures(
        Fbank(FbankConfig(num_mel_bins=40)),
        use_batch_extract=use_batch_extract,
        fault_tolerant=fault_tolerant,
    )
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=otf_strategy)

    sampler = SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # The precomputed and on-the-fly features are different due to mixing
        # in time/fbank domains and lilcom compression, so only require the
        # difference to be below 1% of the precomputed norm.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        assert norm_diff < 0.01 * norm_pc

        # The supervision boundaries must match exactly.
        sup_pc = batch_pc["supervisions"]
        sup_otf = batch_otf["supervisions"]
        assert (sup_pc["start_frame"] == sup_otf["start_frame"]).all()
        assert (sup_pc["num_frames"] == sup_otf["num_frames"]).all()
Example #10
0
def test_cut_set_batch_feature_extraction_resume(cut_set, overwrite):
    """Write two halves of a cut set to the same feature and manifest files.

    There is no easy way to interrupt the extraction inside a test, so a
    second CutSet is written to the same files instead; the effect is the
    same. With ``overwrite`` only the last half is expected in the manifest,
    otherwise all cuts are expected.
    """
    fbank = Fbank()
    cut_set = cut_set.resample(16000)
    halves = cut_set.split(num_splits=2)
    with NamedTemporaryFile() as feat_f, NamedTemporaryFile(
            suffix=".jsonl.gz") as manifest_f:
        processed = [
            half.compute_and_store_features_batch(
                extractor=fbank,
                storage_path=feat_f.name,
                manifest_path=manifest_f.name,
                num_workers=0,
                overwrite=overwrite,
            ) for half in halves
        ]
        feat_f.flush()
        manifest_f.flush()
        merged = load_manifest(manifest_f.name)
        expected_ids = halves[-1].ids if overwrite else cut_set.ids
        assert list(merged.ids) == list(expected_ids)
        validate(merged, read_data=True)
Example #11
0
    def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
        """Build one test dataloader per test cut set.

        Returns a list of dataloaders when ``self.test_cuts()`` returns a
        list, and a single dataloader otherwise.
        """
        cuts = self.test_cuts()
        is_list = isinstance(cuts, list)
        cut_sets = cuts if is_list else [cuts]

        loaders = []
        for cuts_test in cut_sets:
            logging.debug("About to create test dataset")
            if self.args.on_the_fly_feats:
                strategy = OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80)))
            else:
                strategy = PrecomputedFeatures()
            dataset = K2SpeechRecognitionDataset(
                input_strategy=strategy,
                return_cuts=True,
            )
            sampler = SingleCutSampler(cuts_test,
                                       max_duration=self.args.max_duration)
            logging.debug("About to create test dataloader")
            loaders.append(
                DataLoader(dataset,
                           batch_size=None,
                           sampler=sampler,
                           num_workers=1))

        return loaders if is_list else loaders[0]
    def valid_dataloaders(self) -> DataLoader:
        """Build the dataloader for the dev cuts.

        With ``self.args.on_the_fly_feats`` the features are dropped from the
        cuts and computed on the fly with an 80-bin Fbank extractor; otherwise
        the dataset is built directly from the cuts.
        """
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            # Drop the features exactly once; the same feature-less cuts are
            # used for both the dataset and the sampler below.
            cuts_valid = cuts_valid.drop_features()
            validate = K2SpeechRecognitionDataset(
                cuts_valid,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))))
        else:
            validate = K2SpeechRecognitionDataset(cuts_valid)
        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=True,
        )
        return valid_dl
Example #13
0
def main():
    """Prepare MLS French and Musan manifests and store 80-bin Fbank features.

    Partitions whose ``cuts_<partition>.json.gz`` already exists in
    ``exp/data`` are skipped. Partitions whose name contains ``train`` are
    extended with 0.9x and 1.1x speed-perturbed copies. Musan is cut into
    10-second windows, filtered to those longer than 5 seconds, and processed
    only when its cuts file is missing.
    """
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))
    output_dir = Path('exp/data')

    print('mls manifest preparation:')
    mls_manifests = prepare_mls(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        opus=False,
        num_jobs=args.num_jobs,
    )

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(
        corpus_dir=musan_dir,
        output_dir=output_dir,
        parts=('music', 'speech', 'noise'),
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        # when an executor is specified, make more partitions
        jobs = args.num_jobs if ex is None else 80

        for partition, manifests in mls_manifests.items():
            cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cut_set.perturb_speed(0.9)
                faster = cut_set.perturb_speed(1.1)
                cut_set = cut_set + slower + faster
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                num_jobs=jobs,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(cuts_path)

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            musan_recordings = combine(
                part['recordings'] for part in musan_manifests.values())
            # create chunks of Musan with duration 5 - 10 seconds
            musan_windows = CutSet.from_manifests(
                recordings=musan_recordings).cut_into_windows(10.0)
            musan_chunks = musan_windows.filter(lambda c: c.duration > 5)
            musan_cuts = musan_chunks.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=jobs,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            musan_cuts.to_json(musan_cuts_path)
Example #14
0
 def _with_features(
     self, cut: MonoCut, frame_shift: Seconds, sampling_rate: int
 ) -> MonoCut:
     """Return ``cut`` with Fbank features stored in a new temporary directory.

     The ``TemporaryDirectory`` object is appended to ``self.dirs``
     (presumably so that it outlives this call and can be cleaned up later).
     """
     tmp_dir = TemporaryDirectory()
     self.dirs.append(tmp_dir)
     fbank_config = FbankConfig(
         sampling_rate=sampling_rate, frame_shift=frame_shift
     )
     fbank = Fbank(config=fbank_config)
     with LilcomHdf5Writer(tmp_dir.name) as storage:
         cut_with_feats = cut.compute_and_store_features(fbank, storage=storage)
     return cut_with_feats
Example #15
0
 def test_wav_augment_with_executor(self, exec_type):
     """Run feature extraction on speed-perturbed cuts through an executor.

     The test has no assertions: it only checks that the call completes
     and does not hang.
     """
     cut = self.with_cut(sampling_rate=16000, num_samples=16000)
     with TemporaryDirectory() as d, \
             exec_type(max_workers=4) as ex:
         base_cuts = CutSet.from_cuts(
             cut.with_id(str(i)) for i in range(100))
         # perturb_speed uses torchaudio SoX effect that could hang
         perturbed_cuts = base_cuts.perturb_speed(1.1)
         # Just test that it runs and does not hang.
         perturbed_cuts.compute_and_store_features(
             extractor=Fbank(),
             storage_path=d,
             executor=ex,
         )
Example #16
0
 def test_wav_augment_with_executor(self, exec_type):
     """Run feature extraction with a SoX speed augmentation through an executor.

     The test has no assertions: it only checks that the call completes
     and does not hang.
     """
     cut = self.with_cut(sampling_rate=16000, num_samples=16000)
     with TemporaryDirectory() as d, \
             LilcomFilesWriter(storage_path=d) as storage, \
             exec_type(max_workers=4) as ex:
         cuts = CutSet.from_cuts(cut.with_id(str(i)) for i in range(100))
         # Just test that it runs and does not hang.
         cuts.compute_and_store_features(
             extractor=Fbank(),
             storage=storage,
             augment_fn=SoxEffectTransform(speed(16000)),
             executor=ex,
         )
Example #17
0
def test_cut_set_batch_feature_extraction_manifest_path(
        cut_set, suffix, exception_expectation):
    """Run batch feature extraction with a manifest file of the given suffix.

    Both the extraction and the validation run inside
    ``exception_expectation``, a context manager supplied by the
    parametrization.
    """
    fbank = Fbank()
    resampled_cuts = cut_set.resample(16000)
    with NamedTemporaryFile() as feat_f, NamedTemporaryFile(
            suffix=suffix) as manifest_f:
        with exception_expectation:
            cuts_with_feats = resampled_cuts.compute_and_store_features_batch(
                extractor=fbank,
                storage_path=feat_f.name,
                manifest_path=manifest_f.name,
                num_workers=0,
            )
            validate(cuts_with_feats, read_data=True)
Example #18
0
def test_on_the_fly_feats_return_audio(cut_set):
    """Check the four outputs of ``OnTheFlyFeatures`` with ``return_audio=True``.

    All outputs must be tensors with the shapes this test expects for the
    cut set resampled to 16 kHz.
    """
    from lhotse.dataset import OnTheFlyFeatures

    strategy = OnTheFlyFeatures(extractor=Fbank(), return_audio=True)
    resampled_cuts = cut_set.resample(16000)
    feats, feat_lens, audios, audio_lens = strategy(resampled_cuts)

    for output in (feats, feat_lens, audios, audio_lens):
        assert isinstance(output, torch.Tensor)

    expected_shapes = (
        (feats, (2, 300, 80)),
        (feat_lens, (2, )),
        (audios, (2, 48000)),
        (audio_lens, (2, )),
    )
    for output, shape in expected_shapes:
        assert output.shape == shape
Example #19
0
def main():
    """Prepare LibriSpeech manifests, then extract and store Fbank features.

    Partitions whose ``cuts_<partition>.json.gz`` already exists in
    ``exp/data`` are skipped. Partitions whose name contains ``train`` are
    extended with 0.9x and 1.1x speed-perturbed copies before extraction.

    Exits with status 1 when none of the known corpus directories exists.
    ``num_jobs`` is read from the enclosing module scope.
    """
    dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path(
            '/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech'
        )
    ]
    corpus_dir = None
    # The last existing candidate wins.
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                # Augment training data with speed-perturbed copies.
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            # Use the writer as a context manager so that it is closed once
            # the features of this partition have been stored.
            with LilcomFilesWriter(
                    f'{output_dir}/feats_{partition}') as storage:
                cut_set = cut_set.compute_and_store_features(
                    extractor=Fbank(),
                    executor=ex,
                    storage=storage)
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
Example #20
0
    def __init__(
        self,
        lang_dir: Pathlike,
        scripted_model_path: Optional[Pathlike] = None,
        model_dir: Optional[Pathlike] = None,
        average_epochs: Sequence[int] = (7, 8, 9),
        device: torch.device = 'cpu',
        sampling_rate: int = 16000,
    ):
        """Load the lexicon, the acoustic model and the decoding graph.

        :param lang_dir: directory used to build the ``Lexicon`` and from
            which ``HLG.pt`` is loaded.
        :param scripted_model_path: path to a serialized TorchScript module;
            used only when ``model_dir`` is None.
        :param model_dir: directory with ``epoch-{n}.pt`` checkpoints; when
            given, a ``Conformer`` is created and the checkpoints listed in
            ``average_epochs`` are averaged into it.
        :param average_epochs: epoch numbers of the checkpoints to average.
        :param device: a ``torch.device`` or a device string such as 'cpu'.
        :param sampling_rate: stored as ``self.sampling_rate``.
        :raises ValueError: when neither ``scripted_model_path`` nor
            ``model_dir`` is provided.
        """
        from pathlib import Path

        # Always set ``self.device``: previously it was assigned only for
        # string inputs, so passing a ``torch.device`` left it undefined.
        self.device = torch.device(device) if isinstance(device,
                                                         str) else device

        self.sampling_rate = sampling_rate
        self.extractor = Fbank(FbankConfig(num_mel_bins=80))
        self.lexicon = Lexicon(lang_dir)
        phone_ids = self.lexicon.phone_symbols()
        self.P = create_bigram_phone_lm(phone_ids)

        if model_dir is not None:
            # ``Pathlike`` values may be plain strings; ``/`` needs a Path.
            model_dir = Path(model_dir)
            # Read model from regular checkpoints, assume it's a Conformer
            self.model = Conformer(num_features=80,
                                   num_classes=len(phone_ids) + 1,
                                   num_decoder_layers=0)
            self.P.scores = torch.zeros_like(self.P.scores)
            self.model.P_scores = torch.nn.Parameter(self.P.scores.clone(),
                                                     requires_grad=False)
            checkpoint_paths = [
                model_dir / f'epoch-{n}.pt' for n in average_epochs
            ]
            average_checkpoint(filenames=checkpoint_paths, model=self.model)
        elif scripted_model_path is not None:
            # Read model from a serialized TorchScript module, no assumptions needed
            self.model = torch.jit.load(scripted_model_path)
        else:
            raise ValueError(
                "One of scripted_model_path or model_dir needs to be provided."
            )

        # Freeze the params by default.
        for p in self.model.parameters():
            p.requires_grad_(False)
        self.compiler = MmiTrainingGraphCompiler(lexicon=self.lexicon,
                                                 device=self.device)
        # Same string-vs-Path concern as for ``model_dir`` above.
        hlg_path = Path(lang_dir) / 'HLG.pt'
        self.HLG = k2.Fsa.from_dict(torch.load(hlg_path)).to(self.device)
Example #21
0
    def valid_dataloaders(self) -> DataLoader:
        """Build the dev dataloader, a ``LhotseDataLoader`` over the dev cuts.

        Cuts are optionally concatenated (``self.args.concatenate_cuts``) and
        features are optionally computed on the fly with an 80-bin Fbank
        extractor (``self.args.on_the_fly_feats``).
        """
        self.validate_args()
        logging.info("About to get dev cuts")
        cuts_valid = self.valid_cuts()

        transforms = []
        if self.args.concatenate_cuts:
            concatenate = CutConcatenate(
                duration_factor=self.args.duration_factor,
                gap=self.args.gap,
            )
            transforms = [concatenate] + transforms

        logging.info("About to create dev dataset")
        dataset_kwargs = {"cut_transforms": transforms}
        if self.args.on_the_fly_feats:
            dataset_kwargs["input_strategy"] = OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80)),
                num_workers=8,
            )
        dataset_kwargs["return_cuts"] = self.args.return_cuts
        validate = K2SpeechRecognitionDataset(**dataset_kwargs)

        valid_sampler = SingleCutSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        # A plain torch DataLoader (batch_size=None, num_workers=8,
        # persistent_workers=True) was used here before and is kept only as
        # a note; LhotseDataLoader is what is actually returned.
        valid_dl = LhotseDataLoader(
            validate,
            sampler=valid_sampler,
            num_workers=2,
        )
        return valid_dl
Example #22
0
def test_padded_cut_num_frames_and_samples_are_consistent(
        sampling_rate, num_samples, padded_duration):
    """Check that a padded cut's metadata matches its loaded features and audio."""
    with make_cut(sampling_rate, num_samples) as cut, \
            TemporaryDirectory() as tmp_dir, \
            LilcomFilesWriter(tmp_dir) as storage:
        cut_with_feats = cut.compute_and_store_features(
            extractor=Fbank(),
            storage=storage,
        )
        padded = cut_with_feats.pad(padded_duration)
        feats = padded.load_features()
        samples = padded.load_audio()

        # Feature matrix agrees with the frame/feature counts in the metadata.
        assert padded.has_features
        assert feats.shape[0] == padded.num_frames
        assert feats.shape[1] == padded.num_features

        # Audio has a single channel and the advertised number of samples.
        assert padded.has_recording
        assert samples.shape[0] == 1
        assert samples.shape[1] == padded.num_samples
Example #23
0
def main():
    """Prepare Heroico manifests and store 80-bin Fbank features per partition.

    The extractor uses a frame shift of 0.02. Partitions whose
    ``cuts_<partition>.json.gz`` already exists in ``exp/data`` are skipped,
    and partitions whose name contains ``train`` are extended with 0.9x and
    1.1x speed-perturbed copies.
    """
    args = get_parser().parse_args()
    dataset_parts = ('devtest', 'test', 'train')

    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/LDC2006S37/data'))
    output_dir = Path('exp/data')

    print('Heroico manifest preparation:')
    transcripts_dir = Path.joinpath(corpus_dir, 'transcripts')
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    with get_executor() as ex:  # Initialize the executor only once.
        # when an executor is specified, make more partitions
        jobs = args.num_jobs if ex is None else 80

        for partition, manifests in heroico_manifests.items():
            cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'],
            )
            if 'train' in partition:
                slower = cut_set.perturb_speed(0.9)
                faster = cut_set.perturb_speed(1.1)
                cut_set = cut_set + slower + faster
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                num_jobs=jobs,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            heroico_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(cuts_path)
Example #24
0
def main():
    """Download and prepare AMI (sdm), then store 80-bin Fbank features.

    Each partition is cut into 5-second windows before extraction and the
    resulting cuts are padded to 5.0 seconds before being saved. Partitions
    whose ``cuts_<partition>.json.gz`` already exists in ``exp/data`` are
    skipped.
    """
    args = get_parser().parse_args()

    corpus_dir = locate_corpus(Path("/export/corpora5/AMI/amicorpus"))
    annotations_dir = Path("/export/c07/draj")

    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")

    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in ami_manifests.items():
            cuts_path = output_dir / f"cuts_{partition}.json.gz"
            if cuts_path.is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            full_cuts = CutSet.from_manifests(
                recordings=manifests["recordings"],
                supervisions=manifests["supervisions"],
            )
            cut_set = full_cuts.cut_into_windows(duration=5)
            # when an executor is specified, make more partitions
            if ex is None:
                jobs = args.num_jobs
            else:
                jobs = min(80, len(cut_set))
            cuts_with_feats = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                num_jobs=jobs,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            cut_set = cuts_with_feats.pad(duration=5.0)
            cut_set.to_json(cuts_path)
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    """Check that on-the-fly features stay close to the precomputed ones.

    Also checks that the supervision frame boundaries are identical for both
    datasets.
    """
    precomputed_dataset = K2SpeechRecognitionDataset(k2_cut_set)
    cuts_without_feats = k2_cut_set.drop_features()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        cuts_without_feats,
        input_strategy=OnTheFlyFeatures(Fbank()),
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]

        # The precomputed and on-the-fly features are different due to mixing
        # in time/fbank domains and lilcom compression, so only require the
        # difference to be below 1% of the precomputed norm.
        norm_pc = torch.linalg.norm(batch_pc['inputs'])
        norm_diff = torch.linalg.norm(batch_pc['inputs'] - batch_otf['inputs'])
        assert norm_diff < 0.01 * norm_pc

        # The supervision boundaries must match exactly.
        sup_pc = batch_pc['supervisions']
        sup_otf = batch_otf['supervisions']
        assert (sup_pc['start_frame'] == sup_otf['start_frame']).all()
        assert (sup_pc['num_frames'] == sup_otf['num_frames']).all()
def test_mixed_cut_num_frames_example_1():
    """Check frame counts of two cuts joined with a 1 s gap and padded to 31.445 s."""
    fbank = Fbank()
    with make_cut(sampling_rate=16000, num_samples=237920) as cut1, \
            make_cut(sampling_rate=16000, num_samples=219600) as cut2, \
            TemporaryDirectory() as d, \
            LilcomFilesWriter(d) as storage:
        # These are two cuts of similar duration, concatenated together with
        # 1 second of silence in between, and padded to duration of 31.445.
        first = cut1.compute_and_store_features(fbank, storage)
        first_with_gap = first.pad(duration=cut1.duration + 1.0)
        second = cut2.compute_and_store_features(fbank, storage)
        mixed: MixedCut = first_with_gap.append(second).pad(duration=31.445)

        assert mixed.duration == 31.445  # Padded correctly
        assert mixed.num_frames == 3145  # Round last 5 up

        # Since the tracks do not overlap in this example, the sum of
        # individual cut num_frames should be equal to the total num_frames.
        track_frames = sum(t.cut.num_frames for t in mixed.tracks)
        assert track_frames == 3145

        # Loaded features num frames matches the meta-data.
        features = mixed.load_features()
        assert features.shape[0] == 3145
def test_mixed_cut_num_frames_example_2():
    """Check frame counts of three cuts joined with 1 s gaps between them."""
    fbank = Fbank()
    with make_cut(sampling_rate=16000, num_samples=252879) as cut1, \
            make_cut(sampling_rate=16000, num_samples=185280) as cut2, \
            make_cut(sampling_rate=16000, num_samples=204161) as cut3, \
            TemporaryDirectory() as d, \
            LilcomFilesWriter(d) as storage:
        # Three cuts are concatenated, each pair separated by 1 second of
        # padding; the expected total duration is 42.145.
        first = cut1.compute_and_store_features(fbank, storage)
        first_with_gap = first.pad(duration=cut1.duration + 1.0)
        second = cut2.compute_and_store_features(fbank, storage)
        mixed: MixedCut = first_with_gap.append(second)

        mixed_with_gap = mixed.pad(duration=mixed.duration + 1.0)
        third = cut3.compute_and_store_features(fbank, storage)
        mixed = mixed_with_gap.append(third)

        assert mixed.duration == 42.145  # Padded correctly
        assert mixed.num_frames == 4215  # Round last 5 up
        # TODO(pzelasko): This assertion would not pass for now, as we're adding an extra frame during load_features.
        # assert sum(t.cut.num_frames for t in mixed.tracks) == 4215  # Since the tracks do not overlap in this example,
        # The sum of individual cut num_frames should be equal to the total num_frames

        # Loaded features num frames matches the meta-data.
        features = mixed.load_features()
        assert features.shape[0] == 4215
Example #28
0
def main():
    """Prepare Aishell manifests and extract Fbank features per partition.

    The corpus location is looked up among the hard-coded ``corpus_dirs``;
    if none of them exists, a hint is printed and the process exits with
    status 1. Manifests are prepared into ``exp/data``. For every partition
    whose ``cuts_<partition>.json.gz`` is not already present, a CutSet is
    built from the recordings and supervisions, features are computed and
    stored under ``exp/data/feats_<partition>``, and the resulting cuts are
    written to ``cuts_<partition>.json.gz``. Partitions with ``train`` in
    their name are additionally augmented with 0.9x and 1.1x speed
    perturbed copies before feature extraction.
    """
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    # No early exit from the loop: if several candidates exist, the last
    # existing one is used.
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            cuts_path = output_dir / f'cuts_{partition}.json.gz'
            if cuts_path.is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = cut_set + cut_set.perturb_speed(
                    0.9) + cut_set.perturb_speed(1.1)
            # Use the writer as a context manager, like the other usages in
            # this file, so that it is always closed once the features for
            # this partition have been stored.
            with LilcomFilesWriter(
                    f'{output_dir}/feats_{partition}') as storage:
                cut_set = cut_set.compute_and_store_features(
                    extractor=Fbank(),
                    executor=ex,
                    storage=storage)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(cuts_path)
Example #29
0
@pytest.fixture
def libri_cut_set():
    """Build a four-cut CutSet derived from the first cut of the libri fixture.

    The set holds the first cut itself, two copies of it with the IDs
    ``copy-1`` and ``copy-2``, and the result of appending the cut to itself.
    """
    fixture_cuts = CutSet.from_json("test/fixtures/libri/cuts.json")
    first = fixture_cuts[0]
    members = [first, first.with_id("copy-1"), first.with_id("copy-2")]
    members.append(first.append(first))
    return CutSet.from_cuts(members)


@pytest.mark.parametrize(
    "batchio",
    [
        AudioSamples,
        PrecomputedFeatures,
        partial(OnTheFlyFeatures, Fbank()),
    ],
)
@pytest.mark.parametrize("num_workers", [0, 1, 2])
@pytest.mark.parametrize(
    "executor_type",
    [ThreadPoolExecutor, ProcessPoolExecutor],
)
def test_batch_io(libri_cut_set, batchio, num_workers, executor_type):
    """Smoke test: each input strategy can be built and applied to the cuts.

    Nothing is asserted about the output; the test passes as long as
    constructing the strategy and calling it on ``libri_cut_set`` completes
    (does not fail / hang / etc.).
    """
    strategy_kwargs = dict(num_workers=num_workers,
                           executor_type=executor_type)
    strategy = batchio(**strategy_kwargs)
    strategy(libri_cut_set)


def test_audio_samples_with_custom_field(libri_cut_set):
    """Set up an ``AudioSamples`` reader and a helper that adds a custom audio field.

    NOTE(review): in the visible body neither ``batchio`` nor
    ``attach_custom_audio`` is used after being defined, and nothing is
    asserted -- the remainder of this test is presumably missing from this
    excerpt; confirm against the complete test before relying on it.
    """
    batchio = AudioSamples()

    def attach_custom_audio(cut):
        """Simulate adding an additional custom recording.

        Stores the result of ``cut.recording.perturb_volume(factor=1.1)``
        on the cut as the ``my_favorite_song`` attribute.
        """
        cut.my_favorite_song = cut.recording.perturb_volume(factor=1.1)
def main():
    """Train a Transformer or Conformer acoustic model, saving checkpoints per epoch.

    Steps, in order: parse the command-line arguments, fix the random seed,
    set up logging and (optionally) a TensorBoard writer, load the symbol
    tables and the inverted ``L`` FSA, build the train/dev datasets,
    samplers and dataloaders, create the model and the Noam optimizer,
    optionally resume from the checkpoint of epoch ``start_epoch - 1``, and
    finally run the epoch loop. After every epoch a checkpoint and an info
    file are written; a separate "best" checkpoint and info file are
    written whenever the validation objective improves.

    The process terminates via ``sys.exit(-1)`` when CUDA is not available.
    """
    args = get_parser().parse_args()

    # Pull the frequently used options into locals.
    model_type = args.model_type
    start_epoch = args.start_epoch
    num_epochs = args.num_epochs
    max_duration = args.max_duration
    accum_grad = args.accum_grad
    att_rate = args.att_rate

    fix_random_seed(42)

    # All outputs (logs, TensorBoard events, checkpoints) go under a
    # directory whose name depends on the selected model type.
    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-train'.format(exp_dir))
    # TensorBoard logging is optional; tb_writer stays None when disabled.
    tb_writer = SummaryWriter(
        log_dir=f'{exp_dir}/tensorboard') if args.tensorboard else None

    # Load the phone and word symbol tables and the inverted L FSA.
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    # Reuse the cached Linv.pt when present; otherwise build it from the
    # OpenFst text file (invert, then arc-sort) and cache it for later runs.
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = CtcTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)

    # Load the cut manifests for training, validation and Musan.
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = load_manifest(feature_dir / 'cuts_train-clean-100.json.gz')
    # With --full-libri the 360h and 500h training subsets are added on top
    # of train-clean-100.
    if args.full_libri:
        cuts_train = (
            cuts_train +
            load_manifest(feature_dir / 'cuts_train-clean-360.json.gz') +
            load_manifest(feature_dir / 'cuts_train-other-500.json.gz'))
    logging.info("About to get dev cuts")
    cuts_dev = (load_manifest(feature_dir / 'cuts_dev-clean.json.gz') +
                load_manifest(feature_dir / 'cuts_dev-other.json.gz'))
    logging.info("About to get Musan cuts")
    cuts_musan = load_manifest(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    # Cut-level transforms: mix in Musan cuts with probability 0.5 and an
    # SNR setting of (10, 20).
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if args.concatenate_cuts:
        logging.info(
            f'Using cut concatenation with duration factor {args.duration_factor} and gap {args.gap}.'
        )
        # Cut concatenation should be the first transform in the list,
        # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
        transforms = [
            CutConcatenate(duration_factor=args.duration_factor, gap=args.gap)
        ] + transforms
    # Default training dataset; no input_strategy is passed here, so the
    # dataset's default one applies.
    train = K2SpeechRecognitionDataset(cuts_train,
                                       cut_transforms=transforms,
                                       input_transforms=[
                                           SpecAugment(num_frame_masks=2,
                                                       features_mask_size=27,
                                                       num_feature_masks=2,
                                                       frames_mask_size=100)
                                       ])

    # NOTE(review): when on-the-fly features are requested, the dataset built
    # just above is discarded and replaced by the one constructed below.
    if args.on_the_fly_feats:
        # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
        # # Add on-the-fly speed perturbation; since originally it would have increased epoch
        # # size by 3, we will apply prob 2/3 and use 3x more epochs.
        # # Speed perturbation probably should come first before concatenation,
        # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
        # Drop feats to be on the safe side.
        cuts_train = cuts_train.drop_features()
        # Function-scope import; FbankConfig is also used further down when
        # the dev dataset is built under the same on_the_fly_feats condition.
        from lhotse.features.fbank import FbankConfig
        train = K2SpeechRecognitionDataset(
            cuts=cuts_train,
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(Fbank(
                FbankConfig(num_mel_bins=80))),
            input_transforms=[
                SpecAugment(num_frame_masks=2,
                            features_mask_size=27,
                            num_feature_masks=2,
                            frames_mask_size=100)
            ])

    # Choose the training sampler; both variants shuffle and are limited by
    # max_duration.
    if args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(cuts_train,
                                         max_duration=max_duration,
                                         shuffle=True,
                                         num_buckets=args.num_buckets)
    else:
        logging.info('Using SingleCutSampler.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_duration=max_duration,
            shuffle=True,
        )
    logging.info("About to create train dataloader")
    # batch_size=None turns off the DataLoader's automatic batching;
    # presumably the sampler/dataset pair already yields whole batches --
    # TODO confirm against the dataset implementation.
    train_dl = torch.utils.data.DataLoader(
        train,
        sampler=train_sampler,
        batch_size=None,
        num_workers=4,
    )

    logging.info("About to create dev dataset")
    if args.on_the_fly_feats:
        # NOTE(review): drop_features() is applied to cuts_dev here and then
        # once more on the argument below; the second call looks redundant.
        cuts_dev = cuts_dev.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_dev.drop_features(),
            input_strategy=OnTheFlyFeatures(Fbank(
                FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_dev)
    # The validation sampler is created without shuffle=True, unlike the
    # training samplers.
    valid_sampler = SingleCutSampler(
        cuts_dev,
        max_duration=max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    # Training requires a GPU; abort when CUDA is unavailable.
    # NOTE(review): this check runs only after all manifests and datasets
    # have been loaded.
    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    # Always use the first CUDA device.
    device_id = 0
    device = torch.device('cuda', device_id)

    # Decoder layers are only created when att_rate is non-zero.
    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    # Any model_type other than "transformer" selects the Conformer.
    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    model.to(device)
    describe(model)

    optimizer = Noam(model.parameters(),
                     model_size=args.attention_dim,
                     factor=1.0,
                     warm_step=args.warm_step)

    # Bookkeeping for the best result so far (lower objective is better).
    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    # Resume: restore model and optimizer state from the checkpoint of the
    # epoch preceding start_epoch, and seed the "best" values from it.
    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        # Tell the sampler which epoch is being run.
        train_sampler.set_epoch(epoch)
        # The learning rate is read from the optimizer's private `_rate`
        # attribute at the start of the epoch.
        curr_learning_rate = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                                 global_batch_idx_train)
            tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        # Run one epoch; returns the training objective, the validation
        # objective and the updated global batch counter.
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            accum_grad=accum_grad,
            att_rate=att_rate,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )
        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            # The best-model checkpoint is saved without optimizer state
            # (optimizer=None), unlike the per-epoch checkpoint below.
            save_checkpoint(filename=best_model_path,
                            optimizer=None,
                            scheduler=None,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        # (including the optimizer state, which the resume path above loads)
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        optimizer=optimizer,
                        scheduler=None,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir,
                                           'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch)

    # Logged at WARNING level.
    logging.warning('Done')