def test_extract_and_store_features_from_cut_set(cut_set, executor, mix_eagerly):
    extractor = Fbank()
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        with executor() if executor is not None else no_executor() as ex:
            cut_set_with_feats = cut_set.compute_and_store_features(
                extractor=extractor,
                storage=storage,
                mix_eagerly=mix_eagerly,
                executor=ex)

        # The same number of cuts
        assert len(cut_set_with_feats) == 2

        for orig_cut, feat_cut in zip(cut_set, cut_set_with_feats):
            # The ID is retained
            assert orig_cut.id == feat_cut.id
            # Features were attached
            assert feat_cut.has_features
            # Recording is retained unless mixing a MixedCut eagerly
            should_have_recording = not (mix_eagerly and isinstance(orig_cut, MixedCut))
            assert feat_cut.has_recording == should_have_recording

        cuts = list(cut_set_with_feats)

        arr = cuts[0].load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)

        arr = cuts[1].load_features()
        assert arr.shape[0] == 300
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)
def test_extract_and_store_features(cut):
    extractor = Fbank()
    with TemporaryDirectory() as tmpdir:
        cut_with_feats = cut.compute_and_store_features(extractor=extractor,
                                                        output_dir=tmpdir)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
def test_extract_and_store_features_from_cut_set(cut_set, executor, num_jobs,
                                                 storage_type, mix_eagerly):
    extractor = Fbank()
    with TemporaryDirectory() as tmpdir:
        cut_set_with_feats = cut_set.compute_and_store_features(
            extractor=extractor,
            storage_path=tmpdir,
            num_jobs=num_jobs,
            mix_eagerly=mix_eagerly,
            executor=executor() if executor else None,
            storage_type=storage_type,
        ).sort_by_duration()  # sort by duration to ensure the same order of cuts

        # The same number of cuts
        assert len(cut_set_with_feats) == 2

        for orig_cut, feat_cut in zip(cut_set, cut_set_with_feats):
            # The ID is retained
            assert orig_cut.id == feat_cut.id
            # Features were attached
            assert feat_cut.has_features
            # Recording is retained unless mixing a MixedCut eagerly
            should_have_recording = not (mix_eagerly and isinstance(orig_cut, MixedCut))
            assert feat_cut.has_recording == should_have_recording

        cuts = list(cut_set_with_feats)

        arr = cuts[0].load_features()
        assert arr.shape[0] == 300
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)

        arr = cuts[1].load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cuts[0].sampling_rate)
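# Note: the fixtures and parametrization used by the two cut-set extraction tests
# above are not shown in this collection. A minimal sketch of what they appear to
# assume (names, values, and the fixture path below are illustrative assumptions,
# not the project's actual conftest.py): a CutSet with one ~1 s and one ~3 s cut,
# so that Fbank's default 10 ms frame shift yields 100 and 300 frames respectively.
import pytest
from concurrent.futures import ProcessPoolExecutor
from lhotse import CutSet, LilcomFilesWriter, LilcomHdf5Writer


@pytest.fixture
def cut_set():
    # Hypothetical fixture manifest; the real fixture may build the cuts programmatically.
    return CutSet.from_json("test/fixtures/feature_extraction/cuts.json")


# Parametrization resembling what the test signatures above suggest.
@pytest.mark.parametrize("mix_eagerly", [False, True])
@pytest.mark.parametrize("num_jobs", [1, 2])
@pytest.mark.parametrize("storage_type", [LilcomFilesWriter, LilcomHdf5Writer])
@pytest.mark.parametrize("executor", [None, ProcessPoolExecutor])
def test_stub(cut_set, executor, num_jobs, storage_type, mix_eagerly):
    ...  # the real test bodies are the ones shown above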
def main():
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('aishell manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir='/export/corpora5/JHU/musan',
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(
                    part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=Fbank(),
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
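# `get_executor()` used by the data-prep scripts in this collection is not shown here.
# A minimal sketch of what it plausibly provides (an assumption; the real
# snowfall/icefall helper may prefer a cluster executor and only fall back to local
# processes): a context manager yielding an Executor (or None), so `executor=ex`
# works either way downstream.
from contextlib import contextmanager
from concurrent.futures import ProcessPoolExecutor


@contextmanager
def get_executor(max_workers: int = 16):  # max_workers is an assumed default
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        yield ex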
def test_extract_and_store_features(cut):
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = cut.compute_and_store_features(extractor=extractor,
                                                        storage=storage)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 100
        assert arr.shape[1] == extractor.feature_dim(cut.sampling_rate)
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    mixed_cut = cut.append(cut)
    extractor = Fbank(FbankConfig(sampling_rate=8000))
    with TemporaryDirectory() as tmpdir, LilcomFilesWriter(tmpdir) as storage:
        cut_with_feats = mixed_cut.compute_and_store_features(
            extractor=extractor,
            storage=storage,
            mix_eagerly=mix_eagerly)
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 200
        assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
def test_extract_and_store_features_from_mixed_cut(cut, mix_eagerly):
    mixed_cut = cut.append(cut)
    extractor = Fbank()
    with TemporaryDirectory() as tmpdir:
        cut_with_feats = mixed_cut.compute_and_store_features(
            extractor=extractor,
            output_dir=tmpdir,
            mix_eagerly=mix_eagerly
        )
        arr = cut_with_feats.load_features()
        assert arr.shape[0] == 200
        assert arr.shape[1] == extractor.feature_dim(mixed_cut.sampling_rate)
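# The `cut` fixture shared by the single-cut and mixed-cut tests above is not
# included in this collection. A hedged sketch consistent with the expected frame
# counts (the path and ID are hypothetical): a 1-second MonoCut, so Fbank's default
# 10 ms frame shift gives 100 frames and `cut.append(cut)` gives a 2-second MixedCut
# with 200 frames; the FbankConfig(sampling_rate=8000) variants suggest an 8 kHz
# recording.
import pytest
from lhotse import MonoCut, Recording


@pytest.fixture
def cut():
    recording = Recording.from_file("test/fixtures/mono_8k.wav")  # hypothetical file
    return MonoCut(id="cut", start=0.0, duration=1.0, channel=0, recording=recording)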
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
        k2_cut_set,
):
    dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(extractor=Fbank()))
    rs_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            extractor=Fbank(),
            # Use p=1.0 to ensure that smoothing is applied in this test.
            wave_transforms=[RandomizedSmoothing(sigma=0.5, p=1.0)],
        ))
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch = dataset[cut_ids]
        rs_batch = rs_dataset[cut_ids]
        # Additive noise should cause the energies to go up
        assert (rs_batch["inputs"] - batch["inputs"]).sum() > 0
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        ))
    sampler = SimpleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]
        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        # The precomputed and on-the-fly features are different due to mixing
        # in time/fbank domains and lilcom compression.
        assert norm_diff < 0.01 * norm_pc
        # Check that the supervision boundaries are the same.
        assert (batch_pc["supervisions"]["start_frame"] ==
                batch_otf["supervisions"]["start_frame"]).all()
        assert (batch_pc["supervisions"]["num_frames"] ==
                batch_otf["supervisions"]["num_frames"]).all()
def test_cut_set_batch_feature_extraction_resume(cut_set, overwrite):
    # This test checks that we can keep writing to the same file
    # and the previously written results are not lost.
    # Since we don't have an easy way to interrupt the execution in a test,
    # we just write another CutSet to the same file.
    # The effect is the same.
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    subsets = cut_set.split(num_splits=2)
    processed = []
    with NamedTemporaryFile() as feat_f, NamedTemporaryFile(
            suffix=".jsonl.gz") as manifest_f:
        for cuts in subsets:
            processed.append(
                cuts.compute_and_store_features_batch(
                    extractor=extractor,
                    storage_path=feat_f.name,
                    manifest_path=manifest_f.name,
                    num_workers=0,
                    overwrite=overwrite,
                ))
        feat_f.flush()
        manifest_f.flush()
        merged = load_manifest(manifest_f.name)
        if overwrite:
            assert list(merged.ids) == list(subsets[-1].ids)
        else:
            assert list(merged.ids) == list(cut_set.ids)
        validate(merged, read_data=True)
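# For context, a standalone sketch of the batch feature-extraction API exercised by
# the resume test above, restricted to the arguments that test itself demonstrates;
# all paths below are placeholders.
from lhotse import CutSet, Fbank

cuts = CutSet.from_file("data/cuts_train.jsonl.gz")  # hypothetical input manifest
cuts_with_feats = cuts.compute_and_store_features_batch(
    extractor=Fbank(),
    storage_path="data/fbank_train",                 # feature archive location
    manifest_path="data/cuts_train_feats.jsonl.gz",  # written incrementally, which enables resuming
    num_workers=4,
    overwrite=False,  # keep previously extracted cuts when re-running
)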
def test_dataloaders(self) -> Union[DataLoader, List[DataLoader]]:
    cuts = self.test_cuts()
    is_list = isinstance(cuts, list)
    test_loaders = []
    if not is_list:
        cuts = [cuts]

    for cuts_test in cuts:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=(
                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
                else PrecomputedFeatures()),
            return_cuts=True,
        )
        sampler = SingleCutSampler(cuts_test,
                                   max_duration=self.args.max_duration)
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(test,
                             batch_size=None,
                             sampler=sampler,
                             num_workers=1)
        test_loaders.append(test_dl)

    if is_list:
        return test_loaders
    else:
        return test_loaders[0]
def valid_dataloaders(self) -> DataLoader:
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()

    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        cuts_valid = cuts_valid.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_valid.drop_features(),
            input_strategy=OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_valid)
    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = DataLoader(
        validate,
        sampler=valid_sampler,
        batch_size=None,
        num_workers=2,
        persistent_workers=True,
    )
    return valid_dl
def main():
    args = get_parser().parse_args()
    dataset_parts = ('dev', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(Path('/mnt/corpora/MLS_French'))
    musan_dir = locate_corpus(Path('/mnt/corpora/musan'))
    output_dir = Path('exp/data')

    print('mls manifest preparation:')
    mls_manifests = prepare_mls(corpus_dir=corpus_dir,
                                output_dir=output_dir,
                                opus=False,
                                num_jobs=args.num_jobs)

    print('Musan manifest preparation:')
    musan_cuts_path = output_dir / 'cuts_musan.json.gz'
    musan_manifests = prepare_musan(corpus_dir=musan_dir,
                                    output_dir=output_dir,
                                    parts=('music', 'speech', 'noise'))

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in mls_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            mls_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')

        # Now onto Musan
        if not musan_cuts_path.is_file():
            print('Extracting features for Musan')
            # create chunks of Musan with duration 5 - 10 seconds
            musan_cuts = CutSet.from_manifests(
                recordings=combine(
                    part['recordings'] for part in musan_manifests.values())
            ).cut_into_windows(10.0).filter(
                lambda c: c.duration > 5
            ).compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_musan',
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer)
            musan_cuts.to_json(musan_cuts_path)
def _with_features(
    self, cut: MonoCut, frame_shift: Seconds, sampling_rate: int
) -> MonoCut:
    d = TemporaryDirectory()
    self.dirs.append(d)
    extractor = Fbank(
        config=FbankConfig(sampling_rate=sampling_rate, frame_shift=frame_shift)
    )
    with LilcomHdf5Writer(d.name) as storage:
        return cut.compute_and_store_features(extractor, storage=storage)
def test_wav_augment_with_executor(self, exec_type):
    cut = self.with_cut(sampling_rate=16000, num_samples=16000)
    with TemporaryDirectory() as d, \
            exec_type(max_workers=4) as ex:
        cut_set = CutSet.from_cuts(
            cut.with_id(str(i)) for i in range(100)
        ).perturb_speed(1.1)  # perturb_speed uses torchaudio SoX effect that could hang
        # Just test that it runs and does not hang.
        cut_set_feats = cut_set.compute_and_store_features(
            extractor=Fbank(),
            storage_path=d,
            executor=ex)
def test_wav_augment_with_executor(self, exec_type):
    cut = self.with_cut(sampling_rate=16000, num_samples=16000)
    with TemporaryDirectory() as d, \
            LilcomFilesWriter(storage_path=d) as storage, \
            exec_type(max_workers=4) as ex:
        cut_set = CutSet.from_cuts(cut.with_id(str(i)) for i in range(100))
        # Just test that it runs and does not hang.
        cut_set_feats = cut_set.compute_and_store_features(
            extractor=Fbank(),
            storage=storage,
            augment_fn=SoxEffectTransform(speed(16000)),
            executor=ex)
def test_cut_set_batch_feature_extraction_manifest_path(
        cut_set, suffix, exception_expectation):
    extractor = Fbank()
    cut_set = cut_set.resample(16000)
    with NamedTemporaryFile() as feat_f, NamedTemporaryFile(
            suffix=suffix) as manifest_f:
        with exception_expectation:
            cut_set_with_feats = cut_set.compute_and_store_features_batch(
                extractor=extractor,
                storage_path=feat_f.name,
                manifest_path=manifest_f.name,
                num_workers=0,
            )
            validate(cut_set_with_feats, read_data=True)
def test_on_the_fly_feats_return_audio(cut_set):
    from lhotse.dataset import OnTheFlyFeatures

    extractor = OnTheFlyFeatures(extractor=Fbank(), return_audio=True)
    cut_set = cut_set.resample(16000)
    feats, feat_lens, audios, audio_lens = extractor(cut_set)
    assert isinstance(feats, torch.Tensor)
    assert isinstance(feat_lens, torch.Tensor)
    assert isinstance(audios, torch.Tensor)
    assert isinstance(audio_lens, torch.Tensor)
    assert feats.shape == (2, 300, 80)
    assert feat_lens.shape == (2,)
    assert audios.shape == (2, 48000)
    assert audio_lens.shape == (2,)
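# Shape bookkeeping for the assertions above, assuming this lhotse version's Fbank
# defaults (80 mel bins, 10 ms frame shift) and a fixture with two 3-second cuts
# resampled to 16 kHz:
#   feats:  (2 cuts, 3.0 s / 0.01 s = 300 frames, 80 bins)
#   audios: (2 cuts, 3.0 s * 16000 Hz = 48000 samples)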
def main():
    dataset_parts = ('dev-clean', 'test-clean', 'train-clean-100')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dirs = [
        Path('/export/corpora5/LibriSpeech'),
        Path('/home/storage04/zhuangweiji/data/open-source-data/librispeech/LibriSpeech')
    ]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Librispeech data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    librispeech_manifests = prepare_librispeech(corpus_dir=corpus_dir,
                                                dataset_parts=dataset_parts,
                                                output_dir=output_dir,
                                                num_jobs=num_jobs)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in librispeech_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                executor=ex,
                storage=LilcomFilesWriter(f'{output_dir}/feats_{partition}'))
            librispeech_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
def __init__(
    self,
    lang_dir: Pathlike,
    scripted_model_path: Optional[Pathlike] = None,
    model_dir: Optional[Pathlike] = None,
    average_epochs: Sequence[int] = (7, 8, 9),
    device: torch.device = 'cpu',
    sampling_rate: int = 16000,
):
    if isinstance(device, str):
        self.device = torch.device(device)
    else:
        self.device = device
    self.sampling_rate = sampling_rate
    self.extractor = Fbank(FbankConfig(num_mel_bins=80))
    self.lexicon = Lexicon(lang_dir)
    phone_ids = self.lexicon.phone_symbols()
    self.P = create_bigram_phone_lm(phone_ids)
    if model_dir is not None:
        # Read model from regular checkpoints, assume it's a Conformer
        self.model = Conformer(num_features=80,
                               num_classes=len(phone_ids) + 1,
                               num_decoder_layers=0)
        self.P.scores = torch.zeros_like(self.P.scores)
        self.model.P_scores = torch.nn.Parameter(self.P.scores.clone(),
                                                 requires_grad=False)
        average_checkpoint(
            filenames=[model_dir / f'epoch-{n}.pt' for n in average_epochs],
            model=self.model)
    elif scripted_model_path is not None:
        # Read model from a serialized TorchScript module, no assumptions needed
        self.model = torch.jit.load(scripted_model_path)
    else:
        raise ValueError(
            "One of scripted_model_path or model_dir needs to be provided.")
    # Freeze the params by default.
    for p in self.model.parameters():
        p.requires_grad_(False)
    self.compiler = MmiTrainingGraphCompiler(lexicon=self.lexicon,
                                             device=self.device)
    self.HLG = k2.Fsa.from_dict(torch.load(lang_dir / 'HLG.pt')).to(self.device)
def valid_dataloaders(self) -> DataLoader:
    self.validate_args()
    logging.info("About to get dev cuts")
    cuts_valid = self.valid_cuts()

    transforms = []
    if self.args.concatenate_cuts:
        transforms = [
            CutConcatenate(duration_factor=self.args.duration_factor,
                           gap=self.args.gap)
        ] + transforms

    logging.info("About to create dev dataset")
    if self.args.on_the_fly_feats:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80)), num_workers=8),
            return_cuts=self.args.return_cuts,
        )
    else:
        validate = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            return_cuts=self.args.return_cuts,
        )
    valid_sampler = SingleCutSampler(
        cuts_valid,
        max_duration=self.args.max_duration,
        shuffle=False,
    )
    logging.info("About to create dev dataloader")
    # valid_dl = DataLoader(
    #     validate,
    #     sampler=valid_sampler,
    #     batch_size=None,
    #     num_workers=8,
    #     persistent_workers=True,
    # )
    valid_dl = LhotseDataLoader(
        validate,
        sampler=valid_sampler,
        num_workers=2,
    )
    return valid_dl
def test_padded_cut_num_frames_and_samples_are_consistent(
        sampling_rate, num_samples, padded_duration):
    with make_cut(sampling_rate, num_samples) as cut, \
            TemporaryDirectory() as dir, \
            LilcomFilesWriter(dir) as storage:
        cut = cut.compute_and_store_features(extractor=Fbank(),
                                             storage=storage)
        cut = cut.pad(padded_duration)
        feats = cut.load_features()
        samples = cut.load_audio()

        assert cut.has_features
        assert feats.shape[0] == cut.num_frames
        assert feats.shape[1] == cut.num_features

        assert cut.has_recording
        assert samples.shape[0] == 1
        assert samples.shape[1] == cut.num_samples
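# `make_cut(...)` is a test helper that does not appear in this collection. A sketch
# of what it plausibly does, assuming `soundfile` is available (the real helper may
# construct the audio differently): synthesize a silent WAV of the requested length
# and yield a MonoCut spanning the whole recording.
from contextlib import contextmanager
from tempfile import NamedTemporaryFile

import numpy as np
import soundfile as sf
from lhotse import MonoCut, Recording


@contextmanager
def make_cut(sampling_rate: int, num_samples: int):
    with NamedTemporaryFile(suffix=".wav") as f:
        sf.write(f.name, np.zeros(num_samples, dtype=np.float32), sampling_rate)
        recording = Recording.from_file(f.name)
        yield MonoCut(
            id="test-cut",
            start=0.0,
            duration=num_samples / sampling_rate,
            channel=0,
            recording=recording,
        )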
def main():
    args = get_parser().parse_args()
    dataset_parts = ('devtest', 'test', 'train')
    print("Parts we will prepare: ", dataset_parts)

    corpus_dir = locate_corpus(
        Path('/mnt/corpora/LDC2006S37/data'),
    )
    output_dir = Path('exp/data')

    print('Heroico manifest preparation:')
    transcripts_dir = Path.joinpath(corpus_dir, 'transcripts')
    heroico_manifests = prepare_heroico(
        speech_dir=corpus_dir,
        transcript_dir=transcripts_dir,
        output_dir=output_dir,
    )

    print('Feature extraction:')
    extractor = Fbank(FbankConfig(num_mel_bins=80, frame_shift=0.02))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in heroico_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions']
            )
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f'{output_dir}/feats_{partition}',
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer
            )
            heroico_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
def main():
    args = get_parser().parse_args()

    corpus_dir = locate_corpus(
        Path("/export/corpora5/AMI/amicorpus"),
    )
    annotations_dir = Path("/export/c07/draj")
    download_ami(corpus_dir, annotations_dir=annotations_dir, mic="sdm")

    output_dir = Path("exp/data")
    print("AMI manifest preparation:")
    ami_manifests = prepare_ami(
        corpus_dir,
        annotations_dir=annotations_dir,
        output_dir=output_dir,
        mic="sdm",
        partition="full-corpus",
        max_pause=0,
    )

    print("Feature extraction:")
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in ami_manifests.items():
            if (output_dir / f"cuts_{partition}.json.gz").is_file():
                print(f"{partition} already exists - skipping.")
                continue
            print("Processing", partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests["recordings"],
                supervisions=manifests["supervisions"],
            ).cut_into_windows(duration=5)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=args.num_jobs if ex is None else min(80, len(cut_set)),
                executor=ex,
                storage_type=LilcomHdf5Writer,
            ).pad(duration=5.0)
            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    precomputed_dataset = K2SpeechRecognitionDataset(k2_cut_set)
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set.drop_features(),
        input_strategy=OnTheFlyFeatures(Fbank())
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]
        # Check that the features do not differ too much.
        norm_pc = torch.linalg.norm(batch_pc['inputs'])
        norm_diff = torch.linalg.norm(batch_pc['inputs'] - batch_otf['inputs'])
        # The precomputed and on-the-fly features are different due to mixing
        # in time/fbank domains and lilcom compression.
        assert norm_diff < 0.01 * norm_pc
        # Check that the supervision boundaries are the same.
        assert (batch_pc['supervisions']['start_frame'] ==
                batch_otf['supervisions']['start_frame']).all()
        assert (batch_pc['supervisions']['num_frames'] ==
                batch_otf['supervisions']['num_frames']).all()
def test_mixed_cut_num_frames_example_1():
    fbank = Fbank()
    with make_cut(sampling_rate=16000, num_samples=237920) as cut1, \
            make_cut(sampling_rate=16000, num_samples=219600) as cut2, \
            TemporaryDirectory() as d, \
            LilcomFilesWriter(d) as storage:
        # These are two cuts of similar duration, concatenated together with
        # 1 second of silence in between, and padded to duration of 31.445.
        mixed: MixedCut = (
            cut1.compute_and_store_features(fbank, storage)
            .pad(duration=cut1.duration + 1.0)
            .append(cut2.compute_and_store_features(fbank, storage))
            .pad(duration=31.445))
        assert mixed.duration == 31.445  # Padded correctly
        assert mixed.num_frames == 3145  # Round last 5 up
        # Since the tracks do not overlap in this example, the sum of individual
        # cut num_frames should be equal to the total num_frames.
        assert sum(t.cut.num_frames for t in mixed.tracks) == 3145
        features = mixed.load_features()
        # Loaded features num frames matches the meta-data
        assert features.shape[0] == 3145
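# Frame-count arithmetic behind the 3145 expected above (10 ms frame shift assumed):
#   cut1:  237920 / 16000 = 14.870 s -> 1487 frames
#   gap padding:             1.000 s ->  100 frames
#   cut2:  219600 / 16000 = 13.725 s -> 1373 frames (13.725 / 0.01 = 1372.5, rounded up)
#   final padding: 31.445 - 29.595 =  1.850 s -> 185 frames
#   total: 31.445 / 0.01 = 3144.5 -> 3145 frames ("round last 5 up")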
def test_mixed_cut_num_frames_example_2():
    fbank = Fbank()
    with make_cut(sampling_rate=16000, num_samples=252879) as cut1, \
            make_cut(sampling_rate=16000, num_samples=185280) as cut2, \
            make_cut(sampling_rate=16000, num_samples=204161) as cut3, \
            TemporaryDirectory() as d, \
            LilcomFilesWriter(d) as storage:
        # These are three cuts of similar duration, concatenated together with
        # 1 second of silence between each pair.
        mixed: MixedCut = (
            cut1.compute_and_store_features(fbank, storage)
            .pad(duration=cut1.duration + 1.0)
            .append(cut2.compute_and_store_features(fbank, storage)))
        mixed = (
            mixed.pad(duration=mixed.duration + 1.0)
            .append(cut3.compute_and_store_features(fbank, storage)))
        assert mixed.duration == 42.145  # Padded correctly
        assert mixed.num_frames == 4215  # Round last 5 up
        # TODO(pzelasko): This assertion would not pass for now, as we're adding
        # an extra frame during load_features.
        # Since the tracks do not overlap in this example, the sum of individual
        # cut num_frames should be equal to the total num_frames.
        # assert sum(t.cut.num_frames for t in mixed.tracks) == 4215
        features = mixed.load_features()
        # Loaded features num frames matches the meta-data
        assert features.shape[0] == 4215
def main():
    corpus_dirs = [Path('/mnt/cfs2/asr/database/AM/aishell')]
    corpus_dir = None
    for d in corpus_dirs:
        if os.path.exists(d):
            corpus_dir = d
    if corpus_dir is None:
        print(
            "Please create a place on your system to put the downloaded Aishell data "
            "and add it to `corpus_dirs`")
        sys.exit(1)

    output_dir = Path('exp/data')
    print('Manifest preparation:')
    aishell_manifests = prepare_aishell(corpus_dir=corpus_dir,
                                        output_dir=output_dir)

    print('Feature extraction:')
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, manifests in aishell_manifests.items():
            if (output_dir / f'cuts_{partition}.json.gz').is_file():
                print(f'{partition} already exists - skipping.')
                continue
            print('Processing', partition)
            cut_set = CutSet.from_manifests(
                recordings=manifests['recordings'],
                supervisions=manifests['supervisions'])
            if 'train' in partition:
                cut_set = (cut_set
                           + cut_set.perturb_speed(0.9)
                           + cut_set.perturb_speed(1.1))
            cut_set = cut_set.compute_and_store_features(
                extractor=Fbank(),
                executor=ex,
                storage=LilcomFilesWriter(f'{output_dir}/feats_{partition}'))
            aishell_manifests[partition]['cuts'] = cut_set
            cut_set.to_json(output_dir / f'cuts_{partition}.json.gz')
@pytest.fixture
def libri_cut_set():
    cuts = CutSet.from_json("test/fixtures/libri/cuts.json")
    return CutSet.from_cuts([
        cuts[0],
        cuts[0].with_id("copy-1"),
        cuts[0].with_id("copy-2"),
        cuts[0].append(cuts[0]),
    ])


@pytest.mark.parametrize(
    "batchio", [AudioSamples, PrecomputedFeatures, partial(OnTheFlyFeatures, Fbank())])
@pytest.mark.parametrize("num_workers", [0, 1, 2])
@pytest.mark.parametrize("executor_type", [ThreadPoolExecutor, ProcessPoolExecutor])
def test_batch_io(libri_cut_set, batchio, num_workers, executor_type):
    # does not fail / hang / etc.
    read_fn = batchio(num_workers=num_workers, executor_type=executor_type)
    read_fn(libri_cut_set)


def test_audio_samples_with_custom_field(libri_cut_set):
    batchio = AudioSamples()

    def attach_custom_audio(cut):
        """Simulate adding an additional custom recording"""
        cut.my_favorite_song = cut.recording.perturb_volume(factor=1.1)
def main():
    args = get_parser().parse_args()

    model_type = args.model_type
    start_epoch = args.start_epoch
    num_epochs = args.num_epochs
    max_duration = args.max_duration
    accum_grad = args.accum_grad
    att_rate = args.att_rate

    fix_random_seed(42)

    exp_dir = Path('exp-' + model_type + '-noam-ctc-att-musan-sa')
    setup_logger('{}/log/log-train'.format(exp_dir))
    tb_writer = SummaryWriter(
        log_dir=f'{exp_dir}/tensorboard') if args.tensorboard else None

    # load L, G, symbol_table
    lang_dir = Path('data/lang_nosp')
    phone_symbol_table = k2.SymbolTable.from_file(lang_dir / 'phones.txt')
    word_symbol_table = k2.SymbolTable.from_file(lang_dir / 'words.txt')

    logging.info("Loading L.fst")
    if (lang_dir / 'Linv.pt').exists():
        L_inv = k2.Fsa.from_dict(torch.load(lang_dir / 'Linv.pt'))
    else:
        with open(lang_dir / 'L.fst.txt') as f:
            L = k2.Fsa.from_openfst(f.read(), acceptor=False)
            L_inv = k2.arc_sort(L.invert_())
            torch.save(L_inv.as_dict(), lang_dir / 'Linv.pt')

    graph_compiler = CtcTrainingGraphCompiler(L_inv=L_inv,
                                              phones=phone_symbol_table,
                                              words=word_symbol_table)
    phone_ids = get_phone_symbols(phone_symbol_table)

    # load dataset
    feature_dir = Path('exp/data')
    logging.info("About to get train cuts")
    cuts_train = load_manifest(feature_dir / 'cuts_train-clean-100.json.gz')
    if args.full_libri:
        cuts_train = (
            cuts_train
            + load_manifest(feature_dir / 'cuts_train-clean-360.json.gz')
            + load_manifest(feature_dir / 'cuts_train-other-500.json.gz'))

    logging.info("About to get dev cuts")
    cuts_dev = (load_manifest(feature_dir / 'cuts_dev-clean.json.gz')
                + load_manifest(feature_dir / 'cuts_dev-other.json.gz'))
    logging.info("About to get Musan cuts")
    cuts_musan = load_manifest(feature_dir / 'cuts_musan.json.gz')

    logging.info("About to create train dataset")
    transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
    if args.concatenate_cuts:
        logging.info(
            f'Using cut concatenation with duration factor {args.duration_factor} and gap {args.gap}.'
        )
        # Cut concatenation should be the first transform in the list,
        # so that if we e.g. mix noise in, it will fill the gaps between different utterances.
        transforms = [
            CutConcatenate(duration_factor=args.duration_factor, gap=args.gap)
        ] + transforms

    train = K2SpeechRecognitionDataset(cuts_train,
                                       cut_transforms=transforms,
                                       input_transforms=[
                                           SpecAugment(num_frame_masks=2,
                                                       features_mask_size=27,
                                                       num_feature_masks=2,
                                                       frames_mask_size=100)
                                       ])

    if args.on_the_fly_feats:
        # NOTE: the PerturbSpeed transform should be added only if we remove it from data prep stage.
        # # Add on-the-fly speed perturbation; since originally it would have increased epoch
        # # size by 3, we will apply prob 2/3 and use 3x more epochs.
        # # Speed perturbation probably should come first before concatenation,
        # # but in principle the transforms order doesn't have to be strict (e.g. could be randomized)
        # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2 / 3)] + transforms
        # Drop feats to be on the safe side.
        cuts_train = cuts_train.drop_features()
        from lhotse.features.fbank import FbankConfig
        train = K2SpeechRecognitionDataset(
            cuts=cuts_train,
            cut_transforms=transforms,
            input_strategy=OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80))),
            input_transforms=[
                SpecAugment(num_frame_masks=2,
                            features_mask_size=27,
                            num_feature_masks=2,
                            frames_mask_size=100)
            ])

    if args.bucketing_sampler:
        logging.info('Using BucketingSampler.')
        train_sampler = BucketingSampler(cuts_train,
                                         max_duration=max_duration,
                                         shuffle=True,
                                         num_buckets=args.num_buckets)
    else:
        logging.info('Using SingleCutSampler.')
        train_sampler = SingleCutSampler(
            cuts_train,
            max_duration=max_duration,
            shuffle=True,
        )

    logging.info("About to create train dataloader")
    train_dl = torch.utils.data.DataLoader(
        train,
        sampler=train_sampler,
        batch_size=None,
        num_workers=4,
    )

    logging.info("About to create dev dataset")
    if args.on_the_fly_feats:
        cuts_dev = cuts_dev.drop_features()
        validate = K2SpeechRecognitionDataset(
            cuts_dev.drop_features(),
            input_strategy=OnTheFlyFeatures(
                Fbank(FbankConfig(num_mel_bins=80))))
    else:
        validate = K2SpeechRecognitionDataset(cuts_dev)
    valid_sampler = SingleCutSampler(
        cuts_dev,
        max_duration=max_duration,
    )
    logging.info("About to create dev dataloader")
    valid_dl = torch.utils.data.DataLoader(validate,
                                           sampler=valid_sampler,
                                           batch_size=None,
                                           num_workers=1)

    if not torch.cuda.is_available():
        logging.error('No GPU detected!')
        sys.exit(-1)

    logging.info("About to create model")
    device_id = 0
    device = torch.device('cuda', device_id)

    if att_rate != 0.0:
        num_decoder_layers = 6
    else:
        num_decoder_layers = 0

    if model_type == "transformer":
        model = Transformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)
    else:
        model = Conformer(
            num_features=80,
            nhead=args.nhead,
            d_model=args.attention_dim,
            num_classes=len(phone_ids) + 1,  # +1 for the blank symbol
            subsampling_factor=4,
            num_decoder_layers=num_decoder_layers)

    model.to(device)
    describe(model)

    optimizer = Noam(model.parameters(),
                     model_size=args.attention_dim,
                     factor=1.0,
                     warm_step=args.warm_step)

    best_objf = np.inf
    best_valid_objf = np.inf
    best_epoch = start_epoch
    best_model_path = os.path.join(exp_dir, 'best_model.pt')
    best_epoch_info_filename = os.path.join(exp_dir, 'best-epoch-info')
    global_batch_idx_train = 0  # for logging only

    if start_epoch > 0:
        model_path = os.path.join(exp_dir,
                                  'epoch-{}.pt'.format(start_epoch - 1))
        ckpt = load_checkpoint(filename=model_path,
                               model=model,
                               optimizer=optimizer)
        best_objf = ckpt['objf']
        best_valid_objf = ckpt['valid_objf']
        global_batch_idx_train = ckpt['global_batch_idx_train']
        logging.info(
            f"epoch = {ckpt['epoch']}, objf = {best_objf}, valid_objf = {best_valid_objf}"
        )

    for epoch in range(start_epoch, num_epochs):
        train_sampler.set_epoch(epoch)
        curr_learning_rate = optimizer._rate
        if tb_writer is not None:
            tb_writer.add_scalar('train/learning_rate', curr_learning_rate,
                                 global_batch_idx_train)
            tb_writer.add_scalar('train/epoch', epoch, global_batch_idx_train)

        logging.info('epoch {}, learning rate {}'.format(
            epoch, curr_learning_rate))
        objf, valid_objf, global_batch_idx_train = train_one_epoch(
            dataloader=train_dl,
            valid_dataloader=valid_dl,
            model=model,
            device=device,
            graph_compiler=graph_compiler,
            optimizer=optimizer,
            accum_grad=accum_grad,
            att_rate=att_rate,
            current_epoch=epoch,
            tb_writer=tb_writer,
            num_epochs=num_epochs,
            global_batch_idx_train=global_batch_idx_train,
        )
        # the lower, the better
        if valid_objf < best_valid_objf:
            best_valid_objf = valid_objf
            best_objf = objf
            best_epoch = epoch
            save_checkpoint(filename=best_model_path,
                            optimizer=None,
                            scheduler=None,
                            model=model,
                            epoch=epoch,
                            learning_rate=curr_learning_rate,
                            objf=objf,
                            valid_objf=valid_objf,
                            global_batch_idx_train=global_batch_idx_train)
            save_training_info(filename=best_epoch_info_filename,
                               model_path=best_model_path,
                               current_epoch=epoch,
                               learning_rate=curr_learning_rate,
                               objf=objf,
                               best_objf=best_objf,
                               valid_objf=valid_objf,
                               best_valid_objf=best_valid_objf,
                               best_epoch=best_epoch)

        # we always save the model for every epoch
        model_path = os.path.join(exp_dir, 'epoch-{}.pt'.format(epoch))
        save_checkpoint(filename=model_path,
                        optimizer=optimizer,
                        scheduler=None,
                        model=model,
                        epoch=epoch,
                        learning_rate=curr_learning_rate,
                        objf=objf,
                        valid_objf=valid_objf,
                        global_batch_idx_train=global_batch_idx_train)
        epoch_info_filename = os.path.join(exp_dir,
                                           'epoch-{}-info'.format(epoch))
        save_training_info(filename=epoch_info_filename,
                           model_path=model_path,
                           current_epoch=epoch,
                           learning_rate=curr_learning_rate,
                           objf=objf,
                           best_objf=best_objf,
                           valid_objf=valid_objf,
                           best_valid_objf=best_valid_objf,
                           best_epoch=best_epoch)

    logging.warning('Done')