def test_delta(self):
    """First-order deltas widen the feature dim; the leading 40 fbank
    columns must stay numerically identical to a delta-free extraction."""
    base_config = {
        "feat_type": "fbank",
        "feat_dim": 40,
        "dither": 0.0,
        "apply_cmvn": True,
        "frame_length": 25,
        "frame_shift": 10,
    }
    # Extraction with first-order deltas appended.
    delta_transform, delta_dim = audio.create_transform(
        dict(base_config, delta_order=1, delta_window_size=2))
    feats_delta = delta_transform(self.filepath)
    self.assertEqual(list(feats_delta.shape), [392, delta_dim])
    # Same pipeline with deltas disabled.
    plain_transform, _ = audio.create_transform(
        dict(base_config, delta_order=0))
    feats_plain = plain_transform(self.filepath)
    # The base fbank slice must match the delta-free features.
    np.testing.assert_allclose(
        feats_delta[:, :40], feats_plain, rtol=1e-5, atol=1e-5)
def load_noisy_dataset(job, input, n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Feature-extraction pipeline (copy so the caller's config dict is not mutated)
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Build the noisy dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    tr_set, tr_loader_bs, mode = create_noisy_dataset(
        job, input, tokenizer, ascending, **corpus)
    # Collate function bound to the feature extractor
    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode=mode)
    # Only a (non-ascending) training run shuffles and drops the last batch
    is_shuffled = (mode == 'train' and not ascending)
    tr_set = DataLoader(tr_set, batch_size=tr_loader_bs,
                        shuffle=is_shuffled, drop_last=is_shuffled,
                        collate_fn=collect_tr, num_workers=n_jobs,
                        pin_memory=use_gpu)
    return tr_set, feat_dim, tokenizer.vocab_size, tokenizer
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Feature-extraction pipeline (copy so the caller's config dict is not mutated)
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Datasets (in testing mode, tr_set=dv_set, dv_set=tt_set)
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)
    # Collate functions: training uses the current mode, validation always 'test'
    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode=mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode='test')
    # Only a (non-ascending) training run shuffles and drops the last batch
    is_shuffled = (mode == 'train' and not ascending)
    tr_set = DataLoader(tr_set, batch_size=tr_loader_bs, shuffle=is_shuffled,
                        drop_last=is_shuffled, collate_fn=collect_tr,
                        num_workers=n_jobs, pin_memory=use_gpu)
    # Validation loader is deterministic: no shuffling, keep every batch
    dv_set = DataLoader(dv_set, batch_size=dv_loader_bs, shuffle=False,
                        drop_last=False, collate_fn=collect_dv,
                        num_workers=n_jobs, pin_memory=pin_memory)
    # Summary line shown to the user
    data_msg.append('I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
                    .format(audio['feat_type'], feat_dim,
                            tokenizer.token_type, tokenizer.vocab_size))
    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for testing'''
    # Feature-extraction pipeline (copy so the caller's config dict is not mutated)
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Hard-wired to the DLHLP corpus test split for reproduction runs
    from corpus.dlhlp import DlhlpDataset as Dataset
    dv_set = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)
    # Collate function bound to the extractor, always in test mode
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode='test')
    # One utterance per batch, deterministic order
    dv_set = DataLoader(dv_set, batch_size=1, shuffle=False, drop_last=False,
                        collate_fn=collect_dv, num_workers=n_jobs,
                        pin_memory=pin_memory)
    # Summary line shown to the user
    msg = 'I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'.format(
        audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size)
    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, msg
def load_test_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for testing.

    Loads only the test split of `corpus` and serves it with the corpus
    batch size, never shuffled and never dropping the last batch.
    Returns (dv_set, feat_dim, vocab_size, tokenizer, data_msg).
    '''

    def _create_dataset(tokenizer, ascending, name, path, bucketing, batch_size,
                        train_split=None, dev_split=None, test_split=None):
        ''' Build the test-split dataset for the named corpus.

        `ascending`, `bucketing`, `train_split` and `dev_split` are accepted
        so a full training config can be splatted in unchanged, but they are
        ignored here: bucketing is never applied to the test set.
        '''
        corpus_name = name.lower()
        if corpus_name == "librispeech":
            from corpus.librispeech import LibriDataset as Dataset
        elif corpus_name == "dlhlp":
            from corpus.dlhlp import DlhlpDataset as Dataset
        elif corpus_name == 'external':
            from corpus.external import ExternalDataset as Dataset
        else:
            raise NotImplementedError
        # Bucket size fixed to 1 for the test set
        tt_set = Dataset(path, test_split, tokenizer, 1)
        # (dataset, loader batch size, mode, messages-to-show)
        return tt_set, batch_size, 'test', []

    # Feature-extraction pipeline (copy so the caller's config dict is not mutated)
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    dv_set, dv_loader_bs, mode, data_msg = _create_dataset(
        tokenizer, ascending, **corpus)
    # Collate function bound to the extractor, always in test mode
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode='test')
    # NOTE: the original computed a `shuffle` flag here but never used it —
    # the test loader is always deterministic, so the dead code is removed.
    dv_set = DataLoader(dv_set, batch_size=dv_loader_bs, shuffle=False,
                        drop_last=False, collate_fn=collect_dv,
                        num_workers=n_jobs, pin_memory=pin_memory)
    # Summary line shown to the user
    data_msg.append(
        'I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
        .format(audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size))
    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text): ''' Prepare dataloader for training/validation''' # Audio feature extractor audio_transform, feat_dim = create_transform(audio.copy()) ## from src.audio import create_transform ''' Returns a pytorch seq module dealing audio transform ''' # Text tokenizer tokenizer = load_text_encoder(**text) ## from src.text import load_text_encoder # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set) '''
def test_filter_bank(self):
    """A 40-dim fbank extraction yields a (392, d) feature matrix."""
    fbank_config = {
        "feat_type": "fbank",
        "feat_dim": 40,
        "apply_cmvn": False,
        "frame_length": 25,
        "frame_shift": 10,
    }
    transform, feat_dim = audio.create_transform(fbank_config)
    feats = transform(self.filepath)
    self.assertEqual(list(feats.shape), [392, feat_dim])
def prepare_data(n_jobs, dev_n_jobs, use_gpu, pin_memory, dataset, audio):
    ''' Prepare dataloader for training/validation'''
    # Feature extractor plus its human-readable description lines
    audio_transform, audio_dim = create_transform(audio.copy())
    data_msg = audio_transform.create_msg()
    # Build the datasets from the corpus config
    tr_set, dv_set, tt_set, batch_size, msg, collect_fn, audio_max_frames = \
        create_dataset(**dataset)
    data_msg += msg

    def _collate_for(split):
        # Bind the shared collate function to one split's mode.
        return partial(collect_fn, audio_max_frames=audio_max_frames,
                       audio_transform=audio_transform, mode=split)

    # Training loader shuffles and drops the last partial batch
    tr_set = DataLoader(tr_set, batch_size=batch_size, shuffle=True,
                        drop_last=True, collate_fn=_collate_for('train'),
                        num_workers=n_jobs, pin_memory=use_gpu)
    # Validation loader is deterministic
    dv_set = DataLoader(dv_set, batch_size=batch_size, shuffle=False,
                        drop_last=False, collate_fn=_collate_for('dev'),
                        num_workers=dev_n_jobs, pin_memory=pin_memory)
    # Optional test loader, only when a test set was configured
    if tt_set is not None:
        tt_set = DataLoader(tt_set, batch_size=batch_size, shuffle=False,
                            drop_last=False, collate_fn=_collate_for('test'),
                            num_workers=dev_n_jobs, pin_memory=pin_memory)
    return tr_set, dv_set, tt_set, audio_dim, data_msg
def test_mfcc(self):
    """MFCC extraction shape check (skipped: unsupported torchaudio version)."""
    # skipTest raises immediately, so nothing below this line executes.
    self.skipTest(
        "torchaudio.compliance.kaldi.mfcc is not in torchaudio==0.3.0")
    mfcc_config = {
        "feat_type": "mfcc",
        "feat_dim": 13,
        "apply_cmvn": False,
        "frame_length": 25,
        "frame_shift": 10,
    }
    transform, feat_dim = audio.create_transform(mfcc_config)
    feats = transform(self.filepath)
    self.assertEqual(list(feats.shape), [392, feat_dim])
def test_delta_delta(self):
    """Second-order deltas still produce a (392, d) feature matrix."""
    dd_config = {
        "feat_type": "fbank",
        "feat_dim": 40,
        "apply_cmvn": True,
        "frame_length": 25,
        "frame_shift": 10,
        "delta_order": 2,
        "delta_window_size": 2,
    }
    transform, feat_dim = audio.create_transform(dd_config)
    feats = transform(self.filepath)
    self.assertEqual(list(feats.shape), [392, feat_dim])
def test_cmvn(self):
    """CMVN output has zero mean and unit variance in every dimension."""
    cmvn_config = {
        "feat_type": "fbank",
        "feat_dim": 40,
        "apply_cmvn": True,
        "frame_length": 25,
        "frame_shift": 10,
    }
    transform, feat_dim = audio.create_transform(cmvn_config)
    feats = transform(self.filepath)
    self.assertEqual(list(feats.shape), [392, feat_dim])
    # Per-dimension statistics after normalisation.
    np.testing.assert_allclose(feats.mean(0), 0.0, rtol=1e-6, atol=5e-5)
    np.testing.assert_allclose(feats.std(0), 1.0, rtol=1e-6, atol=1e-6)
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation.

    Two feature-extraction pipelines are built from the same audio config:
    one in 'train' mode for the training loader and one in 'dev' mode for
    the validation loader(s); both report the same feature dimension.
    Returns (tr_set, dv_set, feat_dim, vocab_size, tokenizer, data_msg);
    dv_set is a list of DataLoaders when the corpus supplies several dev
    splits, otherwise a single DataLoader.
    '''
    # Audio feature extractors (e.g. waveform -> mel-spectrogram)
    audio_transform_tr, feat_dim = create_transform(audio.copy(), 'train')
    audio_transform_dv, feat_dim = create_transform(audio.copy(), 'dev')
    # TODO: hook a data-augmentation wrapper around audio_transform_tr here;
    # the dev transform must stay un-augmented.
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Datasets (in testing mode, tr_set=dv_set, dv_set=tt_set)
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)
    # Collate functions bound to the matching extractor
    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform_tr, mode=mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform_dv, mode='test')
    # Shuffle/drop applied to training set only
    shuffle = (mode == 'train' and not ascending)
    tr_set = DataLoader(tr_set, batch_size=tr_loader_bs, shuffle=shuffle,
                        drop_last=shuffle, collate_fn=collect_tr,
                        num_workers=n_jobs, pin_memory=use_gpu)

    def _dev_loader(ds):
        # Validation loaders are deterministic: no shuffling, keep every batch.
        return DataLoader(ds, batch_size=dv_loader_bs, shuffle=False,
                          drop_last=False, collate_fn=collect_dv,
                          num_workers=n_jobs, pin_memory=pin_memory)

    # Several dev splits may arrive as a list of datasets.
    if isinstance(dv_set, list):
        dv_set = [_dev_loader(ds) for ds in dv_set]
    else:
        dv_set = _dev_loader(dv_set)
    # Summary line shown to the user
    data_msg.append('I/O spec. | Audio Feature = {}\t| Feature Dim = {}\t| Token Type = {}\t| Vocab Size = {}'
                    .format(audio['feat_type'], feat_dim,
                            tokenizer.token_type, tokenizer.vocab_size))
    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg