Exemple #1
0
    def test_delta(self):
        # Delta features must be appended after the static fbank columns
        # without changing those columns.

        # Options common to both extractions; only the delta settings differ.
        shared_config = {
            "feat_type": "fbank",
            "feat_dim": 40,
            "dither": 0.0,
            "apply_cmvn": True,
            "frame_length": 25,
            "frame_shift": 10,
        }

        # Extraction with first-order deltas (window size 2).
        delta_config = dict(shared_config, delta_order=1, delta_window_size=2)
        delta_transform, delta_dim = audio.create_transform(delta_config)
        y = delta_transform(self.filepath)

        self.assertEqual(list(y.shape), [392, delta_dim])

        # Same extraction with deltas switched off.
        static_config = dict(shared_config, delta_order=0)
        static_transform, _ = audio.create_transform(static_config)
        y_no_delta = static_transform(self.filepath)

        # The leading 40 columns of the delta output should equal the
        # delta-free output within tolerance.
        np.testing.assert_allclose(y[:, :40], y_no_delta, rtol=1e-5, atol=1e-5)
Exemple #2
0
def load_noisy_dataset(job, input, n_jobs, use_gpu, pin_memory, ascending,
                       corpus, audio, text):
    ''' Build the dataloader over the noisy corpus

    The feature extractor is created from a copy of `audio`, the tokenizer
    from `text`, and the dataset from `job`, `input`, the tokenizer,
    `ascending` and the `corpus` keyword options.

    Returns a tuple (dataloader, feature dim, vocab size, tokenizer).
    '''
    # Front-end feature extractor and the dimension it reports
    extractor, feat_dim = create_transform(audio.copy())
    # Tokenizer for the transcripts
    tokenizer = load_text_encoder(**text)
    # Dataset, its batch size and the mode ('train' enables shuffling below)
    noisy_set, batch_size, mode = create_noisy_dataset(
        job, input, tokenizer, ascending, **corpus)
    # Batch collation applies the extractor in the dataset's mode
    collate = partial(collect_audio_batch, audio_transform=extractor, mode=mode)
    # Shuffle and drop the last partial batch only when training unsorted
    randomize = mode == 'train' and not ascending
    # NOTE(review): `pin_memory` is accepted but the loader pins according to
    # `use_gpu` -- confirm this is intended.
    loader = DataLoader(
        noisy_set,
        batch_size=batch_size,
        shuffle=randomize,
        drop_last=randomize,
        collate_fn=collate,
        num_workers=n_jobs,
        pin_memory=use_gpu,
    )
    return loader, feat_dim, tokenizer.vocab_size, tokenizer
Exemple #3
0
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Build the training and validation dataloaders

    Returns a tuple (train loader, dev loader, feature dim, vocab size,
    tokenizer, list of messages describing the data).
    '''
    # Front-end feature extractor (built from a copy of the audio config)
    extractor, feat_dim = create_transform(audio.copy())
    # Tokenizer for the transcripts
    tokenizer = load_text_encoder(**text)
    # Datasets, batch sizes, mode and log messages
    # (in testing mode, tr_set=dv_set, dv_set=tt_set)
    (tr_set, dv_set, tr_loader_bs, dv_loader_bs,
     mode, data_msg) = create_dataset(tokenizer, ascending, **corpus)

    # The first loader collates in the dataset's mode; the second always in 'test' mode
    def make_collate(collate_mode):
        return partial(collect_audio_batch,
                       audio_transform=extractor, mode=collate_mode)

    # Shuffle and drop the last partial batch only when training unsorted
    randomize = mode == 'train' and not ascending

    tr_set = DataLoader(
        tr_set,
        batch_size=tr_loader_bs,
        shuffle=randomize,
        drop_last=randomize,
        collate_fn=make_collate(mode),
        num_workers=n_jobs,
        pin_memory=use_gpu,
    )
    dv_set = DataLoader(
        dv_set,
        batch_size=dv_loader_bs,
        shuffle=False,
        drop_last=False,
        collate_fn=make_collate('test'),
        num_workers=n_jobs,
        pin_memory=pin_memory,
    )

    # Append the I/O summary line to the messages
    io_template = 'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
    data_msg.append(io_template.format(
        audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size))

    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
Exemple #4
0
def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                       text):
    ''' Build the test dataloader over the DLHLP corpus

    Only `corpus['path']` and `corpus['test_split']` are read from `corpus`.
    Returns a tuple (loader, feature dim, vocab size, tokenizer, I/O message).
    '''
    # Front-end feature extractor (built from a copy of the audio config)
    extractor, feat_dim = create_transform(audio.copy())
    # Tokenizer for the transcripts
    tokenizer = load_text_encoder(**text)

    # The corpus class is imported lazily, at the same point as before
    from corpus.dlhlp import DlhlpDataset as Dataset
    test_set = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)

    # One utterance per batch, original order, collated in 'test' mode
    loader = DataLoader(
        test_set,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        collate_fn=partial(collect_audio_batch,
                           audio_transform=extractor,
                           mode='test'),
        num_workers=n_jobs,
        pin_memory=pin_memory,
    )

    # I/O summary line
    io_template = 'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
    msg = io_template.format(audio['feat_type'], feat_dim,
                             tokenizer.token_type, tokenizer.vocab_size)
    return loader, feat_dim, tokenizer.vocab_size, tokenizer, msg
Exemple #5
0
def load_test_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                      text):
    ''' Prepare dataloader for testing

    `corpus` is expanded as keyword arguments and must provide `name`, `path`,
    `bucketing` and `batch_size`; `train_split`, `dev_split` and `test_split`
    are optional.

    Returns a tuple (test loader, feature dim, vocab size, tokenizer,
    list of messages).

    Raises NotImplementedError when `corpus['name']` is not a known corpus.
    '''
    def _create_dataset(tokenizer,
                        ascending,
                        name,
                        path,
                        bucketing,
                        batch_size,
                        train_split=None,
                        dev_split=None,
                        test_split=None):
        ''' Create the test-split dataset of the named corpus'''
        # Unused parameters are kept so the whole corpus config can be
        # passed as **corpus.

        # Recognize corpus (imported lazily so only the selected one loads)
        corpus_name = name.lower()
        if corpus_name == "librispeech":
            from corpus.librispeech import LibriDataset as Dataset
        elif corpus_name == "dlhlp":
            from corpus.dlhlp import DlhlpDataset as Dataset
        elif corpus_name == 'external':
            from corpus.external import ExternalDataset as Dataset
        else:
            raise NotImplementedError(
                'Unsupported corpus name: {!r}'.format(name))

        # Do not use bucketing for test set
        tt_set = Dataset(path, test_split, tokenizer, 1)
        # Dataset, loader batch size, mode and an (empty) message list
        return tt_set, batch_size, 'test', []

    # Audio feature extractor (built from a copy of the audio config)
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Test dataset; the returned mode is always 'test' and is not needed here
    dv_set, dv_loader_bs, _, data_msg = _create_dataset(
        tokenizer, ascending, **corpus)
    # Collect function
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform,
                         mode='test')
    # Create data loader (never shuffled, no batch dropped)
    dv_set = DataLoader(dv_set,
                        batch_size=dv_loader_bs,
                        shuffle=False,
                        drop_last=False,
                        collate_fn=collect_dv,
                        num_workers=n_jobs,
                        pin_memory=pin_memory)
    # Messages to show
    data_msg.append(
        'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
        .format(audio['feat_type'], feat_dim, tokenizer.token_type,
                tokenizer.vocab_size))

    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''

    # Audio feature extractor
    audio_transform, feat_dim = create_transform(audio.copy())  ## from src.audio import create_transform
    ''' Returns a pytorch seq module dealing audio transform '''
    # Text tokenizer
    tokenizer = load_text_encoder(**text)                       ## from src.text import load_text_encoder
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)

        '''
Exemple #7
0
    def test_filter_bank(self):
        # Plain filter-bank extraction with CMVN disabled: the output should
        # have 392 rows and the feature width reported by create_transform.
        fbank_config = dict(
            feat_type="fbank",
            feat_dim=40,
            apply_cmvn=False,
            frame_length=25,
            frame_shift=10,
        )

        extractor, feat_width = audio.create_transform(fbank_config)
        features = extractor(self.filepath)
        self.assertEqual(list(features.shape), [392, feat_width])
Exemple #8
0
def prepare_data(n_jobs, dev_n_jobs, use_gpu, pin_memory, dataset, audio):
    ''' Build the train/dev (and optional test) dataloaders

    Returns a tuple (train loader, dev loader, test loader or None,
    audio feature dim, data messages).
    '''
    # Front-end feature extractor and its description messages
    extractor, audio_dim = create_transform(audio.copy())
    data_msg = extractor.create_msg()

    # Datasets plus the batch size, messages, collate function and frame
    # limit that go with them
    (tr_set, dv_set, tt_set, batch_size,
     msg, collect_fn, audio_max_frames) = create_dataset(**dataset)
    data_msg += msg

    # Every collate function shares these arguments and differs only in mode
    collate_common = dict(audio_max_frames=audio_max_frames,
                          audio_transform=extractor)
    # Dev and test loaders share these settings
    eval_common = dict(batch_size=batch_size,
                       shuffle=False,
                       drop_last=False,
                       num_workers=dev_n_jobs,
                       pin_memory=pin_memory)

    # Training loader: shuffled, last partial batch dropped
    tr_set = DataLoader(tr_set,
                        batch_size=batch_size,
                        shuffle=True,
                        drop_last=True,
                        collate_fn=partial(collect_fn, mode='train',
                                           **collate_common),
                        num_workers=n_jobs,
                        pin_memory=use_gpu)
    # Validation loader
    dv_set = DataLoader(dv_set,
                        collate_fn=partial(collect_fn, mode='dev',
                                           **collate_common),
                        **eval_common)

    # Test loader only when a test set was provided
    if tt_set is not None:
        tt_set = DataLoader(tt_set,
                            collate_fn=partial(collect_fn, mode='test',
                                               **collate_common),
                            **eval_common)

    return tr_set, dv_set, tt_set, audio_dim, data_msg
Exemple #9
0
    def test_mfcc(self):
        # Skipped unconditionally; nothing below the skipTest call runs.
        skip_reason = "torchaudio.compliance.kaldi.mfcc is not in torchaudio==0.3.0"
        self.skipTest(skip_reason)

        # 13-dim MFCC extraction with CMVN disabled.
        mfcc_config = dict(
            feat_type="mfcc",
            feat_dim=13,
            apply_cmvn=False,
            frame_length=25,
            frame_shift=10,
        )

        extractor, feat_width = audio.create_transform(mfcc_config)
        features = extractor(self.filepath)
        self.assertEqual(list(features.shape), [392, feat_width])
Exemple #10
0
    def test_delta_delta(self):
        # Second-order deltas (window size 2) on CMVN-normalised fbank
        # features: only the output shape is checked.
        delta2_config = dict(
            feat_type="fbank",
            feat_dim=40,
            apply_cmvn=True,
            frame_length=25,
            frame_shift=10,
            delta_order=2,
            delta_window_size=2,
        )

        extractor, feat_width = audio.create_transform(delta2_config)
        features = extractor(self.filepath)

        self.assertEqual(list(features.shape), [392, feat_width])
Exemple #11
0
    def test_cmvn(self):
        # With CMVN enabled, the mean along axis 0 should be ~0 and the
        # standard deviation along axis 0 should be ~1.
        cmvn_config = dict(
            feat_type="fbank",
            feat_dim=40,
            apply_cmvn=True,
            frame_length=25,
            frame_shift=10,
        )

        extractor, feat_width = audio.create_transform(cmvn_config)
        y = extractor(self.filepath)

        self.assertEqual(list(y.shape), [392, feat_width])

        axis0_mean = y.mean(0)
        np.testing.assert_allclose(axis0_mean, 0.0, rtol=1e-6, atol=5e-5)
        axis0_std = y.std(0)
        np.testing.assert_allclose(axis0_std, 1.0, rtol=1e-6, atol=1e-6)
Exemple #12
0
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation

    Separate audio transforms are created for the 'train' and 'dev' modes.
    `create_dataset` may return either a single validation dataset or a list
    of them; in the latter case a list of dataloaders is returned for it.

    Returns a tuple (train loader, dev loader or list of dev loaders,
    feature dim, vocab size, tokenizer, list of messages).
    '''
    # Audio feature extractors (e.g. conversion to mel-spectrogram), one per
    # mode; the feature dim reported by the 'dev' call is the one returned.
    audio_transform_tr, _ = create_transform(audio.copy(), 'train')
    audio_transform_dv, feat_dim = create_transform(audio.copy(), 'dev')
    # TODO: augmentation is meant to be added to the training transform here;
    # the validation transform is not to be augmented.

    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)

    # Collect function
    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform_tr,
                         mode=mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform_dv,
                         mode='test')

    # Shuffle/drop applied to training set only
    shuffle = (mode == 'train' and not ascending)
    drop_last = shuffle
    # Create data loader
    tr_set = DataLoader(tr_set,
                        batch_size=tr_loader_bs,
                        shuffle=shuffle,
                        drop_last=drop_last,
                        collate_fn=collect_tr,
                        num_workers=n_jobs,
                        pin_memory=use_gpu)

    def _make_dev_loader(ds):
        ''' Wrap one validation dataset in an unshuffled dataloader'''
        return DataLoader(ds,
                          batch_size=dv_loader_bs,
                          shuffle=False,
                          drop_last=False,
                          collate_fn=collect_dv,
                          num_workers=n_jobs,
                          pin_memory=pin_memory)

    # A list of validation datasets yields a list of loaders
    if isinstance(dv_set, list):
        dv_set = [_make_dev_loader(ds) for ds in dv_set]
    else:
        dv_set = _make_dev_loader(dv_set)

    # Messages to show
    data_msg.append('I/O spec.  | Audio Feature = {}\t| Feature Dim = {}\t| Token Type = {}\t| Vocab Size = {}'
                    .format(audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size))
    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg