def load_textset(n_jobs, use_gpu, pin_memory, corpus, text):
    """Build train/dev DataLoaders for text-only data.

    Returns (train_loader, dev_loader, vocab_size, tokenizer, data_msg).
    """
    # Tokenizer first: the dataset needs it to encode transcripts.
    tokenizer = load_text_encoder(**text)
    train_ds, dev_ds, train_bs, dev_bs, data_msg = create_textset(
        tokenizer, **corpus)

    # Text lives entirely in RAM, so worker processes buy nothing here.
    train_loader = DataLoader(train_ds,
                              batch_size=train_bs,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=partial(collect_text_batch,
                                                 mode='train'),
                              num_workers=0,
                              pin_memory=use_gpu)
    dev_loader = DataLoader(dev_ds,
                            batch_size=dev_bs,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=partial(collect_text_batch,
                                               mode='dev'),
                            num_workers=0,
                            pin_memory=pin_memory)

    # Status line for the caller's log.
    data_msg.append('I/O spec.  | Token type = {}\t| Vocab size = {}'.format(
        tokenizer.token_type, tokenizer.vocab_size))

    return train_loader, dev_loader, tokenizer.vocab_size, tokenizer, data_msg
# Example #2 (score: 0)
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Feature pipeline and text tokenizer.
    audio_transform, feat_dim = create_transform(audio.copy())
    tokenizer = load_text_encoder(**text)
    # In testing mode tr_set holds the dev split and dv_set the test split.
    train_ds, dev_ds, train_bs, dev_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)

    # Shuffle/drop-last only for genuine training; curriculum (ascending)
    # order must not be shuffled away.
    randomize = mode == 'train' and not ascending

    train_loader = DataLoader(
        train_ds,
        batch_size=train_bs,
        shuffle=randomize,
        drop_last=randomize,
        collate_fn=partial(collect_audio_batch,
                           audio_transform=audio_transform, mode=mode),
        num_workers=n_jobs,
        pin_memory=use_gpu)
    dev_loader = DataLoader(
        dev_ds,
        batch_size=dev_bs,
        shuffle=False,
        drop_last=False,
        collate_fn=partial(collect_audio_batch,
                           audio_transform=audio_transform, mode='test'),
        num_workers=n_jobs,
        pin_memory=pin_memory)

    # Status line for the caller's log.
    data_msg.append('I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
                    .format(audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size))

    return train_loader, dev_loader, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
# Example #3 (score: 0)
def load_noisy_dataset(job, input, n_jobs, use_gpu, pin_memory, ascending,
                       corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Feature extractor + tokenizer.
    audio_transform, feat_dim = create_transform(audio.copy())
    tokenizer = load_text_encoder(**text)
    # The noisy dataset only produces a training loader.
    train_ds, train_bs, mode = create_noisy_dataset(job, input, tokenizer,
                                                    ascending, **corpus)

    # Shuffle/drop-last only for genuine training; curriculum (ascending)
    # order must be kept intact.
    randomize = mode == 'train' and not ascending
    train_loader = DataLoader(train_ds,
                              batch_size=train_bs,
                              shuffle=randomize,
                              drop_last=randomize,
                              collate_fn=partial(
                                  collect_audio_batch,
                                  audio_transform=audio_transform,
                                  mode=mode),
                              num_workers=n_jobs,
                              pin_memory=use_gpu)

    return train_loader, feat_dim, tokenizer.vocab_size, tokenizer
# Example #4 (score: 0)
def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                       text):
    ''' Prepare dataloader for testing'''
    # Feature extractor + tokenizer.
    audio_transform, feat_dim = create_transform(audio.copy())
    tokenizer = load_text_encoder(**text)

    # Test split of the dlhlp corpus only; bucket size fixed at 1.
    from corpus.dlhlp import DlhlpDataset as Dataset
    test_ds = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)

    # Evaluation loader: batch of one, no shuffling, no dropped batches.
    test_loader = DataLoader(test_ds,
                             batch_size=1,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=partial(
                                 collect_audio_batch,
                                 audio_transform=audio_transform,
                                 mode='test'),
                             num_workers=n_jobs,
                             pin_memory=pin_memory)

    msg = 'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'.format(
        audio['feat_type'], feat_dim, tokenizer.token_type,
        tokenizer.vocab_size)
    return test_loader, feat_dim, tokenizer.vocab_size, tokenizer, msg
# Example #5 (score: 0)
def load_test_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                      text):
    ''' Prepare dataloader for testing.

    Returns (dv_set, feat_dim, vocab_size, tokenizer, data_msg) where
    dv_set is a DataLoader over the corpus' test split.
    '''
    def _create_dataset(tokenizer,
                        ascending,
                        name,
                        path,
                        bucketing,
                        batch_size,
                        train_split=None,
                        dev_split=None,
                        test_split=None):
        ''' Build the test-split dataset for a recognized corpus. '''
        # Recognize corpus
        if name.lower() == "librispeech":
            from corpus.librispeech import LibriDataset as Dataset
        elif name.lower() == "dlhlp":
            from corpus.dlhlp import DlhlpDataset as Dataset
        elif name.lower() == 'external':
            from corpus.external import ExternalDataset as Dataset
        else:
            raise NotImplementedError

        # Testing mode; bucketing is never applied to the test set, hence
        # the hard-coded bucket size of 1.
        mode = 'test'
        tt_set = Dataset(path, test_split, tokenizer, 1)
        return tt_set, batch_size, mode, []

    # Audio feature extractor
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (test split only)
    dv_set, dv_loader_bs, mode, data_msg = _create_dataset(
        tokenizer, ascending, **corpus)
    # Collect function — always 'test' mode here.
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform,
                         mode='test')
    # NOTE: mode is always 'test' here, so shuffling/dropping never applies;
    # the dead `shuffle` computation present before was removed.
    dv_set = DataLoader(dv_set,
                        batch_size=dv_loader_bs,
                        shuffle=False,
                        drop_last=False,
                        collate_fn=collect_dv,
                        num_workers=n_jobs,
                        pin_memory=pin_memory)
    # Status line for the caller's log.
    data_msg.append(
        'I/O spec.  | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'
        .format(audio['feat_type'], feat_dim, tokenizer.token_type,
                tokenizer.vocab_size))

    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
# Example #6 (score: 0)
    def transfer_with_mapping(self, ckpt, transfer_config, cur_tokenizer):
        '''
        Transfer CTC layer weights from a source checkpoint into this model.

        method =
            - "no":      do not transfer (keep the freshly initialized layer)
            - "ipa":     transfer by IPA ground truth (identity mapping over
                         the current vocabulary)
            - "mapping": transfer via a target->source JSON mapping file

        Returns a short status message string.
        '''
        # Load src model weights
        # NOTE(review): `device` is only needed by the commented-out
        # torch.load path below — kept for when that path is restored.
        device = list(self.encoder.parameters())[0].device
        #ckpt_path = transfer_config.pop('src_ckpt')
        #ckpt = torch.load(
        #    ckpt_path, map_location=device)

        # Pop the old CTC parameters first, then load the remaining
        # 'encoder.*' entries into this encoder (n[8:] strips the
        # 'encoder.' prefix).
        old_weights = ckpt['model'].pop('ctc_layer.weight')
        old_bias = ckpt['model'].pop('ctc_layer.bias')
        self.encoder.load_state_dict({
            n[8:]: v
            for n, v in ckpt['model'].items() if n.startswith('encoder.')
        })
        #del ckpt

        # Transfer weights
        method = transfer_config.pop('method')
        mapping = transfer_config.pop('mapping', None)
        self.init_ctclayer()
        if method == 'no':
            pass
        elif method in ['ipa', 'mapping']:
            # The source tokenizer gives the row indices of the old layer.
            old_vocab2idx = load_text_encoder(**transfer_config)._vocab2idx
            if method == 'ipa':
                # target --> src: identity mapping over the current vocab.
                mapping = {v: v for v in cur_tokenizer._vocab_list}
            else:
                with open(mapping, 'r') as f:
                    mapping = json.load(f)
                # Keep only entries whose target token exists in the
                # current vocabulary.
                mapping = {
                    tar_v: src_v
                    for tar_v, src_v in mapping.items()
                    if tar_v in cur_tokenizer._vocab_list
                }

            # Copy each mapped row (weight + bias) old -> new; tokens the
            # source vocabulary lacks are silently skipped.
            for tar_v, src_v in mapping.items():
                tar_i = cur_tokenizer._vocab2idx[tar_v]
                src_i = old_vocab2idx.get(src_v, None)
                if src_i is not None:
                    self.ctc_layer.weight.data[tar_i].copy_(
                        old_weights.data[src_i])
                    self.ctc_layer.bias.data[tar_i].copy_(old_bias.data[src_i])

        else:
            raise ValueError(f'Not supporting method {method}')

        # Fixed typo in the status message: "Tranfsering" -> "Transferring".
        msg = f"Transferring weight from old CTCLayer with method {method}"
        return msg
# NOTE(review): the example below was truncated in the source scrape — it
# duplicates the load_dataset above but is cut off mid-function and ends in
# a stray triple-quote. Kept commented out so the file stays importable.
# def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
#     ''' Prepare dataloader for training/validation'''
#
#     # Audio feature extractor
#     audio_transform, feat_dim = create_transform(audio.copy())  ## from src.audio import create_transform
#     ''' Returns a pytorch seq module dealing audio transform '''
#     # Text tokenizer
#     tokenizer = load_text_encoder(**text)                       ## from src.text import load_text_encoder
#     # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
#         '''
# Example #8 (score: 0)
def load_dataset(n_jobs,
                 use_gpu,
                 pin_memory,
                 corpus,
                 audio,
                 inference_stage=False):
    ''' Prepare audio dataloader for solver '''
    test_set = None
    # Meta-data related
    data_msg = []
    # Audio converter handles every transform / inverse-transform pair.
    audio_converter = load_audio_transform(**audio)
    data_msg.append('Audio spec.| Feature type = {}\t\t| Feature dim = {}'\
                    .format(audio_converter.feat_type,audio_converter.feat_dim))
    # Phoneme-level text loader (returns ground-truth phone sequences
    # where they exist).
    tokenizer = load_text_encoder('phoneme',
                                  vocab_file=corpus['vocab_file'],
                                  map_table=corpus['map_table'])
    data_msg.append('Text spec. | Token type = {}\t| Vocab size = {}'\
                    .format(tokenizer.token_type,tokenizer.vocab_size))

    # Load every split at once.
    unpair_set, pair_set, dev_set, test_set, set_msg = create_dataset(
        **corpus, inference_stage=inference_stage)
    data_msg.extend(set_msg)

    tr_collect = partial(collect_fn,
                         audio_converter=audio_converter,
                         text_loader=tokenizer,
                         mode='train')
    dv_collect = partial(collect_fn,
                         audio_converter=audio_converter,
                         text_loader=tokenizer,
                         mode='dev')

    def _loader(ds, randomize, collate):
        # Every loader shares worker count, pinning and worker init.
        return DataLoader(ds,
                          batch_size=ds.bs_for_collate,
                          shuffle=randomize,
                          drop_last=randomize,
                          collate_fn=collate,
                          num_workers=max(0, n_jobs),
                          pin_memory=pin_memory,
                          worker_init_fn=_worker_init)

    # Training runs shuffle/drop the paired and unpaired sets; inference
    # keeps order and uses the dev collate.
    training = not inference_stage
    main_collate = dv_collect if inference_stage else tr_collect
    unpair_set = _loader(unpair_set, training, main_collate)
    pair_set = _loader(pair_set, training, main_collate)
    dev_set = _loader(dev_set, False, dv_collect)

    # Only inference wraps the test split; otherwise it is returned as-is
    # from create_dataset.
    if inference_stage:
        test_set = _loader(test_set, False, dv_collect)

    # Augmentation status line.
    data_msg.append('Augment    | Speed rate = {}\t| S/N rate = {}'\
            .format( audio_converter.time_stretch_range, audio_converter.snr_range))

    return unpair_set, pair_set, dev_set, test_set, audio_converter, tokenizer, data_msg
# Example #9 (score: 0)
def load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    ''' Prepare dataloader for training/validation'''
    # Separate mel-spectrogram transforms for train and dev (the train
    # side is where augmentation would be attached).
    audio_transform_tr, feat_dim = create_transform(audio.copy(), 'train')
    audio_transform_dv, feat_dim = create_transform(audio.copy(), 'dev')

    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # In testing mode tr_set holds the dev split and dv_set the test split.
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, **corpus)

    collect_tr = partial(collect_audio_batch,
                         audio_transform=audio_transform_tr,
                         mode=mode)
    collect_dv = partial(collect_audio_batch,
                         audio_transform=audio_transform_dv,
                         mode='test')

    # Shuffle/drop-last only for real training (curriculum order kept).
    randomize = mode == 'train' and not ascending
    tr_set = DataLoader(tr_set,
                        batch_size=tr_loader_bs,
                        shuffle=randomize,
                        drop_last=randomize,
                        collate_fn=collect_tr,
                        num_workers=n_jobs,
                        pin_memory=use_gpu)

    def _dev_loader(ds):
        # Evaluation loaders never shuffle or drop batches.
        return DataLoader(ds,
                          batch_size=dv_loader_bs,
                          shuffle=False,
                          drop_last=False,
                          collate_fn=collect_dv,
                          num_workers=n_jobs,
                          pin_memory=pin_memory)

    # dv_set may be a single dataset or a list of datasets.
    if type(dv_set) is list:
        dv_set = [_dev_loader(ds) for ds in dv_set]
    else:
        dv_set = _dev_loader(dv_set)

    # Status line for the caller's log.
    data_msg.append('I/O spec.  | Audio Feature = {}\t| Feature Dim = {}\t| Token Type = {}\t| Vocab Size = {}'\
                    .format(audio['feat_type'],feat_dim,tokenizer.token_type,tokenizer.vocab_size))
    return tr_set, dv_set, feat_dim, tokenizer.vocab_size, tokenizer, data_msg
def load_wav_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio,
                     text):
    """Build raw-waveform train/dev DataLoaders.

    When mode == 'train', tr_set is the train split and dv_set the dev
    split; when mode == 'eval', tr_set is the dev split and dv_set the
    test split.
    """
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, data_msg = create_dataset(
        tokenizer, ascending, text['mode'], **corpus)

    # Audio readers: augmentation only applies on the training side.
    tr_reader = ReadAudio(SAMPLE_RATE, mode=mode, time_aug=audio['time_aug'])
    dv_reader = ReadAudio(SAMPLE_RATE, mode='eval', time_aug=audio['time_aug'])

    collect_tr = partial(collect_wav_batch, audio_reader=tr_reader, mode=mode)
    collect_dv = partial(collect_wav_batch, audio_reader=dv_reader,
                         mode='eval')

    # Shuffle/drop-last only for real training (curriculum order kept).
    randomize = mode == 'train' and not ascending
    tr_set = DataLoader(tr_set,
                        batch_size=tr_loader_bs,
                        shuffle=randomize,
                        drop_last=randomize,
                        collate_fn=collect_tr,
                        num_workers=n_jobs,
                        pin_memory=use_gpu)

    def _dev_loader(ds):
        # Evaluation loaders never shuffle or drop batches.
        return DataLoader(ds,
                          batch_size=dv_loader_bs,
                          shuffle=False,
                          drop_last=False,
                          collate_fn=collect_dv,
                          num_workers=n_jobs,
                          pin_memory=pin_memory)

    # dv_set may be a single dataset or a list of datasets.
    if type(dv_set) is list:
        dv_set = [_dev_loader(ds) for ds in dv_set]
    else:
        dv_set = _dev_loader(dv_set)

    return tr_set, dv_set, tokenizer.vocab_size, tokenizer, data_msg