def create_textset(tokenizer, train_split, dev_split, name, path, bucketing, batch_size):
    '''Interface for creating all kinds of text dataset.

    Args:
        tokenizer: Text tokenizer used to encode transcripts.
        train_split: List of training split names, e.g. ['train-clean-100'].
        dev_split: List of validation split names, e.g. ['dev-clean'].
        name: Corpus name, e.g. 'librispeech' or 'dlhlp' (case-insensitive).
        path: Corpus root directory.
        bucketing: If True, the training dataset pre-groups `batch_size`
            samples into one bucket and the loader batch size becomes 1.
        batch_size: Batch size for the data loaders.

    Returns:
        (tr_set, dv_set, tr_loader_bs, dv_loader_bs, msg_list)

    Raises:
        NotImplementedError: If `name` is not a recognized corpus.
    '''
    # Recognize corpus
    if name.lower() == "librispeech":
        from corpus.librispeech import LibriTextDataset as Dataset
    elif name.lower() == "dlhlp":
        from corpus.dlhlp import DlhlpTextDataset as Dataset
    else:
        raise NotImplementedError
    # Create dataset.  With bucketing, batching is done inside the dataset
    # (bucket_size = batch_size) so the DataLoader batch size must be 1;
    # without bucketing it is the other way around.
    bucket_size = batch_size if bucketing else 1
    tr_loader_bs = 1 if bucketing else batch_size
    # Do not use bucketing for dev set
    dv_set = Dataset(path, dev_split, tokenizer, 1)
    tr_set = Dataset(path, train_split, tokenizer, bucket_size)
    # Messages to show (note: the dead `msg_list = []` initializer that was
    # immediately overwritten has been removed)
    msg_list = _data_msg(name, path, train_split.__str__(), len(tr_set),
                         dev_split.__str__(), len(dv_set), batch_size, bucketing)
    return tr_set, dv_set, tr_loader_bs, batch_size, msg_list
def create_dataset(tokenizer, ascending, name, path, bucketing, batch_size,
                   train_split=None, dev_split=None, test_split=None):
    '''Interface for creating all kinds of dataset.

    name: Dataset name, e.g LibriSpeech
    path: Dataset root dir, e.g data/LibriSpeech
    bucketing: whether to pre-batch samples inside the dataset
    batch_size: loader batch size
    train_split: list of training data, e.g, ['train-clean-100','train-clean-360','train-other-500']
    dev_split: list of validation set, e.g, ['dev-clean']
    test_split: list of testing set.

    NOTE(review): a later definition of `create_dataset` in this file reuses
    this name and shadows this function at import time — confirm which
    version is actually intended to be used.
    '''
    # Recognize corpus
    if name.lower() != "librispeech":
        raise NotImplementedError
    from corpus.librispeech import LibriDataset as Dataset

    if train_split is None:
        # Testing mode: neither dev nor test set uses bucketing.
        mode = 'test'
        dv_set = Dataset(path, dev_split, tokenizer, 1)
        tt_set = Dataset(path, test_split, tokenizer, 1)
        # Messages to show: reuse the train/dev template, then relabel it
        # for the dev/test pairing.
        msg_list = _data_msg(name, path, dev_split.__str__(), len(dv_set),
                             test_split.__str__(), len(tt_set), batch_size, False)
        msg_list = [m.replace('Dev', 'Test').replace('Train', 'Dev')
                    for m in msg_list]
        return dv_set, tt_set, batch_size, batch_size, mode, msg_list

    # Training mode.  Bucketing (unless ascending-length mode is on) moves
    # batching into the dataset, so the loader batch size drops to 1.
    mode = 'train'
    use_buckets = bucketing and (not ascending)
    tr_loader_bs = 1 if use_buckets else batch_size
    bucket_size = batch_size if use_buckets else 1  # Ascending without bucketing
    # Do not use bucketing for dev set
    dv_set = Dataset(path, dev_split, tokenizer, 1)
    tr_set = Dataset(path, train_split, tokenizer, bucket_size,
                     ascending=ascending)
    # Messages to show
    msg_list = _data_msg(name, path, train_split.__str__(), len(tr_set),
                         dev_split.__str__(), len(dv_set), batch_size, bucketing)
    return tr_set, dv_set, tr_loader_bs, batch_size, mode, msg_list
def create_dataset(tokenizer, ascending, name, path, bucketing, batch_size,
                   test_path, train_split=None, dev_split=None, test_split=None):
    '''Interface for creating all kinds of dataset.

    Args:
        tokenizer: Text tokenizer used to encode transcripts.
        ascending: If True, sort training data by length (disables bucketing).
        name: Corpus name, e.g. 'librispeech' or 'dlhlp' (case-insensitive).
        path: Corpus root directory for train/dev data.
        bucketing: If True (and not ascending), pre-batch inside the dataset.
        batch_size: Loader batch size.
        test_path: Corpus root directory used for the test set in test mode.
        train_split/dev_split/test_split: Lists of split names.

    Returns:
        Train mode: (tr_set, dv_set, tr_loader_bs, dv_loader_bs, mode, msg_list)
        Test mode:  (dv_set, tt_set, batch_size, batch_size, mode, msg_list)

    Raises:
        NotImplementedError: If `name` is not a recognized corpus.
    '''
    # Recognize corpus
    if name.lower() == "librispeech":
        from corpus.librispeech import LibriDataset as Dataset
    elif name.lower() == "dlhlp":
        from corpus.dlhlp import DlhlpDataset as Dataset
    else:
        raise NotImplementedError
    # Create dataset
    if train_split is not None:
        # Training mode.  Bucketing (unless ascending mode) moves batching
        # into the dataset, so the loader batch size drops to 1.
        mode = 'train'
        tr_loader_bs = 1 if bucketing and (not ascending) else batch_size
        bucket_size = batch_size if bucketing and (
            not ascending) else 1  # Ascending without bucketing
        # Do not use bucketing for dev set
        dv_set = Dataset(path, dev_split, tokenizer, 1)
        tr_set = Dataset(path, train_split, tokenizer, bucket_size,
                         ascending=ascending)
        # Messages to show
        msg_list = _data_msg(name, path, train_split.__str__(), len(tr_set),
                             dev_split.__str__(), len(dv_set), batch_size,
                             bucketing)
        return tr_set, dv_set, tr_loader_bs, batch_size, mode, msg_list
    else:
        # Testing mode
        mode = 'test'
        # NOTE(review): dv_set intentionally loads the TEST split from
        # test_path here (the original dev-split load was commented out) —
        # confirm this is the intended behavior for evaluation.
        dv_set = Dataset(test_path, test_split, tokenizer, 1)
        # Do not use bucketing for test set.
        # (A leftover debug `print(path)` was removed here.)
        tt_set = Dataset(test_path, test_split, tokenizer, 1)
        # Messages to show: reuse the train/dev template, then relabel it
        # for the dev/test pairing.
        msg_list = _data_msg(name, path, dev_split.__str__(), len(dv_set),
                             test_split.__str__(), len(tt_set), batch_size,
                             False)
        msg_list = [m.replace('Dev', 'Test').replace('Train', 'Dev')
                    for m in msg_list]
        return dv_set, tt_set, batch_size, batch_size, mode, msg_list
def repro_load_dataset(n_jobs, use_gpu, pin_memory, ascending, corpus, audio, text):
    '''Prepare the dataloader for testing (reproduction runs).

    Builds the audio feature extractor and text tokenizer, wraps the DLHLP
    test split in a DataLoader with batch size 1, and returns everything the
    caller needs for decoding.  (`use_gpu` and `ascending` are accepted for
    interface compatibility but not used here.)

    Returns:
        (dv_set, feat_dim, vocab_size, tokenizer, msg)
    '''
    # Audio feature extractor
    audio_transform, feat_dim = create_transform(audio.copy())
    # Text tokenizer
    tokenizer = load_text_encoder(**text)
    # Dataset (in testing mode, tr_set=dv_set, dv_set=tt_set)
    from corpus.dlhlp import DlhlpDataset as Dataset
    test_dataset = Dataset(corpus['path'], corpus['test_split'], tokenizer, 1)
    # Collate function bound to test-mode audio processing
    collate_fn = partial(collect_audio_batch,
                         audio_transform=audio_transform, mode='test')
    # Create data loader
    dv_set = DataLoader(test_dataset,
                        batch_size=1,
                        shuffle=False,
                        drop_last=False,
                        collate_fn=collate_fn,
                        num_workers=n_jobs,
                        pin_memory=pin_memory)
    msg = 'I/O spec. | Audio feature = {}\t| feature dim = {}\t| Token type = {}\t| Vocab size = {}'.format(
        audio['feat_type'], feat_dim, tokenizer.token_type, tokenizer.vocab_size)
    return dv_set, feat_dim, tokenizer.vocab_size, tokenizer, msg
def _create_dataset(tokenizer, ascending, name, path, bucketing, batch_size, train_split=None, dev_split=None, test_split=None): ''' Interface for creating all kinds of dataset''' # Recognize corpus if name.lower() == "librispeech": from corpus.librispeech import LibriDataset as Dataset elif name.lower() == "dlhlp": from corpus.dlhlp import DlhlpDataset as Dataset elif name.lower() == 'external': from corpus.external import ExternalDataset as Dataset else: raise NotImplementedError # Testing model mode = 'test' # Do not use bucketing for test set tt_set = Dataset(path, test_split, tokenizer, 1) # Messages to show return tt_set, batch_size, mode, []