def _multi_instances_parallel_dataloader_worker():
    """Drive a train loader and a val loader concurrently over one dataset.

    Exercises both ``divide`` modes; every fifth train batch triggers a full
    pass over the validation loader while the train loader is still live.
    """
    dataset = init_dataset()
    for divide_flag in (True, False):
        train_loader = DataLoader(
            dataset,
            sampler=RandomSampler(dataset, batch_size=4, drop_last=False),
            num_workers=2,
            divide=divide_flag,
            preload=True,
        )
        val_loader = DataLoader(
            dataset,
            sampler=RandomSampler(dataset, batch_size=10, drop_last=False),
            num_workers=2,
            divide=divide_flag,
            preload=True,
        )
        for step, (data, label) in enumerate(train_loader):
            assert data._tuple_shape == (4, 1, 32, 32)
            assert label._tuple_shape == (4,)
            if step % 5 == 0:
                # interleave a complete validation sweep mid-training
                for val_data, val_label in val_loader:
                    assert val_data._tuple_shape == (10, 1, 32, 32)
                    assert val_label._tuple_shape == (10,)
def test_dataloader_init():
    """DataLoader constructor validation, defaults, and length semantics."""
    dataset = init_dataset()

    # each of these argument combinations must be rejected up front
    for bad_kwargs in (
        dict(num_workers=2, divide=True),
        dict(num_workers=-1),
        dict(timeout=-1),
        dict(num_workers=0, divide=True),
    ):
        with pytest.raises(ValueError):
            DataLoader(dataset, **bad_kwargs)

    # defaults: sequential sampling, identity transform, stock collator
    loader = DataLoader(dataset)
    assert isinstance(loader.sampler, SequentialSampler)
    assert isinstance(loader.transform, PseudoTransform)
    assert isinstance(loader.collator, Collator)

    # len() reflects drop_last: the partial tail batch is counted only
    # when drop_last is False
    loader = DataLoader(
        dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=False)
    )
    assert len(loader) == 17
    loader = DataLoader(
        dataset, sampler=RandomSampler(dataset, batch_size=6, drop_last=True)
    )
    assert len(loader) == 16
def test_dataloader_parallel():
    """Multi-worker loading yields correctly shaped batches in both divide modes."""
    # set max shared memory to 100M
    os.environ["MGE_PLASMA_MEMORY"] = "100000000"
    dataset = init_dataset()
    # same configuration, divide off then on — matches the original ordering
    for divide_flag in (False, True):
        loader = DataLoader(
            dataset,
            sampler=RandomSampler(dataset, batch_size=4, drop_last=False),
            num_workers=2,
            divide=divide_flag,
            preload=True,
        )
        for data, label in loader:
            assert data._tuple_shape == (4, 1, 32, 32)
            assert label._tuple_shape == (4,)
def test_RandomSampler():
    """RandomSampler yields a shuffled permutation of the dataset indices."""
    original = list(range(20))
    sampler = RandomSampler(ArrayDataset(copy.deepcopy(original)))
    # one pass: order differs from the natural one ...
    assert [item[0] for item in sampler] != original
    # ... another pass: still exactly the original index set
    assert sorted(item[0] for item in sampler) == original
def test_random_sampler_seed():
    """Samplers with equal seeds agree; a different seed diverges.

    NOTE: the draw order below intentionally mirrors the original test —
    each ``draw`` call is one full iteration of a sampler, and samplers may
    advance their RNG per pass, so call sequence is significant.
    """
    base = list(range(20))

    def draw(sampler):
        # one complete pass over the sampler, collecting the sampled indices
        return [item[0] for item in sampler]

    same_seed_a = RandomSampler(ArrayDataset(copy.deepcopy(base)), seed=0)
    same_seed_b = RandomSampler(ArrayDataset(copy.deepcopy(base)), seed=0)
    other_seed = RandomSampler(ArrayDataset(copy.deepcopy(base)), seed=1)

    # every sampler shuffles away from the natural order
    assert draw(same_seed_a) != base
    assert draw(same_seed_b) != base
    assert draw(other_seed) != base
    # yet each draw is a permutation of the full index set
    assert sorted(draw(same_seed_a)) == base
    assert sorted(draw(same_seed_b)) == base
    assert sorted(draw(other_seed)) == base
    # identical seeds reproduce each other; a different seed does not
    assert draw(same_seed_a) == draw(same_seed_b)
    assert draw(same_seed_a) != draw(other_seed)
def test_dataloader_serial():
    """Single-process DataLoader produces batches of the configured size."""
    dataset = init_dataset()
    loader = DataLoader(
        dataset, sampler=RandomSampler(dataset, batch_size=4, drop_last=False)
    )
    for data, label in loader:
        assert data.shape == (4, 1, 32, 32)
        assert label.shape == (4,)
def test_dataloader_parallel_worker_exception():
    """An exception inside a worker process surfaces as a 'worker died' RuntimeError."""
    dataset = init_dataset()

    class FakeErrorTransform(Transform):
        def __init__(self):
            pass

        def apply(self, input):
            # intentionally reference the undefined name ``x`` so the
            # worker crashes with a NameError
            y = x + 1
            return input

    loader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset, batch_size=4, drop_last=False),
        transform=FakeErrorTransform(),
        num_workers=2,
    )
    with pytest.raises(RuntimeError, match=r"worker.*died"):
        # fetching the first batch forces the crashing transform to run
        next(iter(loader))
def get_dataloader(self, examples, batch_size, is_random=False):
    """Build a DataLoader over *examples*.

    Converts the examples to model features, wraps them in an
    ``ArrayDataset``, and samples them randomly or sequentially (both with
    ``drop_last=True``) depending on *is_random*.

    Returns:
        (dataloader, num_features) tuple.
    """
    features = convert_examples_to_features(
        examples, self.label_list, self.args.max_seq_length, self.tokenizer
    )
    input_ids, input_mask, segment_ids, label_ids = self.to_inputs(features)
    dataset = ArrayDataset(input_ids, input_mask, segment_ids, label_ids)

    # shuffled sampling for training, deterministic order otherwise
    sampler_cls = RandomSampler if is_random else SequentialSampler
    sampler = sampler_cls(dataset=dataset, batch_size=batch_size, drop_last=True)

    return DataLoader(dataset=dataset, sampler=sampler), len(features)
def test_dataloader_parallel_timeout():
    """A worker slower than the configured timeout raises a timeout RuntimeError."""
    dataset = init_dataset()

    class TimeoutTransform(Transform):
        def __init__(self):
            pass

        def apply(self, input):
            # sleep far longer than the loader's 2-second timeout
            time.sleep(10)
            return input

    loader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset, batch_size=4, drop_last=False),
        transform=TimeoutTransform(),
        num_workers=2,
        timeout=2,
    )
    with pytest.raises(RuntimeError, match=r".*timeout.*"):
        # the first batch can never arrive in time
        next(iter(loader))
def fetch_dataloader(params):
    """Build the train/val/test DataLoaders described by *params*.

    Returns a dict with keys ``"train"``, ``"val"`` and ``"test"``; the
    ``"val"``/``"test"`` entries are None unless the split appears in
    ``params.eval_type``.
    """
    input_transform = fetch_input_transform()
    spatial_transform = fetch_spatial_transform(params)

    benchmark_path_gof_clean = "dataset/GOF_Clean.npy"
    benchmark_path_gof_final = "dataset/GOF_Final.npy"

    if params.dataset_type == "GOF":
        train_ds = BaseDataset(input_transform, spatial_transform)
        val_ds = TestDataset(benchmark_path_gof_clean, input_transform)
        # test split evaluates on both the Clean and Final benchmarks
        test_ds = ConcatDataset(
            [
                TestDataset(benchmark_path_gof_clean, input_transform),
                TestDataset(benchmark_path_gof_final, input_transform),
            ]
        )

    dataloaders = {}

    # default training loader: shuffled batches, partial tail dropped
    train_sampler = RandomSampler(
        train_ds, batch_size=params.train_batch_size, drop_last=True
    )
    dataloaders["train"] = DataLoader(
        train_ds, train_sampler, num_workers=params.num_workers
    )

    # choose val and/or test loaders according to params.eval_type
    for split in ("val", "test"):
        if split not in params.eval_type:
            dataloaders[split] = None
            continue
        if split == "val":
            sampler = SequentialSampler(val_ds, batch_size=params.eval_batch_size)
            dataloaders[split] = DataLoader(
                val_ds, sampler, num_workers=params.num_workers
            )
        elif split == "test":
            sampler = SequentialSampler(test_ds, batch_size=params.eval_batch_size)
            dataloaders[split] = DataLoader(
                test_ds, sampler, num_workers=params.num_workers
            )
        else:
            # defensive: unreachable with the fixed split tuple above
            raise ValueError("Unknown eval_type in params, should in [val, test]")

    return dataloaders
from megengine.data.dataset import MNIST
from megengine.data import DataLoader
from megengine.data.transform import ToMode, Pad, Normalize, Compose
from megengine.data.sampler import RandomSampler, SequentialSampler

# When running in the MegStudio environment, change MNIST_DATA_PATH to
# /home/megstudio/dataset/MNIST/
MNIST_DATA_PATH = "./datasets/MNIST/"

# Load the train/test splits; set download=True if the dataset is not
# already present locally.
train_dataset = MNIST(root=MNIST_DATA_PATH, train=True, download=False)
test_dataset = MNIST(root=MNIST_DATA_PATH, train=False, download=False)

batch_size = 64

# Samplers: shuffled order for training, fixed order for evaluation.
train_sampler = RandomSampler(train_dataset, batch_size=batch_size)
test_sampler = SequentialSampler(test_dataset, batch_size=batch_size)

# Preprocessing pipeline: normalize, pad each border by 2 pixels,
# then convert the image layout to CHW.
transform = Compose([
    Normalize(mean=0.1307 * 255, std=0.3081 * 255),
    Pad(2),
    ToMode('CHW'),
])

# Build the DataLoaders.
train_dataloader = DataLoader(train_dataset, train_sampler, transform)
test_dataloader = DataLoader(test_dataset, test_sampler, transform)

for X, y in train_dataloader:
    print("Shape of X: ", X.shape)  # [N, C, H, W]