def get_loader(dataset, cfg, mode='train'):
    assert mode in ('train', 'eval')
    if mode == 'train':
        sampler = RandomSampler(dataset, batch_size=cfg.data.samples_per_gpu, drop_last=True, seed=0)
        loader = DataLoader(dataset, sampler, num_workers=cfg.data.workers_per_gpu)
    else:
        samples_per_gpu = cfg.data.get('eval_samples_per_gpu', cfg.data.samples_per_gpu)
        workers_per_gpu = cfg.data.get('eval_workers_per_gpu', cfg.data.workers_per_gpu)
        if cfg.evaluation.multi_process is True:
            sampler = SequentialSampler(dataset, batch_size=samples_per_gpu, drop_last=False)
        else:
            sampler = SequentialSampler(dataset, batch_size=samples_per_gpu, drop_last=False, world_size=1, rank=0)
        loader = DataLoader(dataset, sampler, num_workers=workers_per_gpu)
    return loader
def get_dataloader():
    instance_num = 102400
    datas = []
    labels = []
    masks = []
    for i in range(instance_num):
        cur_len = np.random.randint(MINLEN, MAXLEN + 1)
        inp_seq = np.zeros((MAXLEN + 1, len(CHARSET)), dtype='int32')
        cur_len = MAXLEN  # note: the sampled length above is overridden, so every sequence is MAXLEN long
        mask = np.zeros((MAXLEN + 1, ), dtype='int32')
        out_seq = np.zeros((MAXLEN + 1, ), dtype='int32')
        inp_seq[cur_len][len(CHARSET) - 1] = 1  # end-of-sequence marker
        out_seq[cur_len] = len(CHARSET) - 1
        mask[:cur_len + 1] = 1
        for j in range(cur_len):
            pos = np.random.randint(1, len(CHARSET) - 1)  # do not generate '@' and '-'
            inp_seq[j][pos] = 1
            out_seq[cur_len - 1 - j] = pos  # target is the reversed sequence
        datas.append(inp_seq)
        labels.append(out_seq)
        masks.append(mask)
    reverse_dataset = ArrayDataset(datas, labels, masks)
    random_sampler = RandomSampler(reverse_dataset, batch_size)
    dataloader = DataLoader(reverse_dataset, random_sampler)
    return dataloader
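# Hedged usage sketch (an addition, not from the original snippet): get_dataloader() above relies on
# the module-level names MINLEN, MAXLEN, CHARSET and batch_size. The values below are illustrative
# assumptions, chosen only to show how the loader could be consumed and what batch shapes to expect.
import numpy as np
from megengine.data import DataLoader, RandomSampler
from megengine.data.dataset import ArrayDataset

CHARSET = "@abcdefghijklmnopqrstuvwxyz-"  # assumed: first and last symbols are reserved markers
MINLEN, MAXLEN = 5, 10                    # assumed sequence-length bounds
batch_size = 64                           # assumed batch size

loader = get_dataloader()
inp, out, mask = next(iter(loader))
print(inp.shape, out.shape, mask.shape)   # expected: (64, MAXLEN + 1, len(CHARSET)), (64, MAXLEN + 1), (64, MAXLEN + 1)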
def build_dataloader(batch_size, data_dir, cfg):
    train_dataset = data_mapper[cfg.train_dataset["name"]](
        os.path.join(data_dir, cfg.train_dataset["name"], cfg.train_dataset["root"]),
        os.path.join(data_dir, cfg.train_dataset["name"], cfg.train_dataset["ann_file"]),
        remove_images_without_annotations=True,
        order=["image", "boxes", "boxes_category", "info"],
    )
    train_sampler = build_sampler(train_dataset, batch_size)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose(
            transforms=[
                T.ShortestEdgeResize(cfg.train_image_short_size, cfg.train_image_max_size),
                T.RandomHorizontalFlip(),
                T.ToMode(),
            ],
            order=["image", "boxes", "boxes_category"],
        ),
        collator=DetectionPadCollator(),
        num_workers=2,
    )
    return {"train": train_dataloader}
def build_dataloader(rank, world_size, data_root, ann_file):
    val_dataset = COCOJoints(data_root, ann_file, image_set="val2017", order=("image", "boxes", "info"))
    val_sampler = SequentialSampler(val_dataset, 1, world_size=world_size, rank=rank)
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        num_workers=4,
        transform=T.Compose(
            transforms=[
                T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
                ExtendBoxes(
                    cfg.test_x_ext,
                    cfg.test_y_ext,
                    cfg.input_shape[1] / cfg.input_shape[0],
                    random_extend_prob=0,
                ),
                RandomBoxAffine(
                    degrees=(0, 0),
                    scale=(1, 1),
                    output_shape=cfg.input_shape,
                    rotate_prob=0,
                    scale_prob=0,
                ),
                T.ToMode(),
            ],
            order=("image", "boxes", "info"),
        ),
    )
    return val_dataloader
def __init__(self, input_dimension, num_points, batch_size=16, istrain=True):
    """Generate the two-class dataset shown in Figure 1; the dataset contains num_points samples."""
    means = [0.1 * n for n in range(input_dimension)]
    scales = [1 for n in range(input_dimension)]
    deviation = [0.05 * (-1 if n % 2 == 0 else 1) for n in range(input_dimension)]
    sd = [0.1 * (-1 if n % 2 == 0 else 1) for n in range(input_dimension)]
    alls = []
    for i in range(input_dimension):
        m, s = means[i], scales[i]
        if not istrain:
            # shift the mean and scale slightly for the evaluation split
            m += deviation[i]
            s += sd[i]
        cur = np.random.normal(m, s, num_points).astype(np.float32).reshape(-1, 1)
        alls.append(cur)
    self.data = np.concatenate(alls, axis=1)
    super().__init__(self.data)
    self.random_sampler = RandomSampler(dataset=self, batch_size=batch_size, seed=1024)
    self.dataloader = DataLoader(dataset=self, sampler=self.random_sampler)
def build_dataloader(dataset_dir, cfg):
    val_dataset = PseudoDetectionDataset(length=5000, order=["image", "info"])
    val_sampler = InferenceSampler(val_dataset, 1)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, num_workers=2)
    return val_dataloader
def build_dataloader(dataset_dir, cfg):
    if cfg.dataset == "VOC2012":
        val_dataset = EvalPascalVOC(
            dataset_dir, "val", order=["image", "mask", "info"]
        )
    elif cfg.dataset == "Cityscapes":
        val_dataset = dataset.Cityscapes(
            dataset_dir, "val", mode="gtFine", order=["image", "mask", "info"]
        )
    else:
        raise ValueError("Unsupported dataset {}".format(cfg.dataset))
    val_sampler = InferenceSampler(val_dataset, 1)
    val_dataloader = DataLoader(
        val_dataset,
        sampler=val_sampler,
        transform=T.Normalize(
            mean=cfg.img_mean, std=cfg.img_std, order=["image", "mask"]
        ),
        num_workers=2,
    )
    return val_dataloader
def build_dataloader(batch_size, dataset_dir, cfg):
    if cfg.dataset == "VOC2012":
        train_dataset = dataset.PascalVOC(dataset_dir, cfg.data_type, order=["image", "mask"])
    elif cfg.dataset == "Cityscapes":
        train_dataset = dataset.Cityscapes(dataset_dir, "train", mode="gtFine", order=["image", "mask"])
    else:
        raise ValueError("Unsupported dataset {}".format(cfg.dataset))
    train_sampler = Infinite(RandomSampler(train_dataset, batch_size, drop_last=True))
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose(
            transforms=[
                T.RandomHorizontalFlip(0.5),
                T.RandomResize(scale_range=(0.5, 2)),
                T.RandomCrop(
                    output_size=(cfg.img_height, cfg.img_width),
                    padding_value=[0, 0, 0],
                    padding_maskvalue=255,
                ),
                T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
                T.ToMode(),
            ],
            order=["image", "mask"],
        ),
        num_workers=2,
    )
    return train_dataloader
def get_dataloader(instance_num=102400):
    datas = []
    labels = []
    positions = []
    for i in range(instance_num):
        inp_seq = np.zeros((MAXLEN, len(CHARSET)), dtype='int32')
        cur_len = MAXLEN
        out_seq = np.zeros((MAXLEN, ), dtype='int32')
        pos_encoding = pos_to_query  # pos_to_query is assumed to be defined at module level
        for j in range(cur_len):
            pos = np.random.randint(1, len(CHARSET) - 1)  # do not generate '@' and '-'
            inp_seq[j][pos] = 1
            out_seq[cur_len - 1 - j] = pos  # target is the reversed sequence
        datas.append(inp_seq)
        labels.append(out_seq)
        positions.append(pos_encoding)
    reverse_dataset = ArrayDataset(datas, labels, positions)
    random_sampler = RandomSampler(reverse_dataset, batch_size)
    dataloader = DataLoader(reverse_dataset, random_sampler)
    return dataloader
def build_dataloader():
    train_dataset = MNIST(root=gettempdir(), train=True, download=True)
    dataloader = DataLoader(
        train_dataset,
        transform=Compose([
            Normalize(mean=0.1307 * 255, std=0.3081 * 255),
            Pad(2),
            ToMode("CHW"),
        ]),
        sampler=RandomSampler(dataset=train_dataset, batch_size=64),
    )
    return dataloader
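# Hedged usage sketch (an addition, not from the original snippet): iterating one batch from the
# MNIST loader built above. Pad(2) grows the 28x28 images to 32x32 and ToMode("CHW") moves the
# channel axis first, so each batch should come out as (64, 1, 32, 32) images with (64,) labels.
dataloader = build_dataloader()
for image, label in dataloader:
    print(image.shape, label.shape)  # expected: (64, 1, 32, 32) (64,)
    break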
def get_loader(dataset, cfg):
    samples_per_gpu = cfg.data.test_samples_per_gpu
    workers_per_gpu = cfg.data.test_workers_per_gpu
    sampler = SequentialSampler(dataset, batch_size=samples_per_gpu, drop_last=False)
    loader = DataLoader(dataset, sampler, num_workers=workers_per_gpu)
    return loader
def build_dataloader(rank, world_size, dataset_dir, cfg):
    val_dataset = data_mapper[cfg.test_dataset["name"]](
        os.path.join(dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["root"]),
        os.path.join(dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]),
        order=["image", "info"],
    )
    val_sampler = InferenceSampler(val_dataset, 1, world_size=world_size, rank=rank)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, num_workers=2)
    return val_dataloader
def build_dataloader(rank, world_size, data_dir):
    val_dataset = COCODataset(
        os.path.join(data_dir, "val2017"),
        os.path.join(data_dir, "annotations/instances_val2017.json"),
        order=["image", "info"],
    )
    val_sampler = SequentialSampler(val_dataset, 1, world_size=world_size, rank=rank)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, num_workers=2)
    return val_dataloader
def build_dataset(root=Path('/home/zqh/data/omniglot-py'), nway=5, kshot=1, kquery=1, batch_size=32):
    train_ds = OmniglotDataset(root, nway, kshot, kquery, mode='train')
    train_smp = SequentialSampler(train_ds, drop_last=True, batch_size=batch_size)
    train_loader = DataLoader(train_ds, sampler=train_smp, num_workers=4)
    val_ds = OmniglotDataset(root, nway, kshot, kquery, mode='val')
    # sample from the validation dataset, not the training dataset
    val_smp = SequentialSampler(val_ds, drop_last=True, batch_size=batch_size)
    val_loader = DataLoader(val_ds, sampler=val_smp, num_workers=4)
    return train_loader, val_loader
def fetch_dataloader(params):
    input_transform = fetch_input_transform()
    spatial_transform = fetch_spatial_transform(params)
    benchmark_path_gof_clean = "dataset/GOF_Clean.npy"
    benchmark_path_gof_final = "dataset/GOF_Final.npy"
    if params.dataset_type == "GOF":
        train_ds = BaseDataset(input_transform, spatial_transform)
        val_ds = TestDataset(benchmark_path_gof_clean, input_transform)
        test_ds = ConcatDataset([
            TestDataset(benchmark_path_gof_clean, input_transform),
            TestDataset(benchmark_path_gof_final, input_transform),
        ])
    dataloaders = {}
    # add the default train data loader
    train_sampler = RandomSampler(train_ds, batch_size=params.train_batch_size, drop_last=True)
    train_dl = DataLoader(train_ds, train_sampler, num_workers=params.num_workers)
    dataloaders["train"] = train_dl
    # choose the val or test data loader for evaluation
    for split in ["val", "test"]:
        if split in params.eval_type:
            if split == "val":
                val_sampler = SequentialSampler(val_ds, batch_size=params.eval_batch_size)
                dl = DataLoader(val_ds, val_sampler, num_workers=params.num_workers)
            elif split == "test":
                test_sampler = SequentialSampler(test_ds, batch_size=params.eval_batch_size)
                dl = DataLoader(test_ds, test_sampler, num_workers=params.num_workers)
            else:
                raise ValueError("Unknown eval_type in params, should be in [val, test]")
            dataloaders[split] = dl
        else:
            dataloaders[split] = None
    return dataloaders
def __init__(self, num_points, batch_size=16):
    """Generate the two-class dataset shown in Figure 1; the dataset contains num_points samples."""
    # Initialize a NumPy array of shape (num_points, 2).
    # Each row is a data point (x, y) whose coordinates both fall in the interval [-1, 1].
    # np.random.seed(2020)
    self.data = np.random.rand(num_points, 2).astype(np.float32) * 2 - 1
    # Build labels for the array above: a row (x, y) with x * y < 0 gets label 1, otherwise label 0.
    self.label = np.zeros(num_points, dtype=np.int32)
    for i in range(num_points):
        self.label[i] = 1 if np.prod(self.data[i]) < 0 else 0
    super().__init__(self.data, self.label)
    self.random_sampler = RandomSampler(dataset=self, batch_size=batch_size, seed=1024)
    self.dataloader = DataLoader(dataset=self, sampler=self.random_sampler)
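# Hedged usage sketch (an addition, not from the original snippet): the __init__ above builds its own
# sampler and dataloader, so iterating the dataset's dataloader attribute is enough. The class name
# XORDataset is assumed from the later snippet that calls XORDataset(30000).
dataset = XORDataset(num_points=1000, batch_size=16)
for batch_data, batch_label in dataset.dataloader:
    print(batch_data.shape, batch_label.shape)  # expected: (16, 2) (16,)
    break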
def get_dataloader(self, examples, batch_size, is_random=False):
    features = convert_examples_to_features(
        examples, self.label_list, self.args.max_seq_length, self.tokenizer
    )
    all_input_ids, all_input_mask, all_segment_ids, all_label_ids = self.to_inputs(features)
    dataset = ArrayDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if is_random:
        sampler = RandomSampler(dataset=dataset, batch_size=batch_size, drop_last=True)
    else:
        sampler = SequentialSampler(dataset=dataset, batch_size=batch_size, drop_last=True)
    dataloader = DataLoader(dataset=dataset, sampler=sampler)
    return dataloader, len(features)
def build_dataloader(batch_size, data_dir, cfg):
    train_dataset = build_dataset(data_dir, cfg)
    train_sampler = build_sampler(train_dataset, batch_size)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose(
            transforms=[
                T.ShortestEdgeResize(
                    cfg.train_image_short_size,
                    cfg.train_image_max_size,
                    sample_style="choice",
                ),
                T.RandomHorizontalFlip(),
                T.ToMode(),
            ],
            order=["image", "boxes", "boxes_category"],
        ),
        collator=DetectionPadCollator(),
        num_workers=2,
    )
    return train_dataloader
# dataset
root_dir = '/data/.cache/dataset/MNIST'
mnist_train_dataset = MNIST(root=root_dir, train=True, download=False)
mnist_test_dataset = MNIST(root=root_dir, train=False, download=False)

random_sampler = RandomSampler(dataset=mnist_train_dataset, batch_size=256)
sequential_sampler = SequentialSampler(dataset=mnist_test_dataset, batch_size=256)

mnist_train_dataloader = DataLoader(
    dataset=mnist_train_dataset,
    sampler=random_sampler,
    transform=Compose([
        RandomResizedCrop(output_size=28),
        # mean and std are the mean and standard deviation of the MNIST data; pixel values are in the range 0~255
        # Normalize(mean=0.1307*255, std=0.3081*255),
        # Pad(2),
        # 'CHW' converts images from (height, width, channel) format to (channel, height, width) format
        # ToMode('CHW'),
    ]),
)
mnist_test_dataloader = DataLoader(
    dataset=mnist_test_dataset,
    sampler=sequential_sampler,
)

# model
from model import get_net
net = get_net()
print(len(train_dataset), len(train_label))
print(len(test_dataset), len(test_label))

from typing import Tuple
from megengine.data.dataset import Dataset

class BostonTrainDataset(Dataset):
    def __init__(self):
        self.data = train_dataset
        self.label = train_label

    def __getitem__(self, index: int) -> Tuple:
        return self.data[index], self.label[index]

    def __len__(self) -> int:
        return len(self.data)

boston_train_dataset = BostonTrainDataset()
print(len(boston_train_dataset))

from megengine.data import DataLoader
from megengine.data import SequentialSampler

sequential_sampler = SequentialSampler(dataset=boston_train_dataset, batch_size=100)
train_dataloader = DataLoader(dataset=boston_train_dataset, sampler=sequential_sampler)

for batch_data, batch_label in train_dataloader:
    print(batch_data.shape, batch_label.shape, len(train_dataloader))
    break
from megengine.data import DataLoader
from megengine.data.transform import ToMode, Pad, Normalize, Compose
from megengine.data.sampler import RandomSampler, SequentialSampler

# If you are using the MegStudio environment, set MNIST_DATA_PATH to /home/megstudio/dataset/MNIST/
MNIST_DATA_PATH = "./datasets/MNIST/"

# Get the training and test datasets; if they are not available locally, set download=True
train_dataset = MNIST(root=MNIST_DATA_PATH, train=True, download=False)
test_dataset = MNIST(root=MNIST_DATA_PATH, train=False, download=False)

batch_size = 64

# Create samplers
train_sampler = RandomSampler(train_dataset, batch_size=batch_size)
test_sampler = SequentialSampler(test_dataset, batch_size=batch_size)

# Data preprocessing
transform = Compose([
    Normalize(mean=0.1307 * 255, std=0.3081 * 255),
    Pad(2),
    ToMode('CHW'),
])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, train_sampler, transform)
test_dataloader = DataLoader(test_dataset, test_sampler, transform)

for X, y in train_dataloader:
    print("Shape of X: ", X.shape)  # [N, C, H, W]
    print("Shape of y: ", y.shape, y.dtype)
    break
path = "unet_j.mge" @trace def train_func(data, label, net=None, optimizer=None): net.train() pred = net(data) loss = F.cross_entropy_with_softmax(pred, label) optimizer.backward(loss) return pred, loss train_dataset = u_data("./data/train", order=["image", "mask"]) dataloader = DataLoader(train_dataset, transform=Compose([ToMode('CHW')]), sampler=RandomSampler(dataset=train_dataset, batch_size=4, drop_last=True)) unet = Unet(1, 4) optimizer = optim.SGD(unet.parameters(), lr=0.05) trace.enabled = True total_epochs = 50 loss_src = 100000000 for epoch in range(total_epochs): total_loss = 0 correct = 0 total = 0 sta = time.time()
    return logits, loss

# Static graph mode is much faster than dynamic graph mode
trace.enabled = True

if __name__ == '__main__':
    np.random.seed(39)
    train_dataset = XORDataset(30000)
    test_dataset = XORDataset(10000)
    # Why does train_dataset have to be passed twice here?
    train_sampler = RandomSampler(dataset=train_dataset, batch_size=32, drop_last=True)
    train_loader = DataLoader(train_dataset, sampler=train_sampler)
    test_sampler = SequentialSampler(dataset=test_dataset, batch_size=32, drop_last=False)
    test_loader = DataLoader(test_dataset, sampler=test_sampler)
    # draw_dataset(train_dataset)

    model = build_model()
    optimizer = optim.SGD(
        model.parameters(),
        lr=0.01,
    )
    data = mge.tensor()