def D0217(data_dir, batch_size) -> Tuple[List[Tuple[int, DataLoader, DataLoader]], DataLoader, List[int]]:
    data = np.load(data_dir / "0201.npz")
    X_train = data["X_train"][:, :6]
    Y_train = data["Y_train"]
    X_test = data["X_test"][:, :6]
    X_train = tensor(X_train, dtype=torch.float32)
    Y_train = tensor(Y_train, dtype=torch.long)
    X_test = tensor(X_test, dtype=torch.float32)
    print(X_train.shape, Y_train.shape, X_test.shape)

    # samples_per_cls
    samples_per_cls = [(Y_train == i).sum().item() for i in range(61)]
    print(samples_per_cls)

    ds = C0215(X_train, Y_train)
    ds_test = C0215(X_test)
    dl_kwargs = dict(batch_size=batch_size, num_workers=6, pin_memory=True)
    dl_test = DataLoader(ds_test, **dl_kwargs, shuffle=False)

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=261342)
    dl_list = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, Y_train), 1):
        ds_train = Subset(ds, train_idx)
        ds_valid = Subset(ds, valid_idx)
        dl_train = DataLoader(ds_train, **dl_kwargs, shuffle=True)
        dl_valid = DataLoader(ds_valid, **dl_kwargs, shuffle=False)
        dl_list.append((fold, dl_train, dl_valid))
    return dl_list, dl_test, samples_per_cls

def __init__(self, args, device):
    super(AVMNISTSearcher, self).__init__(args)
    self.device = device

    # Handle data
    transformer = transforms.Compose([
        avmnist_data.ToTensor(),
        avmnist_data.Normalize((0.1307,), (0.3081,))
    ])
    dataset_training = avmnist_data.AVMnist(args.datadir, transform=transformer, stage='train')
    dataset_validate = avmnist_data.AVMnist(args.datadir, transform=transformer, stage='train')

    train_indices = list(range(0, 55000))
    valid_indices = list(range(55000, 60000))
    train_subset = Subset(dataset_training, train_indices)
    valid_subset = Subset(dataset_validate, valid_indices)

    trainloader = torch.utils.data.DataLoader(train_subset, batch_size=args.batchsize,
                                              shuffle=False, num_workers=args.num_workers,
                                              pin_memory=True)
    devloader = torch.utils.data.DataLoader(valid_subset, batch_size=args.batchsize,
                                            shuffle=False, num_workers=args.num_workers,
                                            pin_memory=True)
    self.dataloaders = {'train': trainloader, 'dev': devloader}

def better_random_split(dataset_enhanced, dataset_clean, fraction):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths... but better!

    Arguments:
        dataset_enhanced: the dataset with training-time transforms
        dataset_clean: the dataset with only the necessary transforms, in sequential order
        fraction: the fraction of samples assigned to the (shuffled) training split
    """
    assert fraction < 1, "Fraction should be < 1"
    assert len(dataset_enhanced) == len(dataset_clean)
    total_length = len(dataset_enhanced)
    train_length = int(fraction * total_length)
    eval_length = total_length - train_length
    # Pick a random start for a contiguous evaluation block; everything else is training data.
    val_idx0 = np.random.randint(train_length)
    train_idx_lst = np.append(np.arange(val_idx0), np.arange(val_idx0 + eval_length, total_length))
    eval_idx_lst = np.arange(val_idx0, val_idx0 + eval_length)
    np.random.shuffle(train_idx_lst)
    return Subset(dataset_enhanced, train_idx_lst), Subset(dataset_clean, eval_idx_lst)

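# --- Illustrative usage sketch (not part of the original code) ---
# A tiny TensorDataset stands in for both the augmented and the clean dataset,
# and 0.8 is an arbitrary fraction; this only demonstrates the call signature
# and the resulting split sizes.
import torch
from torch.utils.data import TensorDataset

_toy = TensorDataset(torch.randn(100, 3), torch.randint(0, 10, (100,)))
_train_part, _eval_part = better_random_split(_toy, _toy, fraction=0.8)
print(len(_train_part), len(_eval_part))  # 80 20
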
def cv_split(dataset, n, augmentation=None):
    """
    Split the dataset into n non-overlapping new datasets, where one is used for
    testing, and return a list that contains the sequence of splits.

    Arguments:
        dataset (Dataset): dataset to be split
        n (int): number of non-overlapping folds
        augmentation: optional augmentations applied to each training split
    """
    # NOTE: unshuffled KFold produces deterministic, contiguous folds, so no
    # random_state is passed (recent scikit-learn versions raise an error when
    # random_state is set while shuffle is False).
    cv = KFold(n_splits=n)
    res = []
    for train_index, test_index in cv.split(dataset):
        train_set = Subset(dataset, train_index)
        test_set = Subset(dataset, test_index)
        if augmentation is not None:
            augmented_set = AugmentedDataSet(train_set, augmentation)
        else:
            augmented_set = train_set
        res.append((train_set, test_set, augmented_set))
    return res

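# --- Illustrative usage sketch (not part of the original code) ---
# Runs cv_split without augmentation on a toy TensorDataset; 5 folds is an
# arbitrary choice. Each tuple holds (train_set, test_set, augmented_set),
# where augmented_set equals train_set when no augmentation is given.
import torch
from torch.utils.data import TensorDataset

_folds = cv_split(TensorDataset(torch.randn(25, 3)), n=5)
for _train_set, _test_set, _aug_set in _folds:
    print(len(_train_set), len(_test_set))  # 20 5 in every fold
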
def get_data_loader():
    data_loaders = {}
    data_set = ProjectDataset(file_path="data/triplet/triple_sentences.csv")
    total = len(data_set)
    train_indices = list(range(0, int(total * 0.9)))
    valid_indices = list(range(int(total * 0.9), len(data_set)))
    train_set = Subset(data_set, train_indices)
    valid_set = Subset(data_set, valid_indices)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=False,
                              pin_memory=True, num_workers=4)
    dev_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False,
                            pin_memory=True, num_workers=4)
    data_loaders["train_loader"] = train_loader
    data_loaders["dev_loader"] = dev_loader
    return data_loaders

def get_train_loader(args):
    transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize((args.size, args.size)),
        transforms.ToTensor(),
    ])
    if args.dataset == 'CIFAR10':
        dataset = CIFAR10(root=args.data_dir, train=True,
                          transform=transforms.Compose([
                              transforms.Resize((args.size, args.size)),
                              transforms.ToTensor(),
                          ]))
        dataset = Subset(dataset=dataset, indices=random.sample(range(50000), args.data_size))
    elif args.dataset == 'MNIST':
        dataset = MNIST(root=args.data_dir, train=True, transform=transform)
        dataset = Subset(dataset=dataset, indices=random.sample(range(60000), args.data_size))
    else:
        dataset = SmallNORB(root=args.data_dir, train=True, transform=transform)
        dataset = Subset(dataset=dataset, indices=random.sample(range(48600), args.data_size))
    return DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=args.shuffle,
        num_workers=args.workers,
        pin_memory=True,
    )

def detection_dataloaders(
    data_dir,
    batch_size=1,
    subset_indices=None,
    no_augmentation=False,
    num_workers=0,
):
    train_dataset, test_dataset, num_classes = initialize_detection_datasets(
        data_dir, no_augmentation)
    if subset_indices is not None:
        train_dataset = Subset(train_dataset, subset_indices)
        test_dataset = Subset(test_dataset, subset_indices)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )
    return train_dataloader, test_dataloader, num_classes

def D0206_org_base(data_dir, batch_size, augc) -> Tuple[List[Tuple[int, DataLoader, DataLoader]], DataLoader]:
    data = np.load(data_dir / "0206_org.npz")
    X_train = data["X_train"][:, :6]
    Y_train = data["Y_train"]
    X_test = data["X_test"][:, :6]
    X_train = tensor(X_train, dtype=torch.float32)
    Y_train = tensor(Y_train, dtype=torch.long)
    X_test = tensor(X_test, dtype=torch.float32)
    print(X_train.shape, Y_train.shape, X_test.shape)

    ds = augc(X_train, Y_train)
    ds_test = TensorDataset(X_test)
    dl_kwargs = dict(batch_size=batch_size, num_workers=6, pin_memory=True)
    dl_test = DataLoader(ds_test, **dl_kwargs, shuffle=False)

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=261342)
    dl_list = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train, Y_train), 1):
        ds_train = Subset(ds, train_idx)
        ds_valid = Subset(ds, valid_idx)
        dl_train = DataLoader(ds_train, **dl_kwargs, shuffle=True)
        dl_valid = DataLoader(ds_valid, **dl_kwargs, shuffle=False)
        dl_list.append((fold, dl_train, dl_valid))
    return dl_list, dl_test

def get_dataset(self, n_subset=-1):
    cfg = self.cfg
    train_dataset, valid_dataset = self.get_torch_dataset()
    if n_subset > 0:
        train_dataset = Subset(train_dataset, list(range(100)))
        valid_dataset = Subset(valid_dataset, list(range(100)))
    n_train_iteration = len(train_dataset) // cfg.batch_size
    n_valid_iteration = len(valid_dataset) // cfg.batch_size

    train_dataset = self.to_tf_dataset(train_dataset, shuffle=True)
    valid_dataset = self.to_tf_dataset(valid_dataset, shuffle=False)

    iterator = Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)
    train_init_op = iterator.make_initializer(train_dataset)
    valid_init_op = iterator.make_initializer(valid_dataset)
    input_tensor = iterator.get_next()
    return (
        input_tensor,
        (train_init_op, valid_init_op),
        (n_train_iteration, n_valid_iteration),
    )

def setup_loaders(valid_ratio, path, batch_size):
    dataset = Zinc(path)

    # split into train and valid
    n_samples = len(dataset)
    idx = np.arange(n_samples)
    train_samples = int((1 - valid_ratio) * n_samples)
    train = idx[:train_samples]
    valid = idx[train_samples:]
    train_dataset = Subset(dataset, train)
    valid_dataset = Subset(dataset, valid)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
                              num_workers=4, pin_memory=True)
    return train_loader, valid_loader, dataset

def get_dataloader(dataset):
    """
    Make dataloader from dataset for training.
    """
    train_size = int(
        len(dataset) * (1.0 - CONFIG["training"]["validation_split"]))
    data_loader_train = torch.utils.data.DataLoader(
        Subset(dataset, list(range(0, train_size))),
        batch_size=CONFIG["training"]["batch_size"],
        shuffle=CONFIG["training"]["shuffle"],
        drop_last=True,
    )
    data_loader_val = torch.utils.data.DataLoader(
        Subset(dataset, list(range(train_size, len(dataset)))),
        batch_size=CONFIG["training"]["batch_size"],
        shuffle=False,
        drop_last=False,
    )
    # dataloader of training data for evaluation only
    data_loader_eval_train = torch.utils.data.DataLoader(
        Subset(dataset, list(range(0, train_size))),
        batch_size=CONFIG["training"]["batch_size"],
        shuffle=False,
        drop_last=False,
    )
    return data_loader_train, data_loader_val, data_loader_eval_train

def data_loader_with_split(root, train_split=0.9, batch_size=256, val_label_file='./val_label'):
    input_transform = get_transform()
    dataset_tr = CustomDataset(root, input_transform, target_transform, aug=True)
    dataset_vl = CustomDataset(root, input_transform, target_transform, aug=False)
    split_size = int(len(dataset_tr) * train_split)

    random.seed(1958)
    l = list(range(len(dataset_tr)))
    random.shuffle(l)
    train_idxs = l[:split_size]
    valid_idxs = l[split_size:]
    train_set = Subset(dataset_tr, train_idxs)
    valid_set = Subset(dataset_vl, valid_idxs)
    # train_set, valid_set = data.random_split(dataset, [split_size, len(dataset) - split_size])
    print(len(train_set), len(valid_set))

    tr_loader = data.DataLoader(dataset=train_set, batch_size=batch_size,
                                num_workers=4, pin_memory=True, shuffle=True)
    val_loader = data.DataLoader(dataset=valid_set, batch_size=batch_size,
                                 num_workers=4, pin_memory=True, shuffle=False)

    gt_labels = [valid_set[idx][1] for idx in range(len(valid_set))]
    gt_labels_string = [','.join([str(s.numpy()) for s in l]) for l in list(gt_labels)]
    with open(val_label_file, 'w') as file_writer:
        file_writer.write("\n".join(gt_labels_string))
    print('data_loader_with_split-')
    return tr_loader, val_loader, val_label_file

def get_data_loader():
    data_loaders = {}
    data_set = ProjectDataset(file_path="data/pair/new_split_pair_all.txt")
    split_index = int(len(data_set) * 0.9)
    train_indices = list(range(0, split_index))
    valid_indices = list(range(split_index, len(data_set)))
    train_set = Subset(data_set, train_indices)
    valid_set = Subset(data_set, valid_indices)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                              pin_memory=True, num_workers=4)
    dev_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=True,
                            pin_memory=True, num_workers=4)
    data_loaders["train_loader"] = train_loader
    data_loaders["dev_loader"] = dev_loader
    return data_loaders

def get_cross_validation_kth_fold(dataset: Dataset, k: int, n: int, start_seed: int = 17) -> tuple:
    """Splits the dataset into train and test subsets according to the selected
    cross-validation fold number.

    Parameters
    ----------
    dataset : Dataset
        Dataset to split.
    k : int
        Number of the fold to return.
    n : int
        Number of folds in the cross-validation.
    start_seed : int
        Random seed.

    Returns
    -------
    tuple
        The kth cross-validation fold as (train_subset, test_subset).
    """
    # seed the RNG (the indices below are taken in order, so the split itself is deterministic)
    seed(start_seed)
    ids = arange(len(dataset))
    split_size = int(len(dataset) / n)
    split_train_ids = concatenate(
        (ids[:split_size * k], ids[split_size * (k + 1):]))
    split_test_ids = ids[split_size * k:split_size * (k + 1)]
    train_subdataset = Subset(dataset, split_train_ids)
    test_subdataset = Subset(dataset, split_test_ids)
    return train_subdataset, test_subdataset

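# --- Illustrative usage sketch (not part of the original code) ---
# A toy TensorDataset stands in for the real dataset; 5 folds and fold index 2
# are arbitrary choices. The k-th block of indices is held out for testing and
# the remaining indices form the training subset.
import torch
from torch.utils.data import TensorDataset

_toy = TensorDataset(torch.randn(50, 4))
_train_part, _test_part = get_cross_validation_kth_fold(_toy, k=2, n=5)
print(len(_train_part), len(_test_part))  # 40 10
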
def get_data_loader():
    data_loaders = {}
    data_set = ProjectDataset(
        anchor_file="data/triplet_encode/triple_sentence0_encode.npy",
        positive_file="data/triplet_encode/triple_sentence1_encode.npy",
        negative_file="data/triplet_encode/triple_sentence2_encode.npy")
    total = len(data_set)
    train_indices = list(range(0, int(total * 0.9)))
    valid_indices = list(range(int(total * 0.9), len(data_set)))
    train_set = Subset(data_set, train_indices)
    valid_set = Subset(data_set, valid_indices)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True,
                              pin_memory=True, num_workers=12)
    dev_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=True,
                            pin_memory=True, num_workers=12)
    data_loaders["train_loader"] = train_loader
    data_loaders["dev_loader"] = dev_loader
    return data_loaders

def train_test_split_curve(self, num_train_folds):
    train_indices = []
    for i in range(num_train_folds):
        train_indices += self._fold_indices[i]
    dev_indices = self._fold_indices[3]
    test_indices = self._fold_indices[4]
    return (Subset(self, train_indices),
            Subset(self, dev_indices),
            Subset(self, test_indices))

def main(args=None):
    if args is None:
        args = argument_paser()

    # Set experiment id
    exp_id = str(uuid.uuid4())[:8] if args.exp_id is None else args.exp_id
    print(f'Experiment Id: {exp_id}', flush=True)

    # Fix seed
    torch.manual_seed(args.seed)

    # Config gpu
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Prepare data
    dataset = MovingMnistDataset()
    train_index, valid_index = train_test_split(range(len(dataset)), test_size=0.3)
    train_loader = DataLoader(Subset(dataset, train_index),
                              batch_size=args.batch_size, shuffle=True)
    valid_loader = DataLoader(Subset(dataset, valid_index),
                              batch_size=args.test_batch_size, shuffle=False)
    loaders = {"train": train_loader, "valid": valid_loader}

    model = ConvLSTMEncoderPredictor(image_size=(64, 64)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
    criterion = nn.MSELoss()

    runner = SupervisedRunner(device=catalyst.utils.get_device())
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=None,
        loaders=loaders,
        # model will be saved to {logdir}/checkpoints
        logdir=os.path.join(args.log_dir, exp_id),
        callbacks=[
            CheckpointCallback(save_n_best=args.n_saved),
            EarlyStoppingCallback(
                patience=args.es_patience,
                metric="loss",
                minimize=True,
            )
        ],
        num_epochs=args.epochs,
        main_metric="loss",
        minimize_metric=True,
        fp16=None,
        verbose=True)
    return exp_id, model

def process_raw_dataset(train_raw_dataset, train_labels, test_raw_dataset, test_labels,
                        raw_n_cls, dataset_name, n_cls=None, data_frac=None, biased_cls=None):
    """
    Parameters
    ----------
    train_raw_dataset
        The training dataset from DataLoader
    test_raw_dataset
        The test dataset from DataLoader
    raw_n_cls : int
        The number of classes the raw dataset has.
    dataset_name : str
        The name of the dataset, i.e. "cifar", "svhn", "imagenet"
    n_cls : int
        The number of classes you want the learning model to solve
    data_frac : float
        The fraction of the training data used to train the learning model (0. to 1.)
    biased_cls : list of float, (n_cls,)
        The index corresponds to the index of classes; each element is the fraction
        of that class's data to use in training and must be 0. to 1.

    Returns
    -------
    train_dataset, test_dataset
    """
    print("Processing raw dataset...")
    if n_cls is None and data_frac is None and biased_cls is None:
        return Subset(train_raw_dataset, train_labels[1]), Subset(test_raw_dataset, test_labels[1])
    else:
        if n_cls is not None:
            print("The number of classes: {} -> {}".format(raw_n_cls, n_cls))
            train_labels, test_labels = get_small_class(
                train_labels, test_labels, n_cls)
        if data_frac is not None:
            n_subtrain = int(np.ceil(len(train_labels[0]) * data_frac))
            print("Subsampling: {} images".format(n_subtrain))
            train_labels = np.array([tl[:n_subtrain] for tl in train_labels])
        if biased_cls is not None:
            print("Biased labels")
            train_labels = get_biased_class(train_labels, biased_cls, n_cls, raw_n_cls)
        return Subset(train_raw_dataset, train_labels[1]), Subset(test_raw_dataset, test_labels[1])

def get_train_val_loaders(dataset, datapath=DATA_PATH, train_size=None, val_size=5000,
                          train_batch_size=100, val_batch_size=1000, kwargs=None,
                          train_transform=None, val_transform=None,
                          train_shuffle=True, val_shuffle=False):
    """Support MNIST and CIFAR10"""
    if kwargs is None:
        kwargs = {}
    if train_transform is None:
        train_transform = transforms.ToTensor()
    if val_transform is None:
        val_transform = transforms.ToTensor()
    datapath = os.path.join(datapath, dataset)

    trainset = datasets.__dict__[dataset](datapath, train=True, download=True,
                                          transform=train_transform)
    if train_size is not None:
        assert train_size + val_size <= len(trainset)

    if val_size > 0:
        indices = list(range(len(trainset)))
        trainset = Subset(trainset, indices[val_size:])
        valset = datasets.__dict__[dataset](datapath, train=True, download=True,
                                            transform=val_transform)
        valset = Subset(valset, indices[:val_size])
        val_loader = torch.utils.data.DataLoader(valset, batch_size=val_batch_size,
                                                 shuffle=val_shuffle, **kwargs)
    else:
        val_loader = None

    if train_size is not None:
        trainset = Subset(trainset, list(range(train_size)))

    train_loader = torch.utils.data.DataLoader(trainset, batch_size=train_batch_size,
                                               shuffle=train_shuffle, **kwargs)
    return train_loader, val_loader

def get_data(train_batch_size=100, test_batch_size=100, train_range=None, test_range=None,
             random_labels=False, seed=0):
    """Get CIFAR10 data. If random_labels=True, randomizes the labels.

    Inputs: train_batch_size (default: 100), test_batch_size (default: 100),
            train_range (default: None), test_range (default: None),
            random_labels (default: False), seed (default: 0)
    Return: train dataset, test dataset, train loader, test loader
    """
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])
    transform_train = transforms.Compose([transforms.ToTensor(), normalize])
    transform_test = transforms.Compose([transforms.ToTensor(), normalize])

    train_dataset = datasets.CIFAR10(root='data', train=True, transform=transform_train, download=True)
    test_dataset = datasets.CIFAR10(root='data', train=False, transform=transform_test, download=True)

    if random_labels:
        print("generating random labels with seed {}".format(seed))
        np.random.seed(seed)
        probability_of_random = 1.0
        labels = np.array(train_dataset.targets)
        # mask of length len(labels), with entries drawn uniformly from [0, 1]
        mask = np.random.rand(len(labels)) <= probability_of_random
        # random labels 0-9, one for each masked entry
        rnd_labels = np.random.choice(10, mask.sum())
        labels[mask] = rnd_labels
        labels = [int(x) for x in labels]
        # assign new random labels to the dataset
        train_dataset.targets = labels
        np.savetxt("random_labels.txt", labels)

    if train_range:
        train_dataset = Subset(train_dataset, train_range)
    if test_range:
        test_dataset = Subset(test_dataset, test_range)

    train_loader = DataLoader(dataset=train_dataset, batch_size=train_batch_size,
                              num_workers=4, shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=test_batch_size,
                             num_workers=4, shuffle=False)
    return train_dataset, test_dataset, train_loader, test_loader

def split_train_and_test(dataset):
    n_samples = len(dataset)
    train_size = round(n_samples * 0.7)
    subset1_indices = list(range(0, train_size))
    subset2_indices = list(range(train_size, n_samples))
    train_dataset = Subset(dataset, subset1_indices)
    test_dataset = Subset(dataset, subset2_indices)
    return train_dataset, test_dataset

def randomly_split_into_two_datasets(dataset, length_of_first):
    import random
    indices = [i for i in range(len(dataset))]
    random.shuffle(indices)
    first_dataset = indices[:length_of_first]
    second_dataset = indices[length_of_first:]
    first_dataset.sort()
    second_dataset.sort()
    return [Subset(dataset, first_dataset), Subset(dataset, second_dataset)]

def devide(dataset, test_rate=0.2):
    total_size = len(dataset)
    train_size = int(total_size * (1 - test_rate))
    train_indices = list(range(0, train_size))
    test_indices = list(range(train_size, total_size))
    train_dataset = Subset(dataset, train_indices)
    test_dataset = Subset(dataset, test_indices)
    return train_dataset, test_dataset

def split_dataset(dataset, n, seed=0):
    """
    Return a pair of datasets corresponding to a random split of the given
    dataset, with n data points in the first dataset and the rest in the
    second, using the given random seed.
    """
    assert n <= len(dataset)
    idxes = list(range(len(dataset)))
    np.random.RandomState(seed).shuffle(idxes)
    subset_1 = idxes[:n]
    subset_2 = idxes[n:]
    return Subset(dataset, subset_1), Subset(dataset, subset_2)

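# --- Illustrative usage sketch (not part of the original code) ---
# A toy TensorDataset stands in for the real dataset; taking the first 30
# points with seed 42 is arbitrary. The same seed always yields the same split.
import torch
from torch.utils.data import TensorDataset

_toy = TensorDataset(torch.randn(100, 8))
_first, _rest = split_dataset(_toy, n=30, seed=42)
print(len(_first), len(_rest))  # 30 70
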
def main(args=None):
    if args is None:
        args = argument_paser()

    # Set experiment id
    exp_id = str(uuid.uuid4())[:8] if args.exp_id is None else args.exp_id
    print(f'Experiment Id: {exp_id}', flush=True)

    # Fix seed
    torch.manual_seed(args.seed)

    # Set logger
    log_writer = SummaryWriter(log_dir=os.path.join(
        args.log_dir, exp_id)) if args.log_dir is not None else None

    # Prepare data
    dataset = MovingMnistDataset()
    train_index, valid_index = train_test_split(range(len(dataset)), test_size=0.3)
    train_loader = DataLoader(Subset(dataset, train_index),
                              batch_size=args.batch_size, shuffle=True)
    valid_loader = DataLoader(Subset(dataset, valid_index),
                              batch_size=args.test_batch_size, shuffle=False)

    # Prepare model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = ConvLSTMEncoderPredictor(image_size=(64, 64)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999))
    criterion = nn.MSELoss()

    run(exp_id=exp_id,
        epochs=args.epochs,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=None,
        train_loader=train_loader,
        valid_loader=valid_loader,
        device=device,
        writer=log_writer,
        log_interval=args.log_interval,
        n_saved=args.n_saved,
        save_dir=args.save_model_path,
        es_patience=args.es_patience)

    # log_writer is None when no log_dir was given, so guard the close call
    if log_writer is not None:
        log_writer.close()
    return exp_id, model

def get_data_loaders(args):
    dataset = IKDataset(args.kinematics_pose_csv, args.joint_states_csv)
    train_size = int(len(dataset) * args.train_val_ratio)
    train_dataset = Subset(dataset, list(range(0, train_size)))
    val_dataset = Subset(dataset, list(range(train_size, len(dataset))))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True)
    return train_loader, val_loader

def intra_class_split(self, ratio, shuffle):
    n_class = len(self.visible_classes)
    classes = [self.subset([i]) for i in range(n_class)]
    n_samples = [len(clss) for clss in classes]
    part_a, part_b = [], []
    for clss, size in zip(classes, n_samples):
        idx = list(range(size))
        if shuffle:
            random.shuffle(idx)
        thres = int(size * ratio)
        part_a.append(Subset(clss, idx[:thres]))
        part_b.append(Subset(clss, idx[thres:]))
    return (ConcatDatasetWithNewLabel([a for a in part_a]),
            ConcatDatasetWithNewLabel([b for b in part_b]))

def _split_data_to_k_fold_cv_subsets(dataset: Dataset, n_fold: int) -> List[CVPair]:
    n_validation_data = len(dataset) // n_fold
    perm = np.random.permutation(len(dataset))
    cv_subsets = []
    for i in range(n_fold):
        boolean_index = np.zeros(len(dataset)).astype(bool)
        p = perm[i * n_validation_data:(i + 1) * n_validation_data]
        boolean_index[p] = True
        train_subset = Subset(dataset, np.where(~boolean_index)[0])
        validation_subset = Subset(dataset, np.where(boolean_index)[0])
        cv_subsets.append(
            CVPair(train=train_subset, validation=validation_subset))
    return cv_subsets

def get_train_val_loaders(
    root_path: str,
    train_transforms: Callable,
    val_transforms: Callable,
    batch_size: int = 16,
    num_workers: int = 8,
    val_batch_size: Optional[int] = None,
    limit_train_num_samples: Optional[int] = None,
    limit_val_num_samples: Optional[int] = None,
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    train_ds = ImageNet(
        root_path, split="train",
        transform=lambda sample: train_transforms(image=sample)["image"],
        loader=opencv_loader
    )
    val_ds = ImageNet(
        root_path, split="val",
        transform=lambda sample: val_transforms(image=sample)["image"],
        loader=opencv_loader
    )

    if limit_train_num_samples is not None:
        np.random.seed(limit_train_num_samples)
        train_indices = np.random.permutation(len(train_ds))[:limit_train_num_samples]
        train_ds = Subset(train_ds, train_indices)

    if limit_val_num_samples is not None:
        np.random.seed(limit_val_num_samples)
        val_indices = np.random.permutation(len(val_ds))[:limit_val_num_samples]
        val_ds = Subset(val_ds, val_indices)

    # random samples for evaluation on training dataset
    if len(val_ds) < len(train_ds):
        np.random.seed(len(val_ds))
        train_eval_indices = np.random.permutation(len(train_ds))[:len(val_ds)]
        train_eval_ds = Subset(train_ds, train_eval_indices)
    else:
        train_eval_ds = train_ds

    train_loader = idist.auto_dataloader(
        train_ds, shuffle=True, batch_size=batch_size,
        num_workers=num_workers, drop_last=True,
    )

    val_batch_size = batch_size * 4 if val_batch_size is None else val_batch_size
    val_loader = idist.auto_dataloader(
        val_ds, shuffle=False, batch_size=val_batch_size,
        num_workers=num_workers, drop_last=False,
    )

    train_eval_loader = idist.auto_dataloader(
        train_eval_ds, shuffle=False, batch_size=val_batch_size,
        num_workers=num_workers, drop_last=False,
    )

    return train_loader, val_loader, train_eval_loader

def __init__(self, args, device):
    super(CifarSearcher, self).__init__(args)
    self.device = device

    train_indices = list(range(0, 45000))
    valid_indices = list(range(45000, 50000))

    # Handle data
    transformer_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transformer_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transformers = {'train': transformer_train, 'test': transformer_val}

    dataset_training = torchvision.datasets.CIFAR10(
        root=args.data_dir, train=True, download=True, transform=transformers['train'])
    dataset_validate = torchvision.datasets.CIFAR10(
        root=args.data_dir, train=True, download=True, transform=transformers['train'])

    train_subset = Subset(dataset_training, train_indices)
    valid_subset = Subset(dataset_validate, valid_indices)

    trainloader = torch.utils.data.DataLoader(train_subset, batch_size=args.batchsize,
                                              shuffle=True, num_workers=args.num_workers)
    devloader = torch.utils.data.DataLoader(valid_subset, batch_size=args.batchsize,
                                            shuffle=False, num_workers=args.num_workers)
    self.dataloaders = {'train': trainloader, 'dev': devloader}