def kfold_generator(args, splits, dataset):
    import torch.utils.data as data
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=splits, shuffle=True)
    for fold_index, (train_subset, test_subset) in enumerate(splitter.split(dataset)):
        train_sampler = data.SubsetRandomSampler(train_subset)
        test_sampler = data.SubsetRandomSampler(test_subset)
        # only the local `data` alias is imported here, so the loaders are
        # built through it rather than the unbound `torch.utils.data` path
        train_loader = data.DataLoader(dataset,
                                       sampler=train_sampler,
                                       batch_size=args.batch_size,
                                       shuffle=False,
                                       num_workers=args.num_workers,
                                       collate_fn=None)
        test_loader = data.DataLoader(dataset,
                                      sampler=test_sampler,
                                      batch_size=args.batch_size,
                                      shuffle=False,
                                      num_workers=args.num_workers,
                                      collate_fn=None)
        yield (fold_index, train_loader, len(train_subset),
               test_loader, len(test_subset))
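# A hedged usage sketch for kfold_generator, not part of the original source:
# the argparse.Namespace and the toy TensorDataset below are hypothetical
# stand-ins for the real `args` object and dataset.
import argparse
import torch
from torch.utils.data import TensorDataset

_toy_args = argparse.Namespace(batch_size=16, num_workers=0)
_toy_dataset = TensorDataset(torch.randn(100, 4), torch.randint(0, 2, (100,)))
for _fold, _train_loader, _n_train, _test_loader, _n_test in kfold_generator(
        _toy_args, splits=5, dataset=_toy_dataset):
    print("fold %d: %d train / %d test samples" % (_fold, _n_train, _n_test))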
def __init__(self, data_file, batch_size, test_split, shuffle_dataset,
             random_seed, validation_split=0):
    # Load tensor data
    data = torch.load(data_file)
    dataset = IndexTensorDataset(data['X'], data['y'])

    # Test / train split
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(test_split * dataset_size))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, test_indices = indices[split:], indices[:split]

    # Initialize DataLoaders
    train_sampler = data_utils.SubsetRandomSampler(train_indices)
    test_sampler = data_utils.SubsetRandomSampler(test_indices)
    self.train_loader = data_utils.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=train_sampler)
    self.test_loader = data_utils.DataLoader(dataset,
                                             batch_size=batch_size,
                                             sampler=test_sampler)
    self.isolates = data['isolates']
def execute_model(network_function, criterion, device, print_details=False):
    acc_list = []
    auc_list = []
    for _ in range(METRIC_COMPUTATION_ITER):
        network = network_function().to(device)
        optimiser = torch.optim.Adam(network.parameters(), lr=LEARNING_RATE)

        df = Dataset(PATH)
        end = len(df)
        indices = list(range(end))
        set_split = end - round(end * SET_RATIO)
        train_indices = indices[:set_split]
        test_indices = indices[set_split:]
        training_data = data.DataLoader(
            df,
            batch_size=TRAIN_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(train_indices))
        test_data = data.DataLoader(
            df,
            batch_size=TEST_BATCH_SIZE,
            sampler=data.SubsetRandomSampler(test_indices))
        training_data_batches = len(training_data)

        for epoch in range(EPOCH):
            running_loss = 0
            for i, batch in enumerate(training_data):
                inputs, labels = batch
                inputs, labels = inputs.to(device), labels.to(device)
                optimiser.zero_grad()
                outputs = network(inputs)
                loss = criterion(outputs, labels.type_as(outputs))
                loss.backward()
                optimiser.step()
                running_loss += loss.item()
                # print once per epoch, on the last batch
                if print_details and i == training_data_batches - 1:
                    print("Epoch : %2d, Loss : %.3f" % (epoch + 1, running_loss))

        evaluate_model(network, training_data, 'training data', device)
        acc_tmp, auc_tmp = evaluate_model(network, test_data, 'test data', device)
        acc_list.append(acc_tmp)
        auc_list.append(auc_tmp)
    write_metrics(acc_list, auc_list)
def load_torch_data(dataset: tdata.Dataset, ratio: float, bs: int):
    """Prepare data from a torch dataset for training and validation.

    Args:
        dataset (torch.utils.data.Dataset): loaded dataset
        ratio (float): split ratio
        bs (int): batch size

    Returns:
        A tuple of (training data loader, validation data loader,
        (training dataset size, validation dataset size)).
    """
    dataset_size = len(dataset)
    # prepare for shuffle
    indices = np.arange(dataset_size)
    np.random.shuffle(indices)
    split_idx = int(np.floor(ratio * dataset_size))
    train_indices, val_indices = indices[split_idx:], indices[:split_idx]

    # split dataset
    train_sampler = tdata.SubsetRandomSampler(train_indices)
    val_sampler = tdata.SubsetRandomSampler(val_indices)
    train_loader = tdata.DataLoader(dataset, batch_size=bs, sampler=train_sampler)
    val_loader = tdata.DataLoader(dataset, batch_size=bs, sampler=val_sampler)
    return train_loader, val_loader, (len(train_indices), len(val_indices))
def split_dataset(dataset, batch_size):
    data_size = len(dataset)
    validation_split = .2
    shuffle = True
    random_seed = 42

    indices = list(range(data_size))
    split = int(np.floor(validation_split * data_size))
    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_sample = data.SubsetRandomSampler(train_indices)
    validation_sample = data.SubsetRandomSampler(val_indices)
    train_loader = data.DataLoader(dataset,
                                   batch_size=batch_size,
                                   sampler=train_sample,
                                   num_workers=8)
    val_loader = data.DataLoader(dataset,
                                 batch_size=batch_size,
                                 sampler=validation_sample,
                                 num_workers=8)
    return train_loader, val_loader
def test_split(self):
    t0 = time()
    train_ids, test_ids = self.dataset.get_train_test_split()
    self.assertEqual(train_ids.size(0) + test_ids.size(0), len(self.dataset))

    self.dataset.transform = None
    train_loader = thd.DataLoader(self.dataset,
                                  batch_size=1,
                                  sampler=thd.SubsetRandomSampler(train_ids))
    test_loader = thd.DataLoader(self.dataset,
                                 batch_size=1,
                                 sampler=thd.SubsetRandomSampler(test_ids))
    loader_train_ids, _ = torch.tensor(
        [batch[2][0] for batch in train_loader]).sort()
    loader_test_ids, _ = torch.tensor(
        [batch[2][0] for batch in test_loader]).sort()

    self.assertEqual(
        loader_train_ids.eq(train_ids.sort()[0]).sum(), train_ids.size(0))
    self.assertEqual(
        loader_test_ids.eq(test_ids.sort()[0]).sum(), test_ids.size(0))

    # Check that test IDs do not leak into training
    self.assertEqual(
        np.intersect1d(loader_train_ids.numpy(), test_ids.numpy()).shape[0], 0)
    self.assertEqual(
        np.intersect1d(loader_train_ids.numpy(),
                       loader_test_ids.numpy()).shape[0], 0)
    self.assertEqual(
        np.intersect1d(loader_test_ids.numpy(), train_ids.numpy()).shape[0], 0)
    print("Split: %.2fs" % (time() - t0))
def train_test_split(self, test_size=0.3, shuffle=True, random_state=None):
    """Return samplers built from a stratified train/test index split."""
    d_i = np.arange(self.n)
    train_i, test_i = train_test_split(d_i,
                                       test_size=test_size,
                                       shuffle=shuffle,
                                       stratify=self.target,
                                       random_state=random_state)
    train_s = tud.SubsetRandomSampler(train_i)
    test_s = tud.SubsetRandomSampler(test_i)
    return train_s, test_s
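# A hedged, standalone sketch of the stratified-sampler pattern used above
# (not part of the original source; the labels here are toy stand-ins for the
# class's `target` array). The samplers plug straight into DataLoader(dataset,
# sampler=...) over the same dataset.
import numpy as np
import torch.utils.data as tud
from sklearn.model_selection import train_test_split

_targets = np.random.randint(0, 2, size=100)          # stand-in labels
_train_i, _test_i = train_test_split(np.arange(100), test_size=0.3,
                                     shuffle=True, stratify=_targets,
                                     random_state=0)
_train_sampler = tud.SubsetRandomSampler(_train_i)    # both splits keep the
_test_sampler = tud.SubsetRandomSampler(_test_i)      # original class balance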
def get_loader(root_folder, batch_size=16, shuffle=False, num_workers=0,
               pin_memory=False):
    """ Returns train and validation data loaders for the Caltech 101 dataset """
    cal101_dset = get_dataset(root_folder)

    # train / validation split
    split_ratio = 0.2
    dataset_size = len(cal101_dset)
    indices = np.arange(dataset_size)
    np.random.shuffle(indices)
    split = int(np.floor(split_ratio * dataset_size))
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = data.SubsetRandomSampler(train_indices)
    valid_sampler = data.SubsetRandomSampler(val_indices)
    # sampler and shuffle are mutually exclusive in DataLoader, and the
    # samplers already randomize, so shuffle is not forwarded here
    train_loader = data.DataLoader(cal101_dset,
                                   batch_size=batch_size,
                                   num_workers=num_workers,
                                   sampler=train_sampler,
                                   pin_memory=pin_memory)
    validation_loader = data.DataLoader(cal101_dset,
                                        batch_size=batch_size,
                                        num_workers=num_workers,
                                        sampler=valid_sampler,
                                        pin_memory=pin_memory)
    return train_loader, validation_loader
def mean_teacher(
        dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 64,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs,
):
    """
    Load the UrbanSound dataset for the student teacher framework.
    """
    assert supervised_ratio <= 1.0

    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")
    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root, audio_root, folds=all_folds,
                             verbose=verbose)

    # validation subset
    val_dataset = Dataset(manager, folds=val_folds, cached=True)
    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size,
                                       shuffle=True)

    # training subset
    train_dataset = Dataset(manager, folds=train_folds, cached=True)

    # Calc the size of the supervised and unsupervised batches
    s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)
    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))
    print("s_batch_size: ", s_batch_size)
    print("u_batch_size: ", u_batch_size)

    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)
    train_s_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=s_batch_size,
                                           sampler=sampler_s)
    train_u_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)
    train_loader = ZipCycle([train_s_loader, train_u_loader])

    return manager, train_loader, val_loader
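# `ZipCycle` is project-specific and not defined in these snippets. A minimal
# sketch of the assumed behavior (zip several loaders into combined batches,
# restarting exhausted loaders so one pass covers the longest one) might look
# like this; the name ZipCycleSketch marks it as an illustration, not the
# project's actual implementation.
class ZipCycleSketch:
    def __init__(self, iterables):
        self.iterables = iterables

    def __len__(self):
        # one epoch spans the longest loader
        return max(len(it) for it in self.iterables)

    def __iter__(self):
        iterators = [iter(it) for it in self.iterables]
        for _ in range(len(self)):
            batch = []
            for i, it in enumerate(iterators):
                try:
                    batch.append(next(it))
                except StopIteration:
                    # restart the shorter loader and keep going
                    iterators[i] = iter(self.iterables[i])
                    batch.append(next(iterators[i]))
            yield batch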
def dct(dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 100,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs):
    """
    Load the UrbanSound dataset for the Deep Co-Training system.
    """
    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")
    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root, audio_root, folds=all_folds,
                             verbose=verbose)

    # prepare the default datasets
    train_dataset = Dataset(manager, folds=train_folds, cached=True)
    val_dataset = Dataset(manager, folds=val_folds, cached=True)

    # split the training set into supervised and unsupervised sets
    s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)

    # Calc the size of the supervised and unsupervised batches
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)
    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    # create the samplers and loaders, then "zip" them
    sampler_s1 = torch_data.SubsetRandomSampler(s_idx)
    sampler_s2 = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)
    train_loader_s1 = torch_data.DataLoader(train_dataset,
                                            batch_size=s_batch_size,
                                            sampler=sampler_s1)
    train_loader_s2 = torch_data.DataLoader(train_dataset,
                                            batch_size=s_batch_size,
                                            sampler=sampler_s2)
    train_loader_u = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)
    train_loader = ZipCycle([train_loader_s1, train_loader_s2, train_loader_u])

    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size,
                                       shuffle=True)
    return manager, train_loader, val_loader
def mean_teacher(dataset_root,
                 supervised_ratio: float = 0.1,
                 batch_size: int = 128,
                 train_folds: tuple = (1, 2, 3, 4),
                 val_folds: tuple = (5, ),
                 train_transform: Module = None,
                 val_transform: Module = None,
                 **kwargs) -> Tuple[None, DataLoader, DataLoader]:
    """
    Load the cifar10 dataset for the student teacher framework.
    """
    # Recover extra common arguments
    num_workers = kwargs.get("num_workers", 0)
    pin_memory = kwargs.get("pin_memory", False)
    loader_args = dict(
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    dataset_path = os.path.join(dataset_root)

    # validation subset (`cls` is the dataset class, bound elsewhere in the module)
    val_dataset = cls(root=dataset_path, folds=val_folds, download=True,
                      transform=val_transform)
    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size,
                                       shuffle=True, **loader_args)

    # Training subset
    train_dataset = cls(root=dataset_path, folds=train_folds, download=True,
                        transform=train_transform)
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio,
                              nb_class=train_dataset.nb_class)

    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)
    train_s_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=s_batch_size,
                                           sampler=sampler_s)
    train_u_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)
    train_loader = ZipCycle([train_s_loader, train_u_loader])

    return None, train_loader, val_loader
def load_cifar10(root_dir=None, batch_size=20, shuffle=True, transform=None,
                 download=True):
    dataset_type = "continuous"
    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'
    root_dir = str(root_dir)
    if transform is None:
        transform = transforms.ToTensor()

    train_dataset = datasets.CIFAR10(root_dir, transform=transform,
                                     download=download)
    size_train = len(train_dataset)
    indices = list(range(size_train))
    split = int(np.floor(0.2 * size_train))
    if split % batch_size != 0:
        raise ValueError(
            f'The batch size: {batch_size} does not divide the size of '
            f'the validation_dataset: {split}')
    if shuffle:
        np.random.shuffle(indices)
    train_idx, valid_idx = indices[split:], indices[:split]

    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    valid_sampler = data_utils.SubsetRandomSampler(valid_idx)
    # sampler and shuffle are mutually exclusive in DataLoader; the samplers
    # already randomize, so shuffle is only applied to the test loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=valid_sampler)
    # note: the test split comes from MNIST, not CIFAR10
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(root_dir, train=False, transform=transform),
        batch_size=batch_size,
        shuffle=shuffle)
    return train_loader, test_loader, valid_loader, dataset_type
def get_preprocessed_data(self):
    dataset = self.get_dataset()
    train_indices, val_indices = self.get_split(dataset)
    train_sampler = data.SubsetRandomSampler(train_indices)
    valid_sampler = data.SubsetRandomSampler(val_indices)
    # batch_size is assumed to be set on the instance (e.g. in __init__)
    train_loader = data.DataLoader(dataset,
                                   batch_size=self.batch_size,
                                   sampler=train_sampler)
    validation_loader = data.DataLoader(dataset,
                                        batch_size=self.batch_size,
                                        sampler=valid_sampler)
    return train_loader, validation_loader
def loadDataForLocal(want_to_test):
    training_dataset = NSynth(
        "./nsynth-test",
        transform=toFloat,
        blacklist_pattern=["synth_lead"],  # blacklist string instrument
        categorical_field_list=["instrument_family", "instrument_source"])

    # Split the dataset into training, validation and testing subsets
    num_train = len(training_dataset)
    indices = list(range(num_train))
    splitVal = int(np.floor(VALIDATION_SPLIT * num_train))
    splitTest = int(np.floor(TESTING_SPLIT * num_train)) + splitVal

    # Make sure you get the same numbers every time when rand_seed = 0
    np.random.seed(seed=RAND_SEED)
    # Shuffle the indices
    np.random.shuffle(indices)

    # Get the validation, test and training index ranges; the three slices
    # must be disjoint so that the subsets do not overlap
    validation_idx, test_idx, train_idx = indices[:splitVal], \
                                          indices[splitVal:splitTest], \
                                          indices[splitTest:]

    # create samplers
    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    test_sampler = data_utils.SubsetRandomSampler(test_idx)
    validation_sampler = data_utils.SubsetRandomSampler(validation_idx)

    # create dataLoaders
    train_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                               batch_size=BATCH_SIZE,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                                    batch_size=1,
                                                    sampler=validation_sampler)
    test_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                              batch_size=1,
                                              sampler=test_sampler)
    if want_to_test == '1':
        test_loader = torch.utils.data.DataLoader(dataset=training_dataset,
                                                  batch_size=1)

    print('Finished preparing data loaders for local testing')
    return train_loader, validation_loader, test_loader
def split_data(data, validation_split=0.1, batch_size=100):
    dataset_size = len(data)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    random_seed = 37
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_loader = utils.DataLoader(
        data,
        batch_size=batch_size,
        sampler=utils.SubsetRandomSampler(train_indices))
    validation_loader = utils.DataLoader(
        data,
        batch_size=1,
        sampler=utils.SubsetRandomSampler(val_indices))
    return train_loader, validation_loader
def get_loaders(dataset, config, logger, n=1):
    if n == 2:
        t0 = time.time()
        train_indices = dataset.get_train_indices(p=config.PAIR_SPLIT_P)
        logger.info("Indices created at %.2fs" % (time.time() - t0))
        loaders = {
            "train": thd.DataLoader(d.PairExtension(dataset),
                                    batch_size=config.BATCH_SIZE,
                                    sampler=thd.SubsetRandomSampler(train_indices),
                                    collate_fn=d.pair_collate,
                                    num_workers=config.NW),
        }
    elif n == 3:
        train_indices, test_indices = dataset.get_train_test_split()
        if dataset.transform is None:
            collate_fn = d.triple_collate
        else:
            collate_fn = d.triple_collate_pil
        loaders = {
            "train": thd.DataLoader(d.TripletExtension(dataset),
                                    batch_size=config.BATCH_SIZE,
                                    sampler=thd.SubsetRandomSampler(train_indices),
                                    collate_fn=collate_fn,
                                    num_workers=config.NW),
        }
    else:
        train_indices, test_indices = dataset.get_train_test_split()
        if dataset.transform is None:
            collate_fn = d.fast_collate
        else:
            collate_fn = d.fast_collate_pil
        loaders = {
            "train": thd.DataLoader(dataset,
                                    batch_size=config.EVAL_BATCH_SIZE,
                                    sampler=thd.SubsetRandomSampler(train_indices),
                                    collate_fn=collate_fn,
                                    num_workers=config.NW),
            "test": thd.DataLoader(dataset,
                                   batch_size=config.EVAL_BATCH_SIZE,
                                   sampler=thd.SubsetRandomSampler(test_indices),
                                   collate_fn=collate_fn,
                                   num_workers=config.NW)
        }
    return loaders
def create_dataloader_train_cv(
        kaldi_string,
        caption_json_path,
        vocab_path,
        transform=None,
        shuffle=True,
        batch_size: int = 16,
        num_workers=1,
        percent=90,
):
    dataset = SJTUDataLoader(kaldi_string=kaldi_string,
                             caption_json_path=caption_json_path,
                             vocab_path=vocab_path,
                             transform=transform)
    # Take the first `percent` percent of the data for training; note that the
    # `shuffle` argument is accepted but unused, since the train sampler
    # shuffles regardless and the CV sampler is intentionally sequential
    all_indices = torch.arange(len(dataset))
    num_train_indices = int(len(all_indices) * percent / 100)
    train_indices = all_indices[:num_train_indices]
    cv_indices = all_indices[num_train_indices:]
    trainsampler = data.SubsetRandomSampler(train_indices)
    # Do not shuffle the cross-validation subset
    cvsampler = SubsetSampler(cv_indices)
    return (data.DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=num_workers,
                            collate_fn=collate_fn,
                            sampler=trainsampler),
            data.DataLoader(dataset,
                            batch_size=batch_size,
                            num_workers=num_workers,
                            collate_fn=collate_fn,
                            sampler=cvsampler))
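# `SubsetSampler` above is project-specific: by its use it is assumed to be a
# sequential, non-random counterpart to SubsetRandomSampler. A minimal sketch
# of that assumed behavior (the Sketch suffix marks it as an illustration):
import torch.utils.data as data

class SubsetSamplerSketch(data.Sampler):
    """Yields the given indices in their original order, without shuffling."""

    def __init__(self, indices):
        self.indices = indices

    def __iter__(self):
        return iter(self.indices)

    def __len__(self):
        return len(self.indices)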
def get_data_and_train_model(file_path, network, criterion, optimiser, device,
                             print_details):
    df = Dataset(file_path)
    end = len(df)
    indices = list(range(end))
    set_split = end - round(end * SET_RATIO)
    train_indices = indices[:set_split]
    test_indices = indices[set_split:]
    training_data = data.DataLoader(
        df,
        batch_size=TRAIN_BATCH_SIZE,
        sampler=data.SubsetRandomSampler(train_indices))
    test_data = data.DataLoader(
        df,
        batch_size=TEST_BATCH_SIZE,
        sampler=data.SubsetRandomSampler(test_indices))
    training_data_batches = len(training_data)

    for epoch in range(EPOCH):
        running_loss = 0
        for i, batch in enumerate(training_data):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            optimiser.zero_grad()
            outputs = network(inputs)
            loss = criterion(outputs, labels.type_as(outputs))
            loss.backward()
            optimiser.step()
            running_loss += loss.item()
            # print once per epoch, on the last batch
            if print_details and i == training_data_batches - 1:
                print("Epoch : %2d, Loss : %.3f" % (epoch + 1, running_loss))
    return training_data, test_data
def set_training_data(self, job, train_inds, test_inds, labels, data):
    """Construct generators out of the dataset for training, validation,
    and expectation maximization.

    Parameters
    ----------
    job : dict
        See training_dict.tx for all keys.
    train_inds : np.ndarray
        Indices in data that are to be trained on
    test_inds : np.ndarray
        Indices in data that are to be validated on
    labels : np.ndarray
        Classification labels used for training
    data : np.ndarray, shape=(n_frames, 3*n_atoms) OR str to path
        All data
    """
    batch_size = job['batch_size']
    cpu_cores = job['em_n_cores']
    test_batch_size = job['test_batch_size']
    em_batch_size = job['em_batch_size']
    subsample = job['subsample']
    data_dir = job["data_dir"]

    # Subsample the training indices and draw them in random order
    n_train_inds = len(train_inds)
    random_inds = np.random.choice(np.arange(n_train_inds),
                                   int(n_train_inds / subsample),
                                   replace=False)
    sampler = torch_data.SubsetRandomSampler(random_inds)
    params_t = {'batch_size': batch_size,
                'shuffle': False,
                'num_workers': cpu_cores,
                'sampler': sampler}
    params_v = {'batch_size': test_batch_size,
                'shuffle': True,
                'num_workers': cpu_cores}
    params_e = {'batch_size': em_batch_size,
                'shuffle': True,
                'num_workers': cpu_cores}

    n_snapshots = len(train_inds) + len(test_inds)

    training_set = Dataset(train_inds, labels, data)
    training_generator = torch_data.DataLoader(training_set, **params_t)
    validation_set = Dataset(test_inds, labels, data)
    validation_generator = torch_data.DataLoader(validation_set, **params_v)
    em_set = Dataset(train_inds, labels, data)
    em_generator = torch_data.DataLoader(em_set, **params_e)
    return training_generator, validation_generator, em_generator
def load_bedrooms(root_dir=None, batch_size=20, shuffle=True, transform=None):
    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'
    root_dir = str(root_dir)
    if transform is None:
        transform = transforms.ToTensor()

    # note: despite the function name, this loads the LSUN classroom split
    train_dataset = datasets.LSUN(root_dir, classes=['classroom_train'],
                                  transform=transform)
    size_train = len(train_dataset)
    indices = list(range(size_train))
    split = int(np.floor(0.2 * size_train))
    if shuffle:
        np.random.shuffle(indices)
    train_idx, valid_idx = indices[split:], indices[:split]

    train_sampler = data_utils.SubsetRandomSampler(train_idx)
    valid_sampler = data_utils.SubsetRandomSampler(valid_idx)
    # sampler and shuffle are mutually exclusive in DataLoader; the samplers
    # already randomize, so shuffle is only applied to the test loader
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=valid_sampler)
    # note: the test split comes from MNIST, not LSUN
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(root_dir, train=False, transform=transform),
        batch_size=batch_size,
        shuffle=shuffle)
    return train_loader, test_loader, valid_loader
def load_CelebA(root_dir=None, batch_size=50, shuffle=True):
    dataset_type = "continuous"
    drop_to_make_batch_size_work = 99
    if root_dir is None:
        root_dir = pathlib.Path(sys.argv[0]).parents[0] / 'datasets'
    dataset = np.load(root_dir / 'CelebA' / 'celebA.npy')

    idx = np.arange(len(dataset))
    if shuffle:
        np.random.shuffle(idx)
    idx = idx[:-drop_to_make_batch_size_work]

    train_size = int(len(idx) * 0.1)
    idx_train, idx_test = idx[:train_size], idx[train_size:]
    # carve the validation set out of the training indices; slicing `idx`
    # here instead of `idx_train` would leak test samples into training
    val_size = int(len(idx_train) * 0.1)
    idx_train, idx_valid = idx_train[:-val_size], idx_train[-val_size:]

    sampler_train = data_utils.SubsetRandomSampler(idx_train)
    sampler_test = data_utils.SubsetRandomSampler(idx_test)
    sampler_valid = data_utils.SubsetRandomSampler(idx_valid)
    # sampler and shuffle are mutually exclusive in DataLoader, so shuffle is
    # not forwarded alongside the samplers
    loader_train = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=sampler_train)
    loader_valid = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=sampler_valid)
    loader_test = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=sampler_test)
    return loader_train, loader_valid, loader_test, dataset_type
def mean_teacher(
        dataset_root,
        supervised_ratio: float = 0.1,
        batch_size: int = 128,
        train_transform: Module = None,
        val_transform: Module = None,
        **kwargs) -> Tuple[None, DataLoader, DataLoader]:
    """
    Load the SpeechCommands dataset for student teacher learning.
    """
    loader_args = dict(
        num_workers=kwargs.get("num_workers", 0),
        pin_memory=kwargs.get("pin_memory", False),
    )
    dataset_path = os.path.join(dataset_root)

    # validation subset
    val_dataset = SpeechCommands(root=dataset_path, subset="validation",
                                 transform=val_transform, download=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                            **loader_args)

    # Training subset
    train_dataset = SpeechCommands(root=dataset_path, subset="train",
                                   transform=train_transform, download=True)
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)

    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)
    s_batch_size = int(np.floor(batch_size * supervised_ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - supervised_ratio)))

    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)
    train_s_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=s_batch_size,
                                           sampler=sampler_s)
    train_u_loader = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)
    train_loader = ZipCycle([train_s_loader, train_u_loader])

    return None, train_loader, val_loader
def create_data_loader(config, idxs, shuffle=True):
    if shuffle:
        # draw the given indices in random order from the full dataset
        sampler = TD.SubsetRandomSampler(idxs)
        dataset = config.dataset
    else:
        # walk a Subset deterministically; SequentialSampler yields
        # 0..len(idxs)-1, which the Subset maps back onto idxs
        sampler = TD.SequentialSampler(idxs)
        dataset = TD.Subset(config.dataset, idxs)
    return TD.DataLoader(
        dataset,
        batch_size=config.batch_size,
        sampler=sampler,
        pin_memory=True,
        num_workers=config.data_loader_num_workers
    )
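# Hedged usage sketch for create_data_loader (not in the original source): the
# SimpleNamespace below is a hypothetical stand-in for the real config object.
# With shuffle=False the loader visits items 80..99 in order; with
# shuffle=True it draws the same items in random order.
import types
import torch
import torch.utils.data as TD

_config = types.SimpleNamespace(
    dataset=TD.TensorDataset(torch.arange(100.0)),
    batch_size=10,
    data_loader_num_workers=0,
)
_val_loader = create_data_loader(_config, idxs=list(range(80, 100)),
                                 shuffle=False)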
def student_teacher(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 128,
        train_transform: list = [],
        val_transform: list = [],
        **kwargs
):
    """
    Load the cifar10 dataset for the student teacher framework.
    """
    # Prepare the default datasets
    train_dataset = torchvision.datasets.CIFAR10(
        root=os.path.join(dataset_root, "CIFAR10"),
        train=True, download=True, transform=train_transform)
    val_dataset = torchvision.datasets.CIFAR10(
        root=os.path.join(dataset_root, "CIFAR10"),
        train=False, download=True, transform=val_transform)

    # Split the training dataset into supervised and unsupervised sets
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)

    # Calc the size of the supervised and unsupervised batches; note this
    # assumes u_idx is non-empty (with supervised_ratio == 1.0 the ratio
    # below would divide by zero)
    nb_s_file = len(s_idx)
    nb_u_file = len(u_idx)
    ratio = nb_s_file / nb_u_file
    s_batch_size = int(np.floor(batch_size * ratio))
    u_batch_size = int(np.ceil(batch_size * (1 - ratio)))

    # Create the samplers, the loaders, and zip them
    sampler_s = torch_data.SubsetRandomSampler(s_idx)
    sampler_u = torch_data.SubsetRandomSampler(u_idx)
    train_loader_s = torch_data.DataLoader(train_dataset,
                                           batch_size=s_batch_size,
                                           sampler=sampler_s)
    train_loader_u = torch_data.DataLoader(train_dataset,
                                           batch_size=u_batch_size,
                                           sampler=sampler_u)
    train_loader = ZipCycle([train_loader_s, train_loader_u])

    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size,
                                       shuffle=True)
    return None, train_loader, val_loader
def cross_validation_split(data_set, sample_size, val_split, batch_size=32,
                           shuffle=True):
    # batch_size is assumed to be passed in here; it is not defined anywhere
    # else in this snippet
    random_seed = 42
    indices = list(range(sample_size))
    split = int(np.floor(val_split * sample_size))
    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, valid_indices = indices[split:], indices[:split]

    train_sampler = data.SubsetRandomSampler(train_indices)
    valid_sampler = data.SubsetRandomSampler(valid_indices)
    t_loader = data.DataLoader(data_set,
                               batch_size=batch_size,
                               sampler=train_sampler)
    v_loader = data.DataLoader(data_set,
                               batch_size=batch_size,
                               sampler=valid_sampler)
    return t_loader, v_loader
def load_data(cifar=True):
    """ Load the test dataset and DataLoader for SVHN or CIFAR10 """
    transform_test = transforms.Compose([transforms.ToTensor()])
    if cifar:
        testset = dset.CIFAR10(root='./data/cifar-10-batches-py/',
                               train=False,
                               download=True,
                               transform=transform_test)
    else:
        testset = dset.SVHN(root='../data/',
                            split='test',
                            download=True,
                            transform=transform_test)
    # only evaluate on the last 1500 test samples (indices 8500-9999)
    testloader = data.DataLoader(
        testset,
        batch_size=64,
        shuffle=False,
        num_workers=2,
        sampler=data.SubsetRandomSampler(range(8500, 10000)))
    return testloader, testset
def supervised(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 64,
        train_folds: tuple = (1, 2, 3, 4, 5, 6, 7, 8, 9),
        val_folds: tuple = (10, ),
        verbose=1,
        **kwargs,
):
    """
    Load the UrbanSound dataset for supervised systems.
    """
    audio_root = os.path.join(dataset_root, "UrbanSound8K", "audio")
    metadata_root = os.path.join(dataset_root, "UrbanSound8K", "metadata")
    all_folds = train_folds + val_folds

    # Create the dataset manager
    manager = DatasetManager(metadata_root, audio_root, folds=all_folds,
                             verbose=verbose)

    # validation subset
    val_dataset = Dataset(manager, folds=val_folds, cached=True)
    val_loader = torch_data.DataLoader(val_dataset, batch_size=batch_size,
                                       shuffle=True)

    # training subset
    train_dataset = Dataset(manager, folds=train_folds, cached=True)
    if supervised_ratio == 1.0:
        train_loader = torch_data.DataLoader(train_dataset,
                                             batch_size=batch_size,
                                             shuffle=True)
    else:
        s_idx, u_idx = train_dataset.split_s_u(supervised_ratio)
        # The train loader uses only the supervised indices
        sampler_s = torch_data.SubsetRandomSampler(s_idx)
        train_loader = torch_data.DataLoader(train_dataset,
                                             batch_size=batch_size,
                                             sampler=sampler_s)

    return manager, train_loader, val_loader
def _split_sampler(self, split):
    if split == 0.0:
        return None, None

    idx_full = np.arange(self.n_samples)
    np.random.seed(self.seed)
    np.random.shuffle(idx_full)

    if isinstance(split, int):
        assert split > 0
        assert split < self.n_samples, \
            "validation set size is configured to be larger than entire dataset."
        len_valid = split
    else:
        len_valid = int(self.n_samples * split)

    # If a validation file (valtest) was specified, use it as the validation
    # set; otherwise split the validation set out of the training data
    if self.val_file:
        valid_idx = self.valid_idx
        train_idx = np.array([idx for idx in idx_full if idx not in valid_idx])
    else:
        valid_idx = idx_full[0:len_valid]
        train_idx = np.delete(idx_full, np.arange(0, len_valid))

    # Per-class weights for the training sampler; validation indices get
    # weight 0 so they are never drawn during training
    weights_per_class = 1. / torch.tensor(self.emotion_nums, dtype=torch.float)
    weights = [0] * self.n_samples
    for idx in range(self.n_samples):
        if idx in valid_idx:
            weights[idx] = 0.
        else:
            label = self.dataset[idx][0]
            weights[idx] = weights_per_class[label]
    weights = torch.tensor(weights)

    train_sampler = data.WeightedRandomSampler(weights=weights,
                                               num_samples=len(weights),
                                               replacement=True)
    valid_sampler = data.SubsetRandomSampler(valid_idx)

    # turn off the shuffle option, which is mutually exclusive with sampler
    self.shuffle = False
    self.n_samples = len(train_idx)

    return train_sampler, valid_sampler
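# A hedged, standalone sketch of the weighting idea above (not part of the
# original source; the labels and the reserved index are toy stand-ins):
# rarer classes get proportionally larger per-sample weights, and indices
# reserved for validation get weight 0 so the train sampler never draws them.
import torch
import torch.utils.data as data

_labels = torch.tensor([0, 0, 0, 1, 1, 2])            # class 0 is most common
_class_counts = torch.bincount(_labels).float()        # -> [3., 2., 1.]
_weights = 1.0 / _class_counts[_labels]                # per-sample weights
_weights[5] = 0.0                                      # pretend index 5 is validation-only
_balanced_sampler = data.WeightedRandomSampler(weights=_weights,
                                               num_samples=len(_weights),
                                               replacement=True)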
def setup_data(self):
    # Initialize trainset
    dataroot = self.opts.dataroot
    _trainset = dset.Dataset(dataroot, 'train', pairs='annotated')
    self.trainloader = dat.DataLoader(_trainset,
                                      batch_size=self.opts.batch_size,
                                      shuffle=True,
                                      num_workers=4)

    # Use a subset of the train data
    if self.opts.train_size:
        # if --N: override the __len__ method of the dataset so that only
        # the first N items will be used
        def train_size(unused):
            return self.opts.train_size
        _trainset.__class__.__len__ = train_size

    # Initialize testset
    if self.opts.do_validation:  # Default True
        _testset = dset.Dataset(dataroot, 'test', pairs='annotated')
        if self.opts.split_zeroshot:
            # Split testset into seen and zeroshot sets
            test_sets = zeroshot.Splitter(_trainset, _testset).split()
            self.testloaders = [
                dat.DataLoader(data, batch_size=len(data),
                               num_workers=NUM_WORKERS)
                for data in test_sets
            ]
        else:
            # Use a single (unified) testset
            testdata = dat.DataLoader(_testset,
                                      batch_size=len(_testset),
                                      num_workers=NUM_WORKERS)
            self.testloaders = [testdata]
        if self.opts.val:
            # Use only the first x percent of the primary testset as
            # validation (and don't use the rest at this time)
            dataset = self.testloaders[0].dataset
            n = int(len(dataset) * self.opts.val)
            sampler = dat.SubsetRandomSampler(torch.arange(n))
            self.testloaders[0] = dat.DataLoader(dataset,
                                                 batch_size=n,
                                                 sampler=sampler,
                                                 num_workers=NUM_WORKERS)
    else:  # if --noval
        self.testloaders = []
def load_supervised(
        dataset_root,
        supervised_ratio: float = 1.0,
        batch_size: int = 128,
        train_transform: list = [],
        val_transform: list = [],
        **kwargs
):
    """
    Load the cifar10 dataset for supervised systems.
    """
    train_dataset = torchvision.datasets.CIFAR10(
        root=os.path.join(dataset_root, "CIFAR10"),
        train=True, download=True, transform=train_transform)
    val_dataset = torchvision.datasets.CIFAR10(
        root=os.path.join(dataset_root, "CIFAR10"),
        train=False, download=True, transform=val_transform)

    # Split the training dataset into supervised and unsupervised sets;
    # only the supervised indices feed the train loader
    s_idx, u_idx = _split_s_u(train_dataset, supervised_ratio)

    sampler_s1 = torch_data.SubsetRandomSampler(s_idx)
    train_loader = torch_data.DataLoader(train_dataset,
                                         batch_size=batch_size,
                                         sampler=sampler_s1,
                                         num_workers=4,
                                         pin_memory=True)
    val_loader = torch_data.DataLoader(val_dataset,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=4,
                                       pin_memory=True)
    return None, train_loader, val_loader