def random_split(
    dataset: Dataset[T],
    lengths: Sequence[int],
    generator: Optional[Generator] = default_generator,
) -> List[Subset[T]]:
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Optionally fix the generator for reproducible results, e.g.:

    >>> random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))

    Args:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        generator (Generator): Generator used for the random permutation.
    """
    # Cannot verify that dataset is Sized
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = randperm(sum(lengths), generator=generator).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
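# A minimal standalone sketch (not part of the original source) showing how the
# _accumulate/zip pairing above carves a permutation into per-split index windows.
# itertools.accumulate is assumed to behave like torch._utils._accumulate here.
from itertools import accumulate

def _split_windows(indices, lengths):
    # Each offset is a running total, so indices[offset - length:offset]
    # is the contiguous window belonging to that split.
    return [indices[offset - length:offset]
            for offset, length in zip(accumulate(lengths), lengths)]

print(_split_windows(list(range(10)), [3, 7]))
# -> [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9]]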
def scatter(tensor, devices, chunk_sizes=None, dim=0, streams=None):
    """Scatters tensor across multiple GPUs.

    Arguments:
        tensor (Tensor): tensor to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensor should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``tensor.size(dim)``. If not specified, the tensor will be divided
            into equal chunks.
        dim (int, optional): A dimension along which to chunk the tensor.

    Returns:
        A tuple containing chunks of the ``tensor``, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [tensor.narrow(dim, start - size, size)
                  for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
    chunks = tuple(chunk.contiguous() for chunk in chunks)
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(chunk.cuda(device, non_blocking=True))
    return tuple(outputs)
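# Hypothetical usage sketch for the scatter helper above (not from the original
# source); it assumes at least two visible CUDA devices and that torch is imported.
import torch

if torch.cuda.device_count() >= 2:
    x = torch.arange(10.0).unsqueeze(1)                     # shape (10, 1)
    parts = scatter(x, devices=[0, 1], chunk_sizes=[6, 4], dim=0)
    # parts[0] lives on cuda:0 with 6 rows, parts[1] on cuda:1 with 4 rows.
    print([(p.device, tuple(p.shape)) for p in parts])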
def get_loader(self, force_update=False):
    settings = self.get_setting()
    if self.split_data:
        dset_sizes = [len(get_dataset(**s['data'])) for s in settings]
        assert len(set(dset_sizes)) == 1, \
            "all datasets should be same size"
        dset_size = dset_sizes[0]
        lengths = [int(prob * dset_size) for prob in self.probs]
        lengths[-1] = dset_size - sum(lengths[:-1])
        indices = torch.randperm(dset_size).tolist()
        indices_split = [
            indices[offset - length:offset]
            for offset, length in zip(_accumulate(lengths), lengths)
        ]
        loaders = [
            data_regime.get_loader(force_update=True,
                                   subset_indices=indices_split[i])
            for i, data_regime in enumerate(self.data_regime_list)
        ]
    else:
        loaders = [
            data_regime.get_loader(force_update=force_update)
            for data_regime in self.data_regime_list
        ]
    self._loader = SampledDataLoader(loaders)
    self._loader.epoch = self.epoch
    return self._loader
def dataset_split(dataset=None, lengths=None, indices=None):
    """
    Split a dataset into non-overlapping new datasets of given lengths.
    If indices is undefined, then a random permutation of the dataset is
    generated. Slight modification of torch.utils.data.random_split to gain
    access to the permuted indices.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        indices (tensor): permutation of instances

    Returns:
        indices (tensor): permutation of instances
        subsets (list of Subset): the splits built from that permutation
    """
    if sum(lengths) != len(dataset):
        raise ValueError('Sum of input lengths does not equal the length of '
                         'the input dataset!')

    # If a random split of the dataset was requested
    if indices is None:
        indices = randperm(sum(lengths))

    indices = indices.long()
    return indices, [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
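# Hypothetical usage sketch (not from the original source): reuse the returned
# permutation so a second, aligned dataset is split with exactly the same indices.
# `images` and `labels` are assumed to be two map-style datasets of equal length.
indices, (img_train, img_val) = dataset_split(images, lengths=[800, 200])
_, (lbl_train, lbl_val) = dataset_split(labels, lengths=[800, 200], indices=indices)
# img_train[i] and lbl_train[i] now refer to the same underlying example.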
def scatter(tensor, devices, chunk_sizes=None, dim=0, streams=None):
    """Scatters tensor across multiple GPUs.

    Arguments:
        tensor (Tensor): tensor to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensor should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``tensor.size(dim)``. If not specified, the tensor will be divided
            into equal chunks.
        dim (int, optional): A dimension along which to chunk the tensor.

    Returns:
        A tuple containing chunks of the ``tensor``, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [tensor.narrow(dim, start - size, size)
                  for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
    chunks = tuple(chunk.contiguous() for chunk in chunks)
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(chunk.cuda(device, non_blocking=True))
    return tuple(outputs)
def __init__(self, opt):
    """
    Modulate the data ratio in the batch.
    For example, when select_data is "MJ-ST" and batch_ratio is "0.5-0.5",
    50% of the batch is filled with MJ and the other 50% with ST.
    """
    print('-' * 80)
    print('dataset_root<', opt.train_data, '>')
    print('opt.select_data<', opt.select_data, '>')
    print('opt.batch_ratio<', opt.batch_ratio, '>')
    assert len(opt.select_data) == len(opt.batch_ratio)

    _AlignCollate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW,
                                 keep_ratio_with_pad=opt.PAD)
    self.data_loader_list = []
    self.dataloader_iter_list = []
    batch_size_list = []
    Total_batch_size = 0
    for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
        _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
        print('-' * 80)
        _dataset = hierarchical_dataset(root=opt.train_data, opt=opt,
                                        select_data=[selected_d])
        total_number_dataset = len(_dataset)

        """
        The total number of data can be modified with opt.total_data_usage_ratio.
        e.g. opt.total_data_usage_ratio = 1 indicates 100% usage, and 0.2 indicates
        20% usage. See section 4.2 in our paper.
        """
        number_dataset = int(total_number_dataset * float(opt.total_data_usage_ratio))
        dataset_split = [number_dataset, total_number_dataset - number_dataset]
        indices = range(total_number_dataset)
        _dataset, _ = [
            Subset(_dataset, indices[offset - length:offset])
            for offset, length in zip(_accumulate(dataset_split), dataset_split)
        ]

        batch_size_list.append(str(_batch_size))
        Total_batch_size += _batch_size

        _data_loader = torch.utils.data.DataLoader(
            _dataset, batch_size=_batch_size,
            shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=_AlignCollate, pin_memory=True)
        self.data_loader_list.append(_data_loader)
        self.dataloader_iter_list.append(iter(_data_loader))

    print('-' * 80)
    print('Total_batch_size: ', '+'.join(batch_size_list), '=', str(Total_batch_size))
    opt.batch_size = Total_batch_size
    print('-' * 80)
def __init__(self, opt):
    print('-' * 80)
    print(f'dataset_root: {opt.train_data}\n'
          f'opt.select_data: {opt.select_data}\n'
          f'opt.batch_ratio: {opt.batch_ratio}')
    assert len(opt.select_data) == len(opt.batch_ratio)

    _AlignCollate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW,
                                 keep_ratio_with_pad=opt.PAD)
    self.data_loader_list = []
    self.dataloader_iter_list = []
    batch_size_list = []
    Total_batch_size = 0
    for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
        _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
        print('-' * 80)
        _dataset = hierarchical_dataset(root=opt.train_data, opt=opt,
                                        select_data=[selected_d])
        total_number_dataset = len(_dataset)
        number_dataset = int(total_number_dataset * float(opt.total_data_usage_ratio))
        dataset_split = [number_dataset, total_number_dataset - number_dataset]
        indices = range(total_number_dataset)
        _dataset, _ = [
            Subset(_dataset, indices[offset - length:offset])
            for offset, length in zip(_accumulate(dataset_split), dataset_split)
        ]
        print(f'num total samples of {selected_d}: {total_number_dataset} '
              f'x {opt.total_data_usage_ratio} (total_data_usage_ratio) = {len(_dataset)}')
        print(f'num samples of {selected_d} per batch: {opt.batch_size} '
              f'x {float(batch_ratio_d)} (batch_ratio) = {_batch_size}')
        batch_size_list.append(str(_batch_size))
        Total_batch_size += _batch_size

        _data_loader = torch.utils.data.DataLoader(
            _dataset, batch_size=_batch_size,
            shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=_AlignCollate, pin_memory=True)
        self.data_loader_list.append(_data_loader)
        self.dataloader_iter_list.append(iter(_data_loader))

    print('-' * 80)
    print('Total_batch_size: ', '+'.join(batch_size_list), '=', str(Total_batch_size))
    opt.batch_size = Total_batch_size
    print('-' * 80)
def determ_split(dataset, lengths):
    """Deterministically split a dataset into consecutive Subsets of the given lengths."""
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    return [
        Subset(dataset, np.arange(offset - length, offset))
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def CxrRandomSplit(dataset, lengths):
    from torch._utils import _accumulate
    if sum(lengths) > len(dataset):
        raise ValueError(
            "Sum of input lengths must be less than or equal to the length of the input dataset!"
        )
    indices = torch.randperm(sum(lengths)).tolist()
    return [
        CxrSubset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def __init__(self, opt):
    """
    Modulate the data ratio in the batch.
    For example, when select_data is "MJ-ST" and batch_ratio is "0.5-0.5",
    50% of the batch is filled with MJ and the other 50% with ST.
    """
    print('-' * 80)
    print(f'dataset_root: {opt.train_data}\n'
          f'opt.select_data: {opt.select_data}\n'
          f'opt.batch_ratio: {opt.batch_ratio}')
    assert len(opt.select_data) == len(opt.batch_ratio)

    _AlignCollate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW)
    self.data_loader_list = []
    self.dataloader_iter_list = []
    for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
        _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
        print('-' * 80)
        _dataset = hierarchical_dataset(root=opt.train_data, opt=opt,
                                        select_data=[selected_d])
        total_number_dataset = len(_dataset)

        """
        The total number of data can be modified with opt.total_data_usage_ratio.
        e.g. opt.total_data_usage_ratio = 1 indicates 100% usage, and 0.2 indicates
        20% usage. See section 4.2 in our paper.
        """
        number_dataset = int(total_number_dataset * float(opt.total_data_usage_ratio))
        dataset_split = [number_dataset, total_number_dataset - number_dataset]
        indices = range(total_number_dataset)
        _dataset, _ = [
            Subset(_dataset, indices[offset - length:offset])
            for offset, length in zip(_accumulate(dataset_split), dataset_split)
        ]
        print(f'num total samples of {selected_d}: {total_number_dataset} '
              f'x {opt.total_data_usage_ratio} (total_data_usage_ratio) = {len(_dataset)}')
        print(f'num samples of {selected_d} per batch: {opt.batch_size} '
              f'x {float(batch_ratio_d)} (batch_ratio) = {_batch_size}')

        _data_loader = torch.utils.data.DataLoader(
            _dataset, batch_size=_batch_size,
            shuffle=True,
            num_workers=int(opt.workers),
            collate_fn=_AlignCollate, pin_memory=True)
        self.data_loader_list.append(_data_loader)
        self.dataloader_iter_list.append(iter(_data_loader))

    print('-' * 80)
def random_split_dataset(dataset, lengths, seed=0):
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    # The RandomState is re-seeded immediately, so `seed` fully determines the split.
    r = np.random.RandomState(1234)
    r.seed(seed)
    indices = r.permutation(sum(lengths)).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def split(self, lengths):
    """
    Split the dataset into non-overlapping new datasets of given lengths.

    Arguments:
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(self):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!")
    return [self.select_range(offset - length, offset)
            for offset, length in zip(_accumulate(lengths), lengths)]
def sequence_split(lengths: List[int]) -> List[List[int]]:
    """
    Sequential counterpart of torch.utils.data.dataset.random_split: splits the
    index range 0..sum(lengths) into consecutive chunks of the given lengths.

    Args:
        lengths: lengths of the splits to be produced.

    Returns:
        A list of index lists, one per split.
    """
    indices = torch.arange(0, sum(lengths)).tolist()
    return [indices[offset - length:offset]
            for offset, length in zip(_accumulate(lengths), lengths)]
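# Quick illustration (not from the original source) of the sequential split above:
# indices 0..4 are carved into consecutive chunks of lengths 3 and 2.
print(sequence_split([3, 2]))
# -> [[0, 1, 2], [3, 4]]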
def deterministic_split_dataset(ds, split_rate):
    """Split data consistently into train/val datasets."""
    size = len(ds)
    train_split = int(size * split_rate)
    val_split = size - train_split
    lengths = [train_split, val_split]
    # Sequential indices keep the split identical across runs.
    indices = list(range(size))
    return [
        Subset(ds, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def divide_datasets(dataset_list, eachusage):
    temp_datasets = []
    for idx, dl in enumerate(dataset_list):
        usage = eachusage[idx]
        total_num = len(dl)
        number_dataset = int(total_num * float(usage))
        dataset_split = [number_dataset, total_num - number_dataset]
        indices = range(total_num)
        dl, _ = [
            Subset(dl, indices[offset - length:offset])
            for offset, length in zip(_accumulate(dataset_split), dataset_split)
        ]
        temp_datasets.append(dl)
    return temp_datasets
def _create_module_dir(fullname):
    module, _, name = fullname.rpartition('.')
    if not module:
        target_dir = name
    else:
        target_dir = reduce(os.path.join, fullname.split('.'))
    try:
        os.makedirs(target_dir)
    except os.error:
        pass
    for dirname in _accumulate(fullname.split('.'), os.path.join):
        init_file = os.path.join(dirname, '__init__.py')
        open(init_file, 'a').close()  # Create file if it doesn't exist yet
    return name, target_dir
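# Illustrative sketch (not from the original source): _accumulate with
# os.path.join as the reducer yields every package level of a dotted module name,
# which is where the helper above drops an __init__.py.
import os
from itertools import accumulate  # assumed equivalent to torch._utils._accumulate

print(list(accumulate('pkg.sub.mod'.split('.'), os.path.join)))
# -> ['pkg', 'pkg/sub', 'pkg/sub/mod']   (separator depends on os.sep)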
def random_split(dataset, lengths):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) > len(dataset):
        raise ValueError("Sum of input lengths is greater than the length of the input dataset!")
    indices = randperm(sum(lengths)).tolist()
    return [torch.utils.data.Subset(dataset, indices[offset - length:offset])
            for offset, length in zip(_accumulate(lengths), lengths)]
def get_splits(dataset, split_portions=(0.8, 0.1, 0.1), seed=123):
    """Return a list of Subsets given a dataset.

    Use a fixed seed (and split portions) to ensure the splits are always the
    same across every run.
    """
    dataset_len = len(dataset)
    lengths = get_split_sizes(dataset_len, split_portions)  # Get the size of each split

    # Create a permutation (a list, since random.shuffle needs a mutable sequence)
    rand = random.Random(seed)
    perm = list(range(dataset_len))
    rand.shuffle(perm)  # Shuffle in place

    assert sum(lengths) == dataset_len

    return [Subset(dataset, perm[offset - length:offset])
            for offset, length in zip(_accumulate(lengths), lengths)]
def kfold_split(dataset, fold=5, seed=0):
    total_len = len(dataset)
    lengths = []
    for _ in range(fold - 1):
        lengths.append(int(total_len / fold))
    lengths.append(total_len - (fold - 1) * int(total_len / fold))
    r = np.random.RandomState(1234)
    r.seed(seed)
    indices = r.permutation(total_len).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
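# Hypothetical usage sketch (not from the original source): turn the k folds into
# a train/validation pair for fold i by concatenating the remaining folds.
# `dataset` is assumed to be an existing map-style dataset.
from torch.utils.data import ConcatDataset

folds = kfold_split(dataset, fold=5, seed=0)
for i in range(5):
    val_set = folds[i]
    train_set = ConcatDataset([f for j, f in enumerate(folds) if j != i])
    # ... train on train_set, evaluate on val_set ...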
def _create_module_dir(base_path, fullname):
    module, _, name = fullname.rpartition('.')
    if not module:
        target_dir = name
    else:
        target_dir = reduce(os.path.join, fullname.split('.'))
    target_dir = os.path.join(base_path, target_dir)
    try:
        os.makedirs(target_dir)
    except os.error:
        pass
    for dirname in _accumulate(fullname.split('.'), os.path.join):
        init_file = os.path.join(base_path, dirname, '__init__.py')
        open(init_file, 'a').close()  # Create file if it doesn't exist yet
    return name, target_dir
def scatter(tensor, devices, chunk_sizes=None, dim=0):
    "Scatters tensor across multiple GPUs"
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [
            tensor.narrow(dim, start - size, size)
            for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)
        ]
    # TODO: copy to a pinned buffer first (if copying from CPU)
    return tuple(
        chunk.cuda(gpu_id, non_blocking=chunk.is_contiguous())
        for gpu_id, chunk in zip(devices, chunks))
def random_split(self, lengths):
    """
    Randomly split the dataset into non-overlapping new datasets of given lengths.

    Arguments:
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(self):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    indices = torch.randperm(sum(lengths)).tolist()
    return [
        self.subset(indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def split_with_indices(dataset, lengths, indices):
    r"""
    Split a dataset into non-overlapping new datasets of given lengths, using
    the provided indices.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        indices (sequence): permutation of indices used to build the subsets
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ], indices
def random_split(dataset, lengths):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (iterable): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    indices = randperm(sum(lengths))
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def scatter_list(list_of_tensors, devices, chunk_sizes=None, streams=None):
    """Scatters a list of tensors across multiple GPUs.

    Arguments:
        list_of_tensors (list(Tensor)): list of tensors to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensors should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``len(list_of_tensors)``. If not specified, the list of tensors
            will be divided into equal chunks.

    Returns:
        A tuple containing chunks of the list of tensors, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = chunk_list(list_of_tensors, len(devices))
    else:
        assert sum(chunk_sizes) == len(list_of_tensors), "given chunk sizes " \
            "don't sum up to the list's length (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), len(list_of_tensors))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        # Each cumulative offset marks a chunk end, so the chunk is the slice
        # [end - size:end] of the list.
        chunks = [
            list_of_tensors[end - size:end]
            for end, size in zip(_accumulate(chunk_sizes), chunk_sizes)
        ]
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(
                Utils.move_tensor_list_to_device(chunk, device, non_blocking=True))
    return tuple(outputs)
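# Standalone check (not from the original source) of the list-chunking rule used
# above: cumulative offsets from accumulate mark chunk ends, so slicing
# [end - size:end] partitions the list without gaps or overlaps.
from itertools import accumulate

items = list('abcdefgh')
chunk_sizes = [3, 2, 3]
print([items[end - size:end]
       for end, size in zip(accumulate(chunk_sizes), chunk_sizes)])
# -> [['a', 'b', 'c'], ['d', 'e'], ['f', 'g', 'h']]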
def random_split_i(dataset, lengths):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Adapted from source, but uses SubsetI to return indices
    https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#random_split

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    indices = torch.randperm(sum(lengths)).tolist()
    return [
        SubsetI(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def load_data(self, IID=True):
    self.trainset = datasets.MNIST(
        self.path, train=True, download=True,
        transform=transforms.Compose([
            transforms.RandomRotation(15),
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))
    self.testset = datasets.MNIST(
        self.path, train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))

    total_clients = self.config.clients.total
    total_sample = self.trainset.data.shape[0]
    # number of samples on each client
    length = [total_sample // total_clients] * total_clients

    if IID:
        split_train = random_split(self.trainset, length)
    else:
        print("Non-IID")
        if sum(length) != len(self.trainset):
            raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
        # Group sample indices by label (10 MNIST classes).
        index = []
        for i in range(10):
            index.append([])
        i = 0
        for img, label in self.trainset:
            index[label].append(i)
            i += 1
        # Shuffle the label-sorted indices in shards of 200 samples each.
        indices = np.array([elem for c_list in index for elem in c_list]).reshape(-1, 200)
        np.random.shuffle(indices)
        indices = indices.flatten()
        print(indices.shape)
        split_train = [Subset(self.trainset, indices[offset - length:offset])
                       for offset, length in zip(_accumulate(length), length)]

    print(len(split_train))
    return split_train, self.testset
def sequential_split(dataset, lengths):
    r"""
    Sequentially split a dataset into non-overlapping new datasets of given lengths.

    >>> sequential_split(range(10), [3, 7])

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    indices = list(range(sum(lengths)))
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def random_split_before_transform(dataset, lengths, transforms):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths
    before transforming the input.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        transforms: transformation to apply to data
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    indices = randperm(sum(lengths)).tolist()
    return [
        TransformableSubset(dataset, indices[offset - length:offset], transform)
        for offset, length, transform in zip(
            _accumulate(lengths), lengths, transforms)
    ]
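# Hypothetical usage sketch (not from the original source): pass one transform per
# split so augmentation is applied to the training subset only. `full_dataset`,
# `train_tf`, and `eval_tf` are placeholders; TransformableSubset is assumed to
# apply its transform when items are fetched.
train_set, val_set = random_split_before_transform(
    full_dataset, lengths=[8000, 2000], transforms=[train_tf, eval_tf])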
def random_split(dataset, valid_split):
    r"""
    Randomly split a dataset into non-overlapping new datasets given a
    validation percentage.

    Arguments:
        dataset (Dataset): Dataset to be split
        valid_split (float): fraction of the data to reserve for validation
    """
    assert 0 <= valid_split < 1
    valid_len = int(len(dataset) * valid_split)
    if valid_len == 0:
        return dataset, None
    train_len = len(dataset) - valid_len
    lengths = [train_len, valid_len]
    indices = randperm(sum(lengths)).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
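# Hypothetical usage sketch (not from the original source). Note the early-return
# path: with valid_split small enough to round to zero samples, the function
# returns (dataset, None) instead of two Subsets. `dataset` is assumed defined.
train_set, val_set = random_split(dataset, valid_split=0.2)
if val_set is not None:
    print(len(train_set), len(val_set))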
def split_dataset(dataset, lengths=(80, 20), random=True):
    """
    Split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths (or proportions) of splits to be produced
    """
    if sum(lengths) != len(dataset):
        # lengths are proportions; rescale them to the dataset size
        # (rounding may leave the rescaled lengths off by a sample or two).
        mult = len(dataset) / sum(lengths)
        lengths = [round(l * mult) for l in lengths]
    if not random:
        indices = torch.arange(0, sum(lengths)).long()
    else:
        indices = torch.randperm(sum(lengths))
    return [
        torch.utils.data.Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def fixed_random_split(dataset, lengths):
    """Randomly split a dataset into non-overlapping new datasets of given lengths.

    The seed of this random function is always 42.

    Args:
        dataset (torch Dataset): dataset to be split.
        lengths (list(int)): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    torch.manual_seed(42)
    indices = randperm(sum(lengths))
    # Re-seed from the clock (truncated to an int) so later random ops are not
    # locked to seed 42.
    torch.manual_seed(int(datetime.datetime.now().timestamp()))
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
def random_split(dataset, lengths):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (iterable): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
    indices = randperm(sum(lengths))
    return [Subset(dataset, indices[offset - length:offset])
            for offset, length in zip(_accumulate(lengths), lengths)]
def backward(ctx, grad_output):
    # Split the upstream gradient back into one slice per concatenated input;
    # the leading None corresponds to the non-tensor argument (the concat dimension).
    return (None,) + tuple(
        grad_output.narrow(ctx.dim, end - size, size)
        for size, end in zip(ctx.input_sizes, _accumulate(ctx.input_sizes)))
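# Standalone illustration (not from the original source) of the narrow/_accumulate
# pattern used in the backward above: cumulative sizes mark slice ends, so
# narrow(dim, end - size, size) recovers each original input's gradient region.
import torch
from itertools import accumulate

input_sizes = [2, 3, 1]
grad_output = torch.arange(6.0)          # stand-in gradient of the concatenated output
grads = [grad_output.narrow(0, end - size, size)
         for size, end in zip(input_sizes, accumulate(input_sizes))]
print([g.tolist() for g in grads])
# -> [[0.0, 1.0], [2.0, 3.0, 4.0], [5.0]]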