Example #1
def random_split(
        dataset: Dataset[T],
        lengths: Sequence[int],
        generator: Optional[Generator] = default_generator) -> List[Subset[T]]:
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.
    Optionally fix the generator for reproducible results, e.g.:

    >>> random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))

    Args:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        generator (Generator): Generator used for the random permutation.
    """
    # Cannot verify that dataset is Sized
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = randperm(sum(lengths), generator=generator).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
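A minimal usage sketch for the reproducible split shown in the docstring above, using the public torch.utils.data.random_split (the fixed generator makes the 3/7 split identical across runs):

import torch
from torch.utils.data import random_split

# Reproducible 3/7 split of a 10-element dataset (the docstring's own example).
train, val = random_split(range(10), [3, 7],
                          generator=torch.Generator().manual_seed(42))
print(list(train))  # the same 3 indices on every run
print(list(val))    # the remaining 7 indices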
Example #2
def scatter(tensor, devices, chunk_sizes=None, dim=0, streams=None):
    """Scatters tensor across multiple GPUs.

    Arguments:
        tensor (Tensor): tensor to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensor should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``tensor.size(dim)``. If not specified, the tensor will be divided
            into equal chunks.
        dim (int, optional): A dimension along which to chunk the tensor.

    Returns:
        A tuple containing chunks of the ``tensor``, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [tensor.narrow(dim, start - size, size)
                  for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
    chunks = tuple(chunk.contiguous() for chunk in chunks)
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(chunk.cuda(device, non_blocking=True))
    return tuple(outputs)
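For reference, a hedged usage sketch of the equivalent public API, torch.cuda.comm.scatter; the call is guarded because it needs at least two CUDA devices:

import torch
import torch.cuda.comm as comm

if torch.cuda.device_count() >= 2:
    t = torch.arange(10).reshape(5, 2)
    # 5 rows split as 3 + 2 along dim 0, one chunk per device.
    chunks = comm.scatter(t, devices=[0, 1], chunk_sizes=[3, 2], dim=0)
    print([(c.device, tuple(c.shape)) for c in chunks])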
Example #3
    def get_loader(self, force_update=False):
        settings = self.get_setting()
        if self.split_data:
            dset_sizes = [len(get_dataset(**s['data'])) for s in settings]
            assert len(set(dset_sizes)) == 1, \
                "all datasets should be same size"
            dset_size = dset_sizes[0]
            lengths = [int(prob * dset_size) for prob in self.probs]
            lengths[-1] = dset_size - sum(lengths[:-1])
            indices = torch.randperm(dset_size).tolist()
            indices_split = [
                indices[offset - length:offset]
                for offset, length in zip(_accumulate(lengths), lengths)
            ]
            loaders = [
                data_regime.get_loader(force_update=True,
                                       subset_indices=indices_split[i])
                for i, data_regime in enumerate(self.data_regime_list)
            ]
        else:
            loaders = [
                data_regime.get_loader(force_update=force_update)
                for data_regime in self.data_regime_list
            ]
        self._loader = SampledDataLoader(loaders)
        self._loader.epoch = self.epoch

        return self._loader
Example #4
def dataset_split(dataset=None, lengths=None, indices=None):
    """
    Split a dataset into non-overlapping new datasets of given lengths.
    If indices is undefined, then a random permutation of dataset
    is generated. Slight modification of torch.utils.data.random_split
    to gain access to permuted indices.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        indices (tensor): permutations of instances

    Returns:
        indices (tensor): permutations of instances

    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            'Sum of input lengths does not equal the length of the input dataset!')

    # If requested a random split of dataset
    if indices is None:
        indices = randperm(sum(lengths))

    indices = indices.long()

    return indices, [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
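The slicing idiom shared by these examples is easy to check in isolation; a minimal, self-contained sketch with itertools.accumulate standing in for torch._utils._accumulate:

from itertools import accumulate  # stand-in for torch._utils._accumulate

lengths = [3, 7]
indices = list(range(sum(lengths)))
# accumulate yields running totals (3, 10), so each slice ends at `offset`
# and starts `length` items earlier: consecutive, non-overlapping blocks.
splits = [indices[offset - length:offset]
          for offset, length in zip(accumulate(lengths), lengths)]
print(splits)  # [[0, 1, 2], [3, 4, 5, 6, 7, 8, 9]]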
Example #5
def scatter(tensor, devices, chunk_sizes=None, dim=0, streams=None):
    """Scatters tensor across multiple GPUs.

    Arguments:
        tensor (Tensor): tensor to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensor should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``tensor.size(dim)``. If not specified, the tensor will be divided
            into equal chunks.
        dim (int, optional): A dimension along which to chunk the tensor.

    Returns:
        A tuple containing chunks of the ``tensor``, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [tensor.narrow(dim, start - size, size)
                  for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
    chunks = tuple(chunk.contiguous() for chunk in chunks)
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            outputs.append(chunk.cuda(device, non_blocking=True))
    return tuple(outputs)
Example #6
    def __init__(self, opt):
        """
        Modulate the data ratio in the batch.
        For example, when select_data is "MJ-ST" and batch_ratio is "0.5-0.5",
        50% of the batch is filled with MJ and the other 50% with ST.
        """
        print('-' * 80)
        print('dataset_root<', opt.train_data, '>')
        print('opt.select_data<', opt.select_data, '>')
        print('opt.batch_ratio<', opt.batch_ratio, '>')

        assert len(opt.select_data) == len(opt.batch_ratio)

        _AlignCollate = AlignCollate(imgH=opt.imgH,
                                     imgW=opt.imgW,
                                     keep_ratio_with_pad=opt.PAD)
        self.data_loader_list = []
        self.dataloader_iter_list = []
        batch_size_list = []
        Total_batch_size = 0
        for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
            _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
            print('-' * 80)
            _dataset = hierarchical_dataset(root=opt.train_data,
                                            opt=opt,
                                            select_data=[selected_d])
            total_number_dataset = len(_dataset)
            """
            The amount of data used can be modified with opt.total_data_usage_ratio.
            e.g., opt.total_data_usage_ratio = 1 indicates 100% usage, and 0.2 indicates 20% usage.
            See section 4.2 in our paper.
            """
            number_dataset = int(total_number_dataset *
                                 float(opt.total_data_usage_ratio))
            dataset_split = [
                number_dataset, total_number_dataset - number_dataset
            ]
            indices = range(total_number_dataset)
            _dataset, _ = [
                Subset(_dataset, indices[offset - length:offset]) for offset,
                length in zip(_accumulate(dataset_split), dataset_split)
            ]
            batch_size_list.append(str(_batch_size))
            Total_batch_size += _batch_size

            _data_loader = torch.utils.data.DataLoader(
                _dataset,
                batch_size=_batch_size,
                shuffle=True,
                num_workers=int(opt.workers),
                collate_fn=_AlignCollate,
                pin_memory=True)
            self.data_loader_list.append(_data_loader)
            self.dataloader_iter_list.append(iter(_data_loader))
        print('-' * 80)
        print('Total_batch_size: ', '+'.join(batch_size_list), '=',
              str(Total_batch_size))
        opt.batch_size = Total_batch_size
        print('-' * 80)
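A worked check of the two ratios used above; the batch size, ratios, and dataset size below are illustrative numbers, not real opt values:

# Illustrative numbers only: batch_size=192, one source with batch_ratio 0.5,
# total_data_usage_ratio=0.2 on a 10000-sample source.
batch_size, batch_ratio = 192, 0.5
per_source_batch = max(round(batch_size * batch_ratio), 1)   # 96 samples per batch

total, usage_ratio = 10000, 0.2
number_dataset = int(total * usage_ratio)                    # 2000 samples kept
dataset_split = [number_dataset, total - number_dataset]     # [2000, 8000]; only the
print(per_source_batch, dataset_split)                       # first subset is used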
Example #7
    def __init__(self, opt):

        print('-' * 80)
        print(
            f'dataset_root: {opt.train_data}\nopt.select_data: {opt.select_data}\nopt.batch_ratio: {opt.batch_ratio}'
        )
        assert len(opt.select_data) == len(opt.batch_ratio)

        _AlignCollate = AlignCollate(imgH=opt.imgH,
                                     imgW=opt.imgW,
                                     keep_ratio_with_pad=opt.PAD)
        self.data_loader_list = []
        self.dataloader_iter_list = []
        batch_size_list = []
        Total_batch_size = 0
        for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
            _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
            print('-' * 80)
            _dataset = hierarchical_dataset(root=opt.train_data,
                                            opt=opt,
                                            select_data=[selected_d])
            total_number_dataset = len(_dataset)

            number_dataset = int(total_number_dataset *
                                 float(opt.total_data_usage_ratio))
            dataset_split = [
                number_dataset, total_number_dataset - number_dataset
            ]
            indices = range(total_number_dataset)
            _dataset, _ = [
                Subset(_dataset, indices[offset - length:offset]) for offset,
                length in zip(_accumulate(dataset_split), dataset_split)
            ]
            print(
                f'num total samples of {selected_d}: {total_number_dataset} x {opt.total_data_usage_ratio} (total_data_usage_ratio) = {len(_dataset)}'
            )
            print(
                f'num samples of {selected_d} per batch: {opt.batch_size} x {float(batch_ratio_d)} (batch_ratio) = {_batch_size}'
            )
            batch_size_list.append(str(_batch_size))
            Total_batch_size += _batch_size

            _data_loader = torch.utils.data.DataLoader(
                _dataset,
                batch_size=_batch_size,
                shuffle=True,
                num_workers=int(opt.workers),
                collate_fn=_AlignCollate,
                pin_memory=True)
            self.data_loader_list.append(_data_loader)
            self.dataloader_iter_list.append(iter(_data_loader))
        print('-' * 80)
        print('Total_batch_size: ', '+'.join(batch_size_list), '=',
              str(Total_batch_size))
        opt.batch_size = Total_batch_size
        print('-' * 80)
Example #8
def determ_split(dataset, lengths):
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    return [
        Subset(dataset, np.arange(offset - length, offset))
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
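A self-contained sketch that reproduces determ_split's behaviour inline (range replaces np.arange), assuming only the standard torch.utils.data types:

import torch
from itertools import accumulate
from torch.utils.data import TensorDataset, Subset

# Deterministic 6/4 split: always the first 6 samples, then the last 4, in order.
ds = TensorDataset(torch.arange(10))
lengths = [6, 4]
train, val = [Subset(ds, list(range(offset - length, offset)))
              for offset, length in zip(accumulate(lengths), lengths)]
print(len(train), len(val))      # 6 4
print([int(x[0]) for x in val])  # [6, 7, 8, 9]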
Example #9
def CxrRandomSplit(dataset, lengths):
    from torch._utils import _accumulate
    if sum(lengths) > len(dataset):
        raise ValueError(
            "Sum of input lengths must less or equal to the length of the input dataset!"
        )
    indices = torch.randperm(sum(lengths)).tolist()
    return [
        CxrSubset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #10
    def __init__(self, opt):
        """
        Modulate the data ratio in the batch.
        For example, when select_data is "MJ-ST" and batch_ratio is "0.5-0.5",
        50% of the batch is filled with MJ and the other 50% with ST.
        """
        print('-' * 80)
        print(
            f'dataset_root: {opt.train_data}\nopt.select_data: {opt.select_data}\nopt.batch_ratio: {opt.batch_ratio}'
        )
        assert len(opt.select_data) == len(opt.batch_ratio)

        _AlignCollate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW)
        self.data_loader_list = []
        self.dataloader_iter_list = []
        for selected_d, batch_ratio_d in zip(opt.select_data, opt.batch_ratio):
            _batch_size = max(round(opt.batch_size * float(batch_ratio_d)), 1)
            print('-' * 80)
            _dataset = hierarchical_dataset(root=opt.train_data,
                                            opt=opt,
                                            select_data=[selected_d])
            total_number_dataset = len(_dataset)
            """
            The amount of data used can be modified with opt.total_data_usage_ratio.
            e.g., opt.total_data_usage_ratio = 1 indicates 100% usage, and 0.2 indicates 20% usage.
            See section 4.2 in our paper.
            """
            number_dataset = int(total_number_dataset *
                                 float(opt.total_data_usage_ratio))
            dataset_split = [
                number_dataset, total_number_dataset - number_dataset
            ]
            indices = range(total_number_dataset)
            _dataset, _ = [
                Subset(_dataset, indices[offset - length:offset]) for offset,
                length in zip(_accumulate(dataset_split), dataset_split)
            ]
            print(
                f'num total samples of {selected_d}: {total_number_dataset} x {opt.total_data_usage_ratio} (total_data_usage_ratio) = {len(_dataset)}'
            )
            print(
                f'num samples of {selected_d} per batch: {opt.batch_size} x {float(batch_ratio_d)} (batch_ratio) = {_batch_size}'
            )

            _data_loader = torch.utils.data.DataLoader(
                _dataset,
                batch_size=_batch_size,
                shuffle=True,
                num_workers=int(opt.workers),
                collate_fn=_AlignCollate,
                pin_memory=True)
            self.data_loader_list.append(_data_loader)
            self.dataloader_iter_list.append(iter(_data_loader))
        print('-' * 80)
Example #11
def random_split_dataset(dataset, lengths, seed=0):
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    r = np.random.RandomState(1234)
    r.seed(seed)
    indices = r.permutation(sum(lengths)).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
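A small check of the seeding behaviour assumed above: the explicit r.seed(seed) call overrides the 1234 passed to RandomState, so equal seeds give identical permutations:

import numpy as np

def seeded_perm(n, seed=0):
    r = np.random.RandomState(1234)
    r.seed(seed)  # the explicit seed overrides the 1234 above
    return r.permutation(n).tolist()

print(seeded_perm(10, seed=7))                    # same list on every run
assert seeded_perm(10, seed=7) == seeded_perm(10, seed=7)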
Example #12
    def split(self, lengths):
        """
        split a dataset into non-overlapping new datasets of given lengths.
        Arguments:
            dataset (Dataset): Dataset to be split
            lengths (sequence): lengths of splits to be produced
        """
        if sum(lengths) != len(self):
            raise ValueError(
                "Sum of input lengths does not equal the length of the input dataset!")

        return [self.select_range(offset-length, offset) for offset, length in zip(_accumulate(lengths), lengths)]
Example #13
def sequence_split(lengths: List[int]) -> List[List[int]]:
    """
    Counterpart of torch.utils.data.dataset.random_split that splits a dataset
    sequentially (in order) according to the given lengths.
    Args:
        lengths: lengths of the splits to be produced

    Returns:
        A list of index lists, one per split.
    """
    indices = torch.arange(0, sum(lengths)).tolist()
    return [indices[offset - length:offset] for offset, length in
            zip(_accumulate(lengths), lengths)]
Example #14
def deterministic_split_dataset(ds, split_rate):
    """ Split data consistently into train/val datasets"""
    size = len(ds)
    train_split = int(size * split_rate)
    val_split = size - train_split

    lengths = [train_split, val_split]
    indices = list(range(size))  # deterministic split: keep the original order

    return [
        Subset(ds, indices[offset - length : offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #15
def divide_datasets(dataset_list, eachusage):
    temp_datasets = []
    for idx, dl in enumerate(dataset_list):
        usage = eachusage[idx]
        total_num = len(dl)
        number_dataset = int(total_num * float(usage))
        dataset_split = [number_dataset, total_num - number_dataset]
        indices = range(total_num)
        dl, _ = [
            Subset(dl, indices[offset - length:offset]) for offset, length
            in zip(_accumulate(dataset_split), dataset_split)
        ]
        temp_datasets.append(dl)
    return temp_datasets
Example #16
def _create_module_dir(fullname):
    module, _, name = fullname.rpartition('.')
    if not module:
        target_dir = name
    else:
        target_dir = reduce(os.path.join, fullname.split('.'))
    try:
        os.makedirs(target_dir)
    except os.error:
        pass
    for dirname in _accumulate(fullname.split('.'), os.path.join):
        init_file = os.path.join(dirname, '__init__.py')
        open(init_file, 'a').close()  # Create file if it doesn't exist yet
    return name, target_dir
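Here _accumulate is used with a binary function rather than the default addition; a standalone sketch with itertools.accumulate shows the intermediate package directories it yields:

import os
from itertools import accumulate  # stand-in for torch._utils._accumulate

fullname = 'pkg.sub.mod'
# Yields 'pkg', 'pkg/sub', 'pkg/sub/mod' (separator is platform-specific) —
# every directory that needs an __init__.py.
print(list(accumulate(fullname.split('.'), os.path.join)))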
Example #17
def random_split(dataset, lengths):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) > len(dataset):
        raise ValueError("Sum of input lengths is greater than the length of the input dataset!")

    indices = randperm(sum(lengths)).tolist()
    return [torch.utils.data.Subset(dataset, indices[offset - length:offset]) for offset, length in
            zip(_accumulate(lengths), lengths)]
Example #18
def get_splits(dataset, split_portions=(0.8, 0.1, 0.1), seed=123):
    """Return a list of Subsets given a dataset. Use a fixed seed (and split portions) to ensure the
    splits are always the same across every run.
    """
    dataset_len = len(dataset)
    lengths = get_split_sizes(dataset_len, split_portions)  # Get the size of each split

    # Create a permutation
    rand = random.Random(seed)
    perm = list(range(dataset_len))  # a range object cannot be shuffled in place
    rand.shuffle(perm)  # Shuffle in place

    assert sum(lengths) == dataset_len
    return [Subset(dataset, perm[offset - length:offset])
            for offset, length in zip(_accumulate(lengths), lengths)]
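get_split_sizes is not defined above; the sketch below pairs a plausible, hypothetical version of it with the fixed-seed shuffle, to show how the splits stay identical across runs:

import random

def get_split_sizes(n, portions):    # hypothetical helper, not from the source
    sizes = [int(p * n) for p in portions]
    sizes[-1] = n - sum(sizes[:-1])  # absorb rounding error in the last split
    return sizes

n, seed = 100, 123
lengths = get_split_sizes(n, (0.8, 0.1, 0.1))  # [80, 10, 10]
perm = list(range(n))
random.Random(seed).shuffle(perm)              # same permutation on every run
print(lengths, perm[:5])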
Example #19
def kfold_split(dataset, fold=5, seed=0):
    total_len = len(dataset)
    lengths = []
    for _ in range(fold - 1):
        lengths.append(int(total_len / fold))
    lengths.append(total_len - (fold - 1) * int(total_len / fold))

    r = np.random.RandomState(1234)
    r.seed(seed)

    indices = r.permutation(total_len).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #20
def _create_module_dir(base_path, fullname):
    module, _, name = fullname.rpartition('.')
    if not module:
        target_dir = name
    else:
        target_dir = reduce(os.path.join, fullname.split('.'))
    target_dir = os.path.join(base_path, target_dir)
    try:
        os.makedirs(target_dir)
    except os.error:
        pass
    for dirname in _accumulate(fullname.split('.'), os.path.join):
        init_file = os.path.join(base_path, dirname, '__init__.py')
        open(init_file, 'a').close()  # Create file if it doesn't exist yet
    return name, target_dir
Example #21
def scatter(tensor, devices, chunk_sizes=None, dim=0):
    "Scatters tensor across multiple GPUs"
    if chunk_sizes is None:
        chunks = tensor.chunk(len(devices), dim)
    else:
        assert sum(chunk_sizes) == tensor.size(dim), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), tensor.size(dim))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        chunks = [
            tensor.narrow(dim, start - size, size)
            for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)
        ]
    # TODO: copy to a pinned buffer first (if copying from CPU)
    return tuple(
        chunk.cuda(gpu_id, non_blocking=chunk.is_contiguous())
        for gpu_id, chunk in zip(devices, chunks))
Example #22
    def random_split(self, lengths):
        """
        Randomly split this dataset into non-overlapping new datasets of given lengths.
        Arguments:
            lengths (sequence): lengths of splits to be produced
        """
        if sum(lengths) != len(self):
            raise ValueError(
                "Sum of input lengths does not equal the length of the input dataset!"
            )

        indices = torch.randperm(sum(lengths)).tolist()
        return [
            self.subset(indices[offset - length:offset])
            for offset, length in zip(_accumulate(lengths), lengths)
        ]
Example #23
def split_with_indices(dataset, lengths, indices):
    r"""
    Split a dataset into non-overlapping new datasets of given lengths, using
    the provided permutation of indices.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        indices (sequence): permutation of indices used to form the splits
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ], indices
Example #24
def random_split(dataset, lengths):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.
    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (iterable): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = randperm(sum(lengths))
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #25
def scatter_list(list_of_tensors, devices, chunk_sizes=None, streams=None):
    """Scatters tensor across multiple GPUs.

    Arguments:
        list_of_tensors (list(Tensor)): list of tensors to scatter.
        devices (Iterable[int]): iterable of ints, specifying among which
            devices the tensor should be scattered.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
            each device. It should match ``devices`` in length and sum to
            ``len(list_of_tensors)``. If not specified, the
            list of tensors will be divided
            into equal chunks.

    Returns:
        A tuple containing chunks of the ``list of tensors``, spread across given
        ``devices``.
    """
    if chunk_sizes is None:
        chunks = chunk_list(list_of_tensors, len(devices))
    else:
        assert sum(chunk_sizes) == len(list_of_tensors), "given chunk sizes " \
            "don't sum up to the tensor's size (sum(chunk_sizes) == {}, but " \
            "expected {})".format(sum(chunk_sizes), len(list_of_tensors))
        assert min(chunk_sizes) > 0, "got a negative chunk_size"
        # chunks = [list_of_tensors.narrow(dim, start - size, size)
        #           for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)]
        chunks = [
            list_of_tensors[start - size:start]
            for start, size in zip(_accumulate(chunk_sizes), chunk_sizes)
        ]
    # chunks = tuple(chunk.contiguous() for chunk in chunks)
    # TODO: copy to a pinned buffer first (if copying from CPU)
    if streams is None:
        streams = [None] * len(devices)
    outputs = []
    for device, chunk, stream in zip(devices, chunks, streams):
        with torch.cuda.device(device), torch.cuda.stream(stream):
            # outputs.append(chunk.cuda(device, non_blocking=True))
            outputs.append(
                Utils.move_tensor_list_to_device(chunk,
                                                 device,
                                                 non_blocking=True))
    return tuple(outputs)
Example #26
def random_split_i(dataset, lengths):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths.
    Adapted from source, but uses SubsetI to return indices
    https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#random_split
    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = torch.randperm(sum(lengths)).tolist()
    return [
        SubsetI(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #27
    def load_data(self, IID=True):
        self.trainset = datasets.MNIST(
            self.path, train=True, download=True, transform=transforms.Compose([
                transforms.RandomRotation(15),
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]))
        self.testset = datasets.MNIST(
            self.path, train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))
            ]))
        total_clients = self.config.clients.total
        total_sample = self.trainset.data.shape[0]
        # number of samples on each client
        length = [total_sample // total_clients] * total_clients
        if IID:
            spilted_train = random_split(self.trainset, length)

        else:
            print("None-IID")
            if sum(length) != len(self.trainset):
                raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
            index = []
            for i in range(10):
                index.append([])

            i = 0
            for img, label in self.trainset:
                index[label].append(i)
                i += 1

            indices = np.array([elem for c_list in index for elem in c_list]).reshape(-1, 200)

            np.random.shuffle(indices)
            indices = indices.flatten()
            print(indices.shape)

            spilted_train = [Subset(self.trainset, indices[offset - length:offset]) for offset, length in
                             zip(_accumulate(length), length)]
            print(len(spilted_train))
        return spilted_train, self.testset
Example #28
def sequential_split(dataset, lengths):
    r"""
    Sequentially split a dataset into non-overlapping new datasets of given lengths.

    >>> sequential_split(range(10), [3, 7])

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = list(range(sum(lengths)))
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #29
def random_split_before_transform(dataset, lengths, transforms):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given lengths before transforming the input.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
        transforms: transformation to apply to data
    """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    indices = randperm(sum(lengths)).tolist()
    return [
        TransformableSubset(dataset, indices[offset - length:offset],
                            transform) for offset, length, transform in zip(
                                _accumulate(lengths), lengths, transforms)
    ]
Example #30
def random_split(dataset, valid_split):
    r"""
    Randomly split a dataset into non-overlapping new datasets of given valid_split percentage.

    Arguments:
        dataset (Dataset): Dataset to be split
        valid_split (sequence): percentage of validation data to be produced
    """
    assert 0 <= valid_split < 1
    valid_len = int(len(dataset) * valid_split)
    if valid_len == 0:
        return dataset, None
    train_len = len(dataset) - valid_len

    lengths = [train_len, valid_len]
    indices = randperm(sum(lengths)).tolist()
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
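A worked check of the length arithmetic above, with illustrative numbers (103 samples, 20% validation):

n, valid_split = 103, 0.2
valid_len = int(n * valid_split)  # int(20.6) == 20
train_len = n - valid_len         # 83
print(train_len, valid_len)       # 83 20; valid_len == 0 would return (dataset, None)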
Example #31
def split_dataset(dataset, lengths=(80, 20), random=True):
    """
    split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (sequence): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        # lengths are proportions
        mult = len(dataset) / sum(lengths)
        lengths = [round(l * mult) for l in lengths]

    if not random:
        indices = torch.arange(0, sum(lengths)).long()
    else:
        indices = torch.randperm(sum(lengths))
    return [
        torch.utils.data.Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #32
def fixed_random_split(dataset, lengths):
    """Randomly split a dataset into non-overlapping new datasets of given 
  lengths. The seed of this random function is always 42.

  Args:
    dataset (torch Dataset): dataset to be split.
    lengths (list(int)): lengths of splits to be produced
  """
    if sum(lengths) != len(dataset):
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )

    torch.manual_seed(42)
    indices = randperm(sum(lengths))
    torch.manual_seed(datetime.datetime.now().timestamp())

    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]
Example #33
def random_split(dataset, lengths):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Arguments:
        dataset (Dataset): Dataset to be split
        lengths (iterable): lengths of splits to be produced
    """
    if sum(lengths) != len(dataset):
        raise ValueError("Sum of input lengths does not equal the length of the input dataset!")

    indices = randperm(sum(lengths))
    return [Subset(dataset, indices[offset - length:offset]) for offset, length in zip(_accumulate(lengths), lengths)]
Example #34
    def backward(ctx, grad_output):
        return (None,) + tuple(grad_output.narrow(ctx.dim, end - size, size) for size, end
                               in zip(ctx.input_sizes, _accumulate(ctx.input_sizes)))
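A self-contained sketch of what this backward computes: the gradient of a cat along ctx.dim is cut back into one slice per input, using cumulative input sizes as slice ends (itertools.accumulate standing in for _accumulate):

import torch
from itertools import accumulate

input_sizes, dim = [2, 3, 1], 0
grad_output = torch.arange(6.0).unsqueeze(1)  # shape (6, 1), as if produced by a cat along dim 0
grads = tuple(grad_output.narrow(dim, end - size, size)
              for size, end in zip(input_sizes, accumulate(input_sizes)))
print([tuple(g.shape) for g in grads])  # [(2, 1), (3, 1), (1, 1)]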