Example #1
import os

import numpy as np
import torch
import torchvision.transforms as transforms

# check, ImageFolder_custom, separate_data, split_data and save_file are
# assumed to be helper utilities provided by the surrounding repository.


def generate_dataset(dir_path, num_clients, num_classes, niid, real, partition,
                     balance):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Set up the paths for train/test data. Note that dir_path is expected
    # to end with a path separator, since paths are built by concatenation.
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes,
             niid, real, partition):
        return

    # Get data
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = ImageFolder_custom(root=dir_path +
                                  'rawdata/tiny-imagenet-200/train/',
                                  transform=transform)
    testset = ImageFolder_custom(root=dir_path +
                                 'rawdata/tiny-imagenet-200/val/',
                                 transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=len(trainset),
                                              shuffle=False)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=len(testset),
                                             shuffle=False)

    # With batch_size == len(dataset), each loader yields a single batch
    # holding the transformed tensors for the whole split.
    for train_data in trainloader:
        trainset.data, trainset.targets = train_data
    for test_data in testloader:
        testset.data, testset.targets = test_data

    dataset_image = []
    dataset_label = []

    dataset_image.extend(trainset.data.cpu().detach().numpy())
    dataset_image.extend(testset.data.cpu().detach().numpy())
    dataset_label.extend(trainset.targets.cpu().detach().numpy())
    dataset_label.extend(testset.targets.cpu().detach().numpy())
    dataset_image = np.array(dataset_image)
    dataset_label = np.array(dataset_label)

    # dataset = []
    # for i in range(num_classes):
    #     idx = dataset_label == i
    #     dataset.append(dataset_image[idx])

    X, y, statistic = separate_data((dataset_image, dataset_label),
                                    num_clients, num_classes, niid, real,
                                    partition, balance)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data,
              num_clients, num_classes, statistic, niid, real, partition)
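
For reference, a minimal driver for this example might look like the sketch below. The argument values (the client count, the "dir" partition flag, the output directory) are illustrative assumptions, not taken from the source; Tiny-ImageNet itself has 200 classes, and the trailing slash on dir_path matters because the paths above are built by plain concatenation.

if __name__ == "__main__":
    # Hypothetical values: 20 clients over Tiny-ImageNet's 200 classes.
    # "dir" is assumed here to select a Dirichlet-style partition inside
    # separate_data; check the repository's own flags before reusing it.
    generate_dataset("tiny-imagenet/", num_clients=20, num_classes=200,
                     niid=True, real=True, partition="dir", balance=False)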
Example #2
import os

import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms

# check, separate_data, split_data and save_file are assumed to be helper
# utilities provided by the surrounding repository.


def generate_fmnist(dir_path,
                    num_clients,
                    num_classes,
                    niid=False,
                    real=True,
                    partition=None):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Set up the directory for train/test data
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes,
             niid, real, partition):
        return

    # Get FashionMNIST data
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize([0.5], [0.5])])

    trainset = torchvision.datasets.FashionMNIST(root=dir_path + "rawdata",
                                                 train=True,
                                                 download=True,
                                                 transform=transform)
    testset = torchvision.datasets.FashionMNIST(root=dir_path + "rawdata",
                                                train=False,
                                                download=True,
                                                transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=len(trainset.data),
                                              shuffle=False)
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=len(testset.data),
                                             shuffle=False)

    # With batch_size == len(dataset), each loader yields a single batch
    # holding the transformed tensors for the whole split.
    for train_data in trainloader:
        trainset.data, trainset.targets = train_data
    for test_data in testloader:
        testset.data, testset.targets = test_data

    dataset_image = []
    dataset_label = []

    dataset_image.extend(trainset.data.cpu().detach().numpy())
    dataset_image.extend(testset.data.cpu().detach().numpy())
    dataset_label.extend(trainset.targets.cpu().detach().numpy())
    dataset_label.extend(testset.targets.cpu().detach().numpy())
    dataset_image = np.array(dataset_image)
    dataset_label = np.array(dataset_label)

    # dataset = []
    # for i in range(num_classes):
    #     idx = dataset_label == i
    #     dataset.append(dataset_image[idx])

    X, y, statistic = separate_data((dataset_image, dataset_label),
                                    num_clients, num_classes, niid, real,
                                    partition)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data,
              num_clients, num_classes, statistic, niid, real, partition)
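
A minimal driver for this example might look like the following sketch; the client count and output directory are illustrative assumptions, while FashionMNIST itself has 10 classes. partition=None simply takes the function's default behavior.

if __name__ == "__main__":
    # Hypothetical values: 20 clients over FashionMNIST's 10 classes.
    # The trailing slash on dir_path matters, since all paths above are
    # built by plain string concatenation.
    generate_fmnist("fmnist/", num_clients=20, num_classes=10,
                    niid=False, real=True, partition=None)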
Example #3
import os

import numpy as np
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# check, separate_data, split_data and save_file are assumed to be helper
# utilities provided by the surrounding repository.


def generate_agnews(dir_path,
                    num_clients,
                    num_classes,
                    niid=False,
                    real=True,
                    partition=None):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Set up the directory for train/test data
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes,
             niid, real, partition):
        return

    # Get AG_News data
    trainset, testset = torchtext.datasets.AG_NEWS(root=dir_path + "rawdata")

    trainlabel, traintext = list(zip(*trainset))
    testlabel, testtext = list(zip(*testset))

    dataset_text = []
    dataset_label = []

    dataset_text.extend(traintext)
    dataset_text.extend(testtext)
    dataset_label.extend(trainlabel)
    dataset_label.extend(testlabel)

    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, iter(dataset_text)),
                                      specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1  # AG_NEWS labels are 1-4; shift to 0-3

    def text_transform(text, label, max_len=0):
        # Pad every tokenized text with 0s (the <unk> index) and truncate
        # it to max_len, so all sequences end up with the same length.
        label_list, text_list = [], []
        for _text, _label in zip(text, label):
            label_list.append(label_pipeline(_label))
            text_ = text_pipeline(_text)
            padding = [0 for i in range(max_len - len(text_))]
            text_.extend(padding)
            text_list.append(text_[:max_len])
        return label_list, text_list

    # max_len must be known before the transform, so tokenize once to
    # measure the true length of every text, then pad/truncate everything
    # to the longest sequence.
    text_lens = [len(text_pipeline(text)) for text in dataset_text]
    max_len = max(text_lens)
    label_list, text_list = text_transform(dataset_text, dataset_label,
                                           max_len)

    # Keep each padded text paired with its true (unpadded) length.
    text_list = [(text, l) for text, l in zip(text_list, text_lens)]

    text_list = np.array(text_list, dtype=object)
    label_list = np.array(label_list)

    # dataset = []
    # for i in range(num_classes):
    #     idx = label_list == i
    #     dataset.append(text_list[idx])

    X, y, statistic = separate_data((text_list, label_list), num_clients,
                                    num_classes, niid, real, partition)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data,
              num_clients, num_classes, statistic, niid, real, partition)

    print("The size of vocabulary:", len(vocab))