import os

import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# The helpers check, separate_data, split_data, and save_file are assumed to be
# provided by this repository's dataset utilities module (not shown in this
# section); an illustrative sketch of ImageFolder_custom follows generate_dataset.


def generate_dataset(dir_path, num_clients, num_classes, niid, real, partition, balance):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Setup directory for train/test data
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes, niid, real, partition):
        return

    # Get data
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = ImageFolder_custom(root=dir_path + 'rawdata/tiny-imagenet-200/train/', transform=transform)
    testset = ImageFolder_custom(root=dir_path + 'rawdata/tiny-imagenet-200/val/', transform=transform)

    # Load each split as a single batch so the transformed tensors can be captured in one pass.
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset), shuffle=False)
    testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset), shuffle=False)

    for _, train_data in enumerate(trainloader, 0):
        trainset.data, trainset.targets = train_data
    for _, test_data in enumerate(testloader, 0):
        testset.data, testset.targets = test_data

    # Merge train and test splits into flat image/label arrays before re-partitioning.
    dataset_image = []
    dataset_label = []

    dataset_image.extend(trainset.data.cpu().detach().numpy())
    dataset_image.extend(testset.data.cpu().detach().numpy())
    dataset_label.extend(trainset.targets.cpu().detach().numpy())
    dataset_label.extend(testset.targets.cpu().detach().numpy())
    dataset_image = np.array(dataset_image)
    dataset_label = np.array(dataset_label)

    X, y, statistic = separate_data((dataset_image, dataset_label), num_clients, num_classes,
                                    niid, real, partition, balance)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data, num_clients,
              num_classes, statistic, niid, real, partition)
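
# ImageFolder_custom is a project-specific dataset class that is not shown in
# this section. A minimal sketch, assuming it only needs to behave like
# torchvision's ImageFolder for the usage above (yield (image, label) pairs and
# accept .data / .targets being attached afterwards); the real class may add
# extra logic such as per-client index filtering.
class ImageFolder_custom(torchvision.datasets.ImageFolder):
    def __init__(self, root, transform=None, target_transform=None):
        super().__init__(root=root, transform=transform, target_transform=target_transform)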
def generate_fmnist(dir_path, num_clients, num_classes, niid=False, real=True, partition=None):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Setup directory for train/test data
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes, niid, real, partition):
        return

    # Get FashionMNIST data
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])

    trainset = torchvision.datasets.FashionMNIST(
        root=dir_path + "rawdata", train=True, download=True, transform=transform)
    testset = torchvision.datasets.FashionMNIST(
        root=dir_path + "rawdata", train=False, download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=len(trainset.data), shuffle=False)
    testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset.data), shuffle=False)

    for _, train_data in enumerate(trainloader, 0):
        trainset.data, trainset.targets = train_data
    for _, test_data in enumerate(testloader, 0):
        testset.data, testset.targets = test_data

    dataset_image = []
    dataset_label = []

    dataset_image.extend(trainset.data.cpu().detach().numpy())
    dataset_image.extend(testset.data.cpu().detach().numpy())
    dataset_label.extend(trainset.targets.cpu().detach().numpy())
    dataset_label.extend(testset.targets.cpu().detach().numpy())
    dataset_image = np.array(dataset_image)
    dataset_label = np.array(dataset_label)

    X, y, statistic = separate_data((dataset_image, dataset_label), num_clients, num_classes,
                                    niid, real, partition)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data, num_clients,
              num_classes, statistic, niid, real, partition)
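
# A hypothetical standalone driver for the FashionMNIST generator above. The
# directory name, client count, and partition settings are illustrative
# assumptions, not values fixed by this code.
if __name__ == "__main__":
    generate_fmnist("FashionMNIST/", num_clients=20, num_classes=10,
                    niid=False, real=True, partition=None)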
def generate_agnews(dir_path, num_clients, num_classes, niid=False, real=True, partition=None):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Setup directory for train/test data
    config_path = dir_path + "config.json"
    train_path = dir_path + "train/train.json"
    test_path = dir_path + "test/test.json"

    if check(config_path, train_path, test_path, num_clients, num_classes, niid, real, partition):
        return

    # Get AG_News data
    trainset, testset = torchtext.datasets.AG_NEWS(root=dir_path + "rawdata")

    trainlabel, traintext = list(zip(*trainset))
    testlabel, testtext = list(zip(*testset))

    dataset_text = []
    dataset_label = []

    dataset_text.extend(traintext)
    dataset_text.extend(testtext)
    dataset_label.extend(trainlabel)
    dataset_label.extend(testlabel)

    # Build a vocabulary over the full corpus; unknown tokens map to <unk>.
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, iter(dataset_text)), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: int(x) - 1

    def text_transform(text, label, max_len=0):
        label_list, text_list = [], []
        for _text, _label in zip(text, label):
            label_list.append(label_pipeline(_label))
            text_ = text_pipeline(_text)
            padding = [0 for i in range(max_len - len(text_))]
            text_.extend(padding)
            text_list.append(text_[:max_len])
        return label_list, text_list

    # Pad/truncate every article to a fixed length. The original snippet used an
    # undefined max_len here; a fixed value is assumed below (adjust as needed).
    max_len = 200
    label_list, text_list = text_transform(dataset_text, dataset_label, max_len)
    text_lens = [len(text) for text in text_list]

    # Store each article together with its (padded) length.
    text_list = [(text, l) for text, l in zip(text_list, text_lens)]

    text_list = np.array(text_list, dtype=object)
    label_list = np.array(label_list)

    X, y, statistic = separate_data((text_list, label_list), num_clients, num_classes,
                                    niid, real, partition)
    train_data, test_data = split_data(X, y)
    save_file(config_path, train_path, test_path, train_data, test_data, num_clients,
              num_classes, statistic, niid, real, partition)

    print("The size of vocabulary:", len(vocab))
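
# A small, self-contained illustration of the padding/truncation scheme used by
# text_transform above, on a toy corpus rather than AG_NEWS. The helper is
# hypothetical (not part of the original utilities), and the token indices
# depend on how build_vocab_from_iterator orders the toy vocabulary; the point
# is only that every document comes out with exactly max_len ids.
def _demo_text_transform(max_len=8):
    tok = get_tokenizer('basic_english')
    docs = ["the cat sat on the mat",
            "a much longer sentence that will be truncated to the fixed length"]
    vocab_demo = build_vocab_from_iterator(map(tok, docs), specials=["<unk>"])
    vocab_demo.set_default_index(vocab_demo["<unk>"])
    for doc in docs:
        ids = vocab_demo(tok(doc))
        # Zero-pad short documents, then cut everything to max_len tokens.
        padded = (ids + [0] * max(0, max_len - len(ids)))[:max_len]
        print(len(padded), padded)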