def test_create_lda_partitions_imbalanced_not_set(self) -> None:
    """Test that LDA partitioning raises when imbalance is not allowed."""
    # Prepare: a tiny concentration forces highly skewed class draws,
    # which cannot be balanced across three equal-sized partitions.
    n_parts = 3
    alpha = 1e-3

    # Execute & Assert: without accept_imbalanced, this must fail.
    with self.assertRaises(ValueError):
        create_lda_partitions(
            dataset=self.ds,
            num_partitions=n_parts,
            concentration=alpha,
        )
def test_create_lda_partitions_elements_wrong_list_concentration(self) -> None:
    """Test that a concentration list of the wrong length is rejected."""
    # Prepare: one entry too many — the list must have one value per class.
    n_parts = 5
    bad_concentration = [0.5] * (self.num_classes + 1)

    # Execute & Assert
    with self.assertRaises(ValueError):
        create_lda_partitions(
            dataset=self.ds,
            num_partitions=n_parts,
            concentration=bad_concentration,
        )
def do_fl_partitioning(path_to_dataset, pool_size, alpha, num_classes, val_ratio=0.0):
    """Partition a Torchvision-style dataset (e.g. CIFAR-10) using LDA.

    Loads a ``torch.save``-d ``[images, labels]`` pair, splits the sample
    indices into ``pool_size`` LDA partitions, and writes one sub-directory
    per client containing ``train.pt`` (and optionally ``val.pt``).

    Args:
        path_to_dataset: ``pathlib.Path`` to the serialized dataset file.
        pool_size: Number of client partitions to create.
        alpha: LDA concentration parameter (lower means more class skew).
        num_classes: Number of label classes (used for the info histogram).
        val_ratio: Fraction of each partition held out as validation data.

    Returns:
        Path to the directory containing the per-client splits.
    """
    images, labels = torch.load(path_to_dataset)
    # Partition index arrays rather than the raw images to keep LDA cheap.
    idx = np.arange(len(images))
    dataset = [idx, labels]
    partitions, _ = create_lda_partitions(
        dataset, num_partitions=pool_size, concentration=alpha, accept_imbalanced=True
    )

    # Show label distribution for first partition (purely informative)
    partition_zero = partitions[0][1]
    hist, _ = np.histogram(partition_zero, bins=list(range(num_classes + 1)))
    print(
        f"Class histogram for 0-th partition (alpha={alpha}, {num_classes} classes): {hist}"
    )

    # Save the partitioned dataset to disk: first delete any stale split
    # directory from a previous run, then recreate it.
    splits_dir = path_to_dataset.parent / "federated"
    if splits_dir.exists():
        shutil.rmtree(splits_dir)
    splits_dir.mkdir(parents=True)

    for p in range(pool_size):
        labels = partitions[p][1]
        image_idx = partitions[p][0]
        imgs = images[image_idx]

        # One sub-directory per client.
        client_dir = splits_dir / str(p)
        client_dir.mkdir()

        if val_ratio > 0.0:
            # Split this partition according to val_ratio.
            train_idx, val_idx = get_random_id_splits(len(labels), val_ratio)
            val_imgs = imgs[val_idx]
            val_labels = labels[val_idx]

            with open(client_dir / "val.pt", "wb") as f:
                torch.save([val_imgs, val_labels], f)

            # Remaining images go to training.
            imgs = imgs[train_idx]
            labels = labels[train_idx]

        with open(client_dir / "train.pt", "wb") as f:
            torch.save([imgs, labels], f)

    return splits_dir
def test_create_lda_partitions_elements_list_concentration(self) -> None:
    """Test that a per-class concentration list preserves every element."""
    # Prepare: one concentration value per class.
    n_parts = 5
    conc = [0.5] * self.num_classes

    # Execute
    partitions, _ = create_lda_partitions(
        dataset=self.ds, num_partitions=n_parts, concentration=conc
    )
    x_lda = np.concatenate([part[0] for part in partitions])
    y_lda = np.concatenate([part[1] for part in partitions])

    # Assert: the union of all partitions equals the original dataset.
    assert_identity(xy_0=self.ds, xy_1=(x_lda, y_lda))
def test_create_lda_partitions_alpha_near_zero(self) -> None:
    """Test that near-zero concentration (~1e-3) yields distributions
    dominated by a single class."""
    # Prepare
    n_parts = 5
    alpha = 1e-3

    # Execute
    _, distributions = create_lda_partitions(
        dataset=self.ds, num_partitions=n_parts, concentration=alpha
    )

    # Assert: each partition's class distribution has a clear mode.
    for dist in distributions:
        assert np.max(dist) > 0.5
def test_create_lda_partitions_imbalanced(self) -> None:
    """Test that imbalanced partitions are accepted when the
    accept_imbalanced flag is set."""
    # Prepare
    n_parts = 3
    alpha = 1e-3

    # Execute
    partitions, _ = create_lda_partitions(
        dataset=self.ds,
        num_partitions=n_parts,
        concentration=alpha,
        accept_imbalanced=True,
    )
    sizes = [x.shape[0] for x, _ in partitions]

    # Assert: no sample is lost or duplicated across partitions.
    assert np.sum(sizes) == self.num_samples
def test_create_lda_partitions_large_alpha(self) -> None:
    """Test that a large concentration (~1e5) yields near-uniform
    partition class distributions."""
    # Prepare
    num_partitions = 5
    concentration = 1e5
    # NOTE: the `np.float` alias was deprecated in NumPy 1.20 and removed
    # in 1.24; use the explicit `np.float64` (same dtype) instead.
    uniform = 1.0 / self.num_classes * np.ones((self.num_classes,), dtype=np.float64)

    # Execute
    _, distributions = create_lda_partitions(
        dataset=self.ds, num_partitions=num_partitions, concentration=concentration
    )
    test_num_partitions, _ = distributions.shape

    # Assert: every partition's distribution is uniform to 3 decimals.
    for part in range(test_num_partitions):
        this_distribution = distributions[part]
        np.testing.assert_array_almost_equal(this_distribution, uniform, decimal=3)
def test_create_lda_partitions_with_inf_alpha(self) -> None:
    """Test that concentration=Inf produces exactly uniform partition
    distributions while preserving every element."""
    # Prepare
    n_parts = 5
    alpha = float("inf")

    # Execute
    partitions, dirichlet_dist = create_lda_partitions(
        dataset=self.ds, num_partitions=n_parts, concentration=alpha
    )
    x_lda = np.concatenate([part[0] for part in partitions])
    y_lda = np.concatenate([part[1] for part in partitions])

    # Assert: returned Dirichlet matrix is exactly uniform ...
    expected = 1.0 / self.num_classes * np.ones(
        (n_parts, self.num_classes), dtype=np.float32
    )
    np.testing.assert_array_equal(dirichlet_dist, expected)
    # ... and no element was lost or duplicated.
    assert_identity(xy_0=self.ds, xy_1=(x_lda, y_lda))
'update_interval': 55, 'ae_loss': 'mse', 'cl_loss': 'kld' } outcomes = None # dataset, building the whole one and get the local if DATASET == 'blobs': n_features = 30 X = datasets.make_blobs(n_samples=N_SAMPLES, n_features=n_features, random_state=SEED, centers=N_CLUSTERS) if USE_LDA: Y, _ = create_lda_partitions(dataset=X, num_partitions=N_CLIENTS) else: Y = create_partitions(unpartitioned_dataset=X, iid_fraction=0.5, num_partitions=N_CLIENTS) x, y = Y[CLIENT_ID] # .copy() # dimensions of the autoencoder dense layers dims = [x.shape[-1], int(4 * n_features), N_CLUSTERS] del X, Y elif DATASET == 'moons': x_tot, y_tot = data_util.build_dataset(N_CLIENTS, N_SAMPLES, R_NOISE, SEED) x_train, y_train = data_util.get_client_dataset( args.client_id, N_CLIENTS, x_tot, y_tot) # dimensions of the autoencoder dense layers dims = [x.shape[-1], 8, 8, 32, N_CLUSTERS]