def generate_synthetic_dataset(
        num_points: int = 1000000,
        dim: int = 100,
        num_clusters: int = 64,
        cluster_ratio: float = 100.0,
        radius: float = 1.0) -> clustering_params.Data:
    """Creates a labelled synthetic dataset of Gaussian clusters.

    Cluster centers are drawn with sample_uniform_sphere using the reduced
    radius radius * (1 - 1 / cluster_ratio), which leaves room for the noise
    added around each center. Gaussian noise is then sampled for every point
    and each point is shifted by the center of its cluster. Every cluster
    receives num_points // num_clusters points; the remainder, if any, goes to
    the last cluster. Finally the points are clipped via Data.clip_by_radius().

    Args:
      num_points: Total number of data points to generate.
      dim: Dimension of each data point.
      num_clusters: Number of clusters the points are split evenly across;
        leftover points are assigned to the last cluster.
      cluster_ratio: Ratio of the intercluster distance to the intracluster
        distance.
      radius: Radius that all of the data is confined to. It is enforced at
        the end by clipping the generated points.

    Returns:
      Data holding the clipped datapoints, the radius, and the cluster index
      of every point as its label.
    """
    ratio = float(cluster_ratio)

    # Centers are sampled first, then the noise, so the order in which the
    # random sources are consumed is fixed.
    centers: np.ndarray = sample_uniform_sphere(
        num_clusters, dim, radius * (1 - 1 / ratio))  # shape=(num_clusters, dim)

    # NOTE(review): the per-coordinate scale uses sqrt(radius) rather than
    # radius; both agree for the default radius=1.0 -- confirm this is the
    # intended scaling for other radii.
    noise_scale = np.sqrt(radius) / (ratio * np.sqrt(dim))
    points: np.ndarray = np.random.normal(
        0, noise_scale, size=(num_points, dim))

    # Even split across clusters; the last cluster absorbs the remainder.
    base_size, leftover = divmod(num_points, num_clusters)
    cluster_sizes: np.ndarray = np.full(num_clusters, base_size, dtype=int)
    cluster_sizes[-1] += leftover

    # Point i belongs to cluster labels[i]; clusters occupy contiguous rows.
    labels = np.repeat(np.arange(num_clusters), cluster_sizes)

    # Row i of the offsets is the center of the cluster that owns point i.
    center_offsets: np.ndarray = np.repeat(centers, cluster_sizes, axis=0)
    points += center_offsets

    # Enforce the radius by scaling any points that are outside that range.
    unclipped = clustering_params.Data(points, radius, labels)
    return clustering_params.Data(
        unclipped.clip_by_radius(), unclipped.radius, unclipped.labels)
def test_default_tree_param(self, points, returned_private_count, k, epsilon,
                            expected_min_num_points_in_branching_node,
                            expected_min_num_points_in_node,
                            expected_max_depth, mock_gaussian_noise,
                            mock_private_count):
    """Checks the tree parameters and private count from default_tree_param.

    The private-count mock supplies returned_private_count; the test verifies
    the resulting max depth, how the noise and count mocks were called, and
    the node-size thresholds on the returned tree parameters.
    """
    num_dims = 10
    mock_private_count.return_value = returned_private_count

    dataset = clustering_params.Data(
        np.ones(shape=(points, num_dims)), radius=1.0)
    dp_param = clustering_params.DifferentialPrivacyParam(
        epsilon=epsilon, delta=1e-2)
    split = clustering_params.PrivacyBudgetSplit(
        frac_sum=0.8, frac_group_count=0.2)

    tree_param, private_count = default_clustering_params.default_tree_param(
        k, dataset, dp_param, split)

    self.assertEqual(tree_param.max_depth, expected_max_depth)

    # With infinite epsilon the Gaussian-noise mock must stay untouched;
    # otherwise it is called exactly once with the sum share of the budget.
    if epsilon != np.inf:
        expected_noise_param = common.DifferentialPrivacyParameters(
            0.8 * epsilon, 1e-2)
        mock_gaussian_noise.assert_called_once_with(
            expected_noise_param, 1, 1.0)
    else:
        mock_gaussian_noise.assert_not_called()

    # The count share of the budget is divided over max_depth + 1.
    expected_count_param = central_privacy_utils.CountPrivacyParam(
        epsilon=0.2 * epsilon / (tree_param.max_depth + 1), delta=1e-2)
    mock_private_count.assert_called_once_with(
        nonprivate_count=points, count_privacy_param=expected_count_param)

    self.assertEqual(private_count, returned_private_count)
    self.assertEqual(tree_param.min_num_points_in_node,
                     expected_min_num_points_in_node)
    self.assertEqual(tree_param.min_num_points_in_branching_node,
                     expected_min_num_points_in_branching_node)
def private_lsh_clustering(
        k: int,
        data: clustering_params.Data,
        privacy_param: clustering_params.DifferentialPrivacyParam,
        privacy_budget_split: typing.Optional[
            clustering_params.PrivacyBudgetSplit] = None,
        tree_param: typing.Optional[clustering_params.TreeParam] = None,
        short_description: str = "ClusteringParam") -> ClusteringResult:
    """Finds k differentially private cluster centers for the data.

    The data is clipped to its radius, summarized into a private coreset, and
    weighted k-means++ is run on that coreset to obtain the centers.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for. Centering the data around the origin
        beforehand may provide performance improvements.
      privacy_param: Differential privacy parameters.
      privacy_budget_split: Optional split of the privacy budget between the
        operations of the clustering algorithm, for fine-tuning. A default
        PrivacyBudgetSplit is used when omitted.
      tree_param: Optional parameters for generating the LSH net tree, for
        fine-tuning. Defaults are derived from the data when omitted.
      short_description: Optional description to identify this parameter
        configuration.

    Returns:
      ClusteringResult with differentially private centers. The rest of
      ClusteringResult is nonprivate, and only provided for convenience.
    """
    # Resolve the optional parameters without rebinding the arguments.
    budget_split = (
        clustering_params.PrivacyBudgetSplit()
        if privacy_budget_split is None else privacy_budget_split)

    if tree_param is None:
        # The private count computed while choosing the default tree
        # parameters is kept so it can be re-used for the root node.
        resolved_tree, root_private_count = (
            default_clustering_params.default_tree_param(
                k, data, privacy_param, budget_split))
    else:
        resolved_tree, root_private_count = tree_param, None

    clustering_param = clustering_params.ClusteringParam(
        privacy_param, budget_split, resolved_tree, short_description,
        data.radius)
    logging.debug("clustering_param: %s", clustering_param)

    # To guarantee privacy, enforce the radius provided.
    bounded_data = clustering_params.Data(
        data.clip_by_radius(), data.radius, data.labels)

    coreset: private_outputs.PrivateWeightedData = get_private_coreset(
        bounded_data, clustering_param, root_private_count)

    # k-means cannot be asked for more centers than there are coreset points.
    num_centers = min(k, len(coreset.datapoints))
    logging.debug(
        "Starting k-means++ computation on private coreset with k=%d. This may "
        "be less than the original if generated coreset data ended up with "
        "less than k unique points.", num_centers)
    model = sklearn.cluster.KMeans(n_clusters=num_centers, init="k-means++")
    model.fit(coreset.datapoints, sample_weight=coreset.weights)

    # The result is computed relative to the original (unclipped) data.
    # Note: everything besides the centers is nonprivate.
    return ClusteringResult(data, model.cluster_centers_)
def test_clustering_result_value_errors_unequal_points(self):
    """Expects ValueError when two labels are given for three datapoints."""
    three_points = np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]])
    data = clustering_params.Data(datapoints=three_points, radius=200)
    two_centers = np.array([[0, 0, 0], [1, 1, 1]])
    # One label fewer than there are datapoints.
    two_labels = np.array([0, 1], dtype=int)

    with self.assertRaises(ValueError):
        clustering_algorithm.ClusteringResult(
            data, two_centers, two_labels, loss=1.0)
def test_clustering_result_value_errors_loss_label_only_one_init(self):
    """Expects ValueError when only one of labels and loss is supplied."""
    data = clustering_params.Data(datapoints=np.zeros((4, 3)), radius=2)
    centers = np.zeros((2, 3))
    only_labels = np.array([0, 0, 1, 1], dtype=int)
    only_loss = 1.0

    # Labels without a loss.
    with self.assertRaises(ValueError):
        clustering_algorithm.ClusteringResult(data, centers, only_labels)

    # Loss without labels.
    with self.assertRaises(ValueError):
        clustering_algorithm.ClusteringResult(data, centers, loss=only_loss)
def test_value_error_no_true_labels(self):
    """Expects ValueError from the label-based metrics when Data has no labels."""
    radius = 1.0
    datapoints = np.zeros(shape=(6, 4))
    # No labels are passed to Data here.
    unlabelled = clustering_params.Data(datapoints, radius)
    result = clustering_algorithm.ClusteringResult(
        unlabelled,
        np.zeros(shape=(3, 4)),
        np.array([0, 0, 1, 1, 2, 2]),
        loss=1.0)

    with self.assertRaises(ValueError):
        result.cross_label_histogram()
    with self.assertRaises(ValueError):
        result.get_clustering_metrics()
def test_root_node_provide_private_count(self):
    """Checks the fields of a root node built with an explicit private count."""
    radius = 4.3
    raw_points = [[1, 2, 1], [0.4, 0.2, 0.8], [3, 0, 3]]
    param = test_utils.get_test_clustering_param(radius=radius, max_depth=20)

    root = lsh_tree.root_node(
        clustering_params.Data(raw_points, radius=radius),
        param,
        private_count=10)

    self.assertEqual(root.hash_prefix, '')
    self.assertSequenceEqual(root.nonprivate_points, raw_points)
    self.assertEqual(root.clustering_param, param)
    # The hash dimension matches the points; its length matches max_depth.
    self.assertEqual(root.sim_hash.dim, 3)
    self.assertEqual(root.sim_hash.max_hash_len, 20)
    # The supplied private count is stored on the node.
    self.assertEqual(root.private_count, 10)
def test_clustering_result_value_errors_labels_out_of_bounds(self):
    """Expects ValueError for negative, too-large, and non-integer labels."""
    data = clustering_params.Data(
        datapoints=np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]]),
        radius=200)
    centers = np.array([[0, 0, 0], [1, 1, 1]])

    negative_label = np.array([-1, 0, 1], dtype=int)
    # Only two centers exist, so index 2 has no matching center.
    label_past_last_center = np.array([0, 1, 2], dtype=int)
    fractional_label = np.array([0, 1, 1.1])

    invalid_label_sets = (
        negative_label, label_past_last_center, fractional_label)
    for bad_labels in invalid_label_sets:
        with self.assertRaises(ValueError):
            clustering_algorithm.ClusteringResult(
                data, centers, bad_labels, loss=1.0)
def test_clip_by_radius_default_to_self(self):
    """Checks clip_by_radius() called without arguments on radius-10 data."""
    data = clustering_params.Data(
        np.array([[0., 0., 0., 0.],
                  [1., 2., 3., 4.],
                  [5., 6., 7., 8.],
                  [9., 10., 11., 12.],
                  [13., 14., 15., 16.]]),
        radius=10.0)

    # The first two rows are expected unchanged; the last three are expected
    # scaled down.
    expected_rows = [
        [0., 0., 0., 0.],
        [1., 2., 3., 4.],
        [3.79049022, 4.54858826, 5.30668631, 6.06478435],
        [4.26162351, 4.73513724, 5.20865096, 5.68216469],
        [4.46949207, 4.81329915, 5.15710623, 5.50091331],
    ]

    clipped = data.clip_by_radius()

    self.assertLen(clipped, 5)
    for actual_row, expected_row in zip(clipped, expected_rows):
        self.assertSequenceAlmostEqual(actual_row, expected_row)
def test_get_clustering_result(self):
    """Checks the data, centers, labels and loss of a ClusteringResult.

    The result is built from data and centers only, so labels and loss are
    the values ClusteringResult computes itself.
    """
    centers = np.array([[0, 0, 0], [100, 100, 100]])
    datapoints = np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]])
    data = clustering_params.Data(datapoints=datapoints, radius=200)

    clustering_result = clustering_algorithm.ClusteringResult(data, centers)

    # Length checks target the result rather than the local fixtures;
    # otherwise a truncated result would slip through the element-wise loops.
    self.assertLen(clustering_result.data.datapoints, 3)
    for i, datapoint in enumerate(clustering_result.data.datapoints):
        self.assertSequenceAlmostEqual(datapoints[i], datapoint)

    self.assertLen(clustering_result.centers, 2)
    for i, center in enumerate(clustering_result.centers):
        self.assertSequenceAlmostEqual(centers[i], center)

    # Each point is labelled with its nearest center.
    self.assertListEqual(list(clustering_result.labels), [0, 1, 0])
    # Squared distances to the assigned centers: 2 + 3 + 32 = 37.
    self.assertAlmostEqual(clustering_result.loss, 37)
def test_clipped_data_used_for_clustering_and_not_result_calculation(self):
    """Centers come from clipped data; labels and loss use the original data."""
    # With radius=1 these clip to [[0.3, 0.2], [0.6, 0.8], [0.6, 0.8]].
    original_points = np.array([[0.3, 0.2], [3, 4], [6, 8]])
    # The radius is small enough that two of the points get clipped before
    # the center is computed.
    data = clustering_params.Data(datapoints=original_points, radius=1)

    # Infinite epsilon: no noise.
    noiseless = clustering_params.DifferentialPrivacyParam(np.inf)
    # No branching, so the coreset is just the average of the points.
    single_node_tree = clustering_params.TreeParam(1, 1, 0)

    result = clustering_algorithm.private_lsh_clustering(
        3, data, noiseless, tree_param=single_node_tree)

    # The single center is the mean of the clipped points.
    self.assertLen(result.centers, 1)
    self.assertSequenceAlmostEqual(result.centers[0], np.array([0.5, 0.6]))
    self.assertListEqual(list(result.labels), [0, 0, 0])
    # The loss is still measured against the original, unclipped points.
    self.assertAlmostEqual(result.loss, 103.02)
def test_get_clustering_metrics(self):
    """Checks every field of the metrics for a small labelled example."""
    radius = 1.0
    datapoints = np.zeros(shape=(6, 4))
    true_labels = np.array([0, 0, 0, 1, 1, 1])
    data = clustering_params.Data(datapoints, radius, true_labels)
    result = clustering_algorithm.ClusteringResult(
        data,
        np.zeros(shape=(3, 4)),
        np.array([0, 0, 1, 1, 2, 2]),
        loss=1.0)

    metrics = result.get_clustering_metrics()

    # Rows correspond to the three clusters, columns to the two true labels.
    expected_histogram = np.array([[2, 0], [1, 1], [0, 2]], dtype=int)
    histogram_matches = metrics.cross_label_histogram == expected_histogram
    self.assertTrue(histogram_matches.all())

    self.assertEqual(metrics.num_points, 6)
    self.assertEqual(metrics.dominant_label_correct_count, 5)
    self.assertAlmostEqual(metrics.dominant_label_accuracy, 5 / 6)

    self.assertEqual(metrics.true_pairs, 6)
    self.assertEqual(metrics.true_nonmatch_count, 4)
    self.assertAlmostEqual(metrics.true_nonmatch_frac, 4 / 6)

    self.assertEqual(metrics.false_pairs, 9)
    self.assertEqual(metrics.false_match_count, 1)
    self.assertAlmostEqual(metrics.false_match_frac, 1 / 9)
def test_small_dataset(self):
    """Clustering a dataset with a single point returns a non-None result."""
    single_point = np.array([[0.3, 0.2]])
    data = clustering_params.Data(datapoints=single_point, radius=1)

    result = clustering_algorithm.private_lsh_clustering(
        self.baseline_k, data, self.baseline_privacy_param)

    self.assertIsNotNone(result)
def test_data_label_unequal_length(self):
    """Expects ValueError when Data gets one label fewer than datapoints."""
    points = 10
    dim = 3
    datapoints = np.zeros(shape=(points, dim))
    too_few_labels = np.ones(points - 1, dtype=int)

    with self.assertRaises(ValueError):
        clustering_params.Data(datapoints, radius=1.0, labels=too_few_labels)
def test_data(self):
    """Checks num_points, dim and radius reported by a Data instance."""
    points = 10
    dim = 3
    all_ones = np.ones(shape=(points, dim))

    data = clustering_params.Data(all_ones, radius=1.0)

    self.assertEqual(data.num_points, points)
    self.assertEqual(data.dim, dim)
    self.assertEqual(data.radius, 1.0)