def test_synthetic_circles(self):
    """Compare k-lines clustering with ``kmeans_missing`` on noisy circles.

    Generates ``N`` points with ``make_circles``, knocks out entries with
    ``create_incomplete_matrix``, reconstructs the data once with
    ``kmeans_missing`` and ``ITER`` times with ``customStreamer``, prints the
    reconstruction MSE and clustering scores of both, and asserts that
    ``sklearn_mse / mean(klines_mse)`` is greater than 0.5.
    """
    print(''' two concentric circles ''')
    N = 10**3
    X, y = make_circles(n_samples=N, noise=1.0)
    k = len(np.unique(y))

    # Baseline: k-means on the incomplete matrix.
    X_incomplete = create_incomplete_matrix(X)
    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    # scikit-learn's signature is (labels_true, labels_pred); passing them the
    # other way round swaps the reported homogeneity and completeness.
    score = metrics.homogeneity_completeness_v_measure(y, labels)
    print(f'sklearn mse: {sklearn_mse}')
    print(f'sklearn scores: {score}')

    # Each incomplete point becomes a line: observed values (NaN -> 0) are the
    # displacement, and the span has a 1 in every missing coordinate.
    # The mask is taken from isnan directly: deriving it from the zero-filled
    # values would flag observed 0.0 entries as missing and leave observed
    # 1.0 entries flagged as well.
    # NOTE(review): assumes SetOfLines expects a 0/1 span mask - confirm.
    displacements = np.nan_to_num(X_incomplete)
    spans = np.isnan(X_incomplete).astype(displacements.dtype)
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    m = 100  # coreset size ~ reduction ratio
    tau = 1e-2

    # constant 100, line 2, algo 2 BI-CRITERIA
    config.a_b_approx_minimum_number_of_lines = 100
    # |S| >= m, line 3 of algo 2
    # note: there'll be a O(|S|^2) cost while computing algo 1
    config.sample_size_for_a_b_approx = int(m * 1.05)
    # opp of 7/11, line 6, algo 2 BI-CRITERIA
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11
    # this is `b` in algo 2, other paper, set as random here - how to
    # calculate it?
    config.number_of_remains_multiply_factor = int(math.log(N)) // k
    # refer line 4, algo 1, other paper
    config.closest_to_median_rate = (1 - tau) / (2 * k)
    # size of q_i, line 3, algo 2, other paper
    config.median_sample_size = int(N * 0.05)
    # for outliers in coresets
    config.max_sensitivity_multiply_factor = 100
    config.number_of_remains = 20

    SAMPLE_SIZE = 50  # keep it < 100, works fast

    ITER = 5
    klines_mse = np.zeros(ITER)
    scores = []
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores.append(
            metrics.homogeneity_completeness_v_measure(y, kl_labels))

    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Klines scores: {np.array(scores).mean(axis=0)}")

    assert sklearn_mse / klines_mse.mean() > 0.5
def test_benchmark_Chainlink(self):
    """Compare k-lines clustering with ``kmeans_missing`` on Chainlink.

    Loads ``X`` and ``y`` from ``data/Chainlink.npz``, knocks out entries with
    ``create_incomplete_matrix``, reconstructs the data once with
    ``kmeans_missing`` and ``ITER`` times with ``customStreamer``, prints the
    reconstruction MSE and clustering scores of both, and asserts that
    ``sklearn_mse / mean(klines_mse)`` is greater than 0.8.
    """
    print('Clustering Chainlink.npz')
    # An .npz archive keeps a file handle open until closed; indexing reads
    # the arrays eagerly, so they remain valid after the with-block exits.
    with np.load('data/Chainlink.npz') as npzfile:
        X, y = npzfile['X'], npzfile['y']
    (N, _), k = X.shape, np.unique(y).shape[0]
    print(f'#Datapoints {N}')

    # Baseline: k-means on the incomplete matrix.
    X_incomplete = create_incomplete_matrix(X)
    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    # scikit-learn's signature is (labels_true, labels_pred); passing them the
    # other way round swaps the reported homogeneity and completeness.
    score = metrics.homogeneity_completeness_v_measure(y, labels)
    print(f'MSE sklearn: {sklearn_mse}')
    print(f'MSE scores/measures: {score}')

    # Each incomplete point becomes a line: observed values (NaN -> 0) are the
    # displacement, and the span has a 1 in every missing coordinate.
    # The mask is taken from isnan directly: deriving it from the zero-filled
    # values would flag observed 0.0 entries as missing and leave observed
    # 1.0 entries flagged as well.
    # NOTE(review): assumes SetOfLines expects a 0/1 span mask - confirm.
    displacements = np.nan_to_num(X_incomplete)
    spans = np.isnan(X_incomplete).astype(displacements.dtype)
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    m = 60  # coreset size ~ reduction ratio
    tau = 1e-2

    # constant 100, line 2, algo 2 BI-CRITERIA (original note; 40 is used here)
    config.a_b_approx_minimum_number_of_lines = 40
    # |S| >= m, line 3 of algo 2
    # note: there'll be a O(|S|^2) cost while computing algo 1
    config.sample_size_for_a_b_approx = int(m * 1.05)
    # opp of 7/11, line 6, algo 2 BI-CRITERIA
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11
    # this is `b` in algo 2, other paper, set as random here - how to
    # calculate it?
    config.number_of_remains_multiply_factor = int(math.log(N)) // k
    # refer line 4, algo 1, other paper
    config.closest_to_median_rate = (1 - tau) / (2 * k)
    # size of q_i, line 3, algo 2, other paper
    config.median_sample_size = int(N * 0.05)
    # for outliers in coresets
    config.max_sensitivity_multiply_factor = 100
    config.number_of_remains = 20

    SAMPLE_SIZE = 50

    ITER = 5
    klines_mse = np.zeros(ITER)
    scores = []
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores.append(
            metrics.homogeneity_completeness_v_measure(y, kl_labels))

    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Scores: {np.array(scores).mean(axis=0)}")

    assert sklearn_mse / klines_mse.mean() > 0.8
def test_scores(self):
    """Compare k-lines coreset clustering with ``kmeans_missing`` on iris.

    Knocks entries out of the iris data with ``create_incomplete_matrix``,
    reconstructs it once with ``kmeans_missing`` and ``ITER`` times through
    the coreset pipeline, and asserts that
    ``klines_mse_sklearn / mean(klines_mse)`` is less than 0.6.

    NOTE(review): the inequality means the baseline's error must stay below
    0.6 of the k-lines error for the test to pass - confirm this direction
    is intended.
    """
    data = load_iris()
    X = data.data
    y = data.target
    k = len(np.unique(y))

    # X is the complete data matrix.
    # X_incomplete has the same values as X except a subset have been
    # replaced with NaN.
    X_incomplete = create_incomplete_matrix(X)
    _, _, X_hat = kmeans_missing(X_incomplete, k)
    klines_mse_sklearn = ((X - X_hat)**2).mean()

    # ## Clustering using KLines
    # Each incomplete point becomes a line: observed values (NaN -> 0) are the
    # displacement, and the span has a 1 in every missing coordinate.
    # The mask is taken from isnan directly: deriving it from the zero-filled
    # values leaves observed entries equal to 1.0 (which iris contains)
    # flagged as missing, and would also flag observed 0.0 entries.
    # NOTE(review): assumes SetOfLines expects a 0/1 span mask - confirm.
    displacements = np.nan_to_num(X_incomplete)
    N, _ = X_incomplete.shape
    spans = np.isnan(X_incomplete).astype(displacements.dtype)
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    m = min(int(N * 0.1), 100)  # coreset size ~ reduction ratio
    tau = 1e-3

    # constant 100, line 2, algo 2 BI-CRITERIA
    config.a_b_approx_minimum_number_of_lines = min(int(N * 0.1), 100)
    # |S| >= m, line 3 of algo 2
    # note: there'll be a O(|S|^2) cost while computing algo 1
    config.sample_size_for_a_b_approx = int(m * 1.05)
    # opp of 7/11, line 6, algo 2 BI-CRITERIA
    config.farthest_to_centers_rate_in_a_b_approx = 0.25
    # this is `b` in algo 2, other paper, set as random here - how to
    # calculate it?
    config.number_of_remains_multiply_factor = int(math.log(N)) // k
    # refer line 4, algo 1, other paper
    config.closest_to_median_rate = (1 - tau) / (2 * k)
    # size of q_i, line 3, algo 2, other paper
    config.median_sample_size = int(N * 0.05)
    # for outliers in coresets
    config.max_sensitivity_multiply_factor = 2
    config.number_of_remains = min(int(N * 0.05), 20)

    def util():
        """Run one coreset pass and return (projected points, cluster labels)."""
        _, B, _ = CorsetForKMeansForLines(config).coreset(L, k, m, True)

        # Reduce the coreset of random centers: at most five rounds, stopping
        # early once no more than k centers remain.
        cwc = CoresetForWeightedCenters(config)
        for _ in range(5):
            B = cwc.coreset(B, k, m)
            if B.get_size() <= k:
                break

        X_klines = L.get_projected_centers(B)
        kl_labels = L.get_indices_clusters(B)
        return X_klines, kl_labels

    klines_mse = []
    # Scores are collected for inspection only; nothing asserts on them.
    scores = []
    ITER = 10
    for _ in range(ITER):
        X_klines, kl_labels = util()
        klines_mse.append(((X - X_klines)**2).mean())
        # scikit-learn's signature is (labels_true, labels_pred).
        scores.append(
            metrics.homogeneity_completeness_v_measure(y, kl_labels))

    assert klines_mse_sklearn / np.array(klines_mse).mean() < 0.6