def test_synthetic_circles(self):
    print('''
    two concentric circles
    ''')
    N = 10**3
    X, y = make_circles(n_samples=N, noise=1.0)
    k = len(np.unique(y))
    X_incomplete = create_incomplete_matrix(X)

    # sklearn baseline: k-means on the imputed matrix
    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    score = metrics.homogeneity_completeness_v_measure(labels, y)
    print(f'sklearn mse: {sklearn_mse}')
    print(f'sklearn scores: {score}')

    # encode each incomplete point as a line: missing coordinates span the line,
    # observed coordinates fix its displacement
    displacements = np.nan_to_num(X_incomplete)
    spans = np.nan_to_num(X_incomplete)
    spans[spans == 0] = 1
    spans[spans != 1] = 0
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    m = 100  # coreset size ~ reduction ratio
    tau = 1e-2

    config.a_b_approx_minimum_number_of_lines = 100  # constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.05)  # |S| >= m, line 3 of algo 2
    # note: there'll be an O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # complement of 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper; chosen heuristically here
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # see line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
    config.number_of_remains = 20

    SAMPLE_SIZE = 50  # keep it < 100, works fast
    ITER = 5
    klines_mse = np.zeros(ITER)
    scores = [[]] * ITER
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores[i] = metrics.homogeneity_completeness_v_measure(kl_labels, y)

    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Klines scores: {np.array(scores).mean(axis=0)}")
    assert sklearn_mse / klines_mse.mean() > 0.5
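# The helpers create_incomplete_matrix and kmeans_missing are used by the tests in this
# file but are not defined here. A minimal sketch of plausible implementations follows;
# the missing-value fraction, the seed handling, and the impute-and-recluster loop are
# assumptions, not the repository's actual code.
from sklearn.cluster import KMeans

def create_incomplete_matrix(X, missing_rate=0.2, seed=None):
    """Return a float copy of X with a random subset of entries replaced by NaN."""
    rng = np.random.default_rng(seed)
    X_incomplete = X.astype(float).copy()
    X_incomplete[rng.random(X.shape) < missing_rate] = np.nan
    return X_incomplete

def kmeans_missing(X_incomplete, n_clusters, max_iter=10):
    """Cluster data with missing entries by alternating imputation and k-means."""
    missing = np.isnan(X_incomplete)
    # start from the column means, then re-impute from the assigned centroids
    X_hat = np.where(missing, np.nanmean(X_incomplete, axis=0), X_incomplete)
    for _ in range(max_iter):
        km = KMeans(n_clusters=n_clusters, n_init=10).fit(X_hat)
        labels = km.labels_
        X_hat[missing] = km.cluster_centers_[labels][missing]
    return labels, km.cluster_centers_, X_hat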
def test_benchmark_Chainlink(self):
    print('Clustering Chainlink.npz')
    npzfile = np.load('data/Chainlink.npz')
    X, y = npzfile['X'], npzfile['y']
    N, _ = X.shape
    k = np.unique(y).shape[0]
    print(f'#Datapoints {N}')
    X_incomplete = create_incomplete_matrix(X)

    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    sklearn_mse = ((X - X_hat)**2).mean()
    score = metrics.homogeneity_completeness_v_measure(labels, y)
    print(f'sklearn MSE: {sklearn_mse}')
    print(f'sklearn scores/measures: {score}')

    displacements = np.nan_to_num(X_incomplete)
    spans = np.nan_to_num(X_incomplete)
    spans[spans == 0] = 1
    spans[spans != 1] = 0
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    m = 60  # coreset size ~ reduction ratio
    tau = 1e-2

    config.a_b_approx_minimum_number_of_lines = 40  # cf. constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.05)  # |S| >= m, line 3 of algo 2
    # note: there'll be an O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # complement of 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper; chosen heuristically here
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # see line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
    config.number_of_remains = 20

    SAMPLE_SIZE = 50
    ITER = 5
    klines_mse = np.zeros(ITER)
    scores = [[]] * ITER
    for i in range(ITER):
        print(f'Running KLines iter {i+1} of {ITER}')
        X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
        klines_mse[i] = ((X - X_klines)**2).mean()
        scores[i] = metrics.homogeneity_completeness_v_measure(kl_labels, y)

    print(f"Klines MSE: {klines_mse.mean()}")
    print(f"Scores: {np.array(scores).mean(axis=0)}")
    assert sklearn_mse / klines_mse.mean() > 0.8
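# customStreamer is called by the two tests above but is not defined in this file. A
# plausible sketch follows, assuming it streams the lines into a coreset and then applies
# the same two-stage reduction as util() in test_scores below; the actual helper in the
# repository may differ.
def customStreamer(L, k, m, sample_size, config, max_iter=5):
    """Stream L into a coreset of lines, reduce it to ~k weighted centers,
    and return the projected centers and cluster labels for L."""
    streamer = CoresetStreamer(sample_size, L.get_size(), k, config)
    core_lines = streamer.stream(L)[0]  # the streamed coreset of lines
    # bi-criteria approximation: a set B of weighted candidate centers
    _, B, _ = CorsetForKMeansForLines(config).coreset(core_lines, k, m, True)
    # shrink B towards k weighted centers
    cwc = CoresetForWeightedCenters(config)
    for _ in range(max_iter):
        B = cwc.coreset(B, k, m)
        if B.get_size() <= k:
            break
    # project the original lines onto the chosen centers and read off labels
    return L.get_projected_centers(B), L.get_indices_clusters(B)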
def test_speed(self):
    ## data
    k = 3
    N = int(1e3)
    m = int(N * 0.07)  # coreset size ~ reduction ratio
    tau = 1e-3

    straight_roads = np.load('data/road_segments_china.npy')
    straight_roads = straight_roads[np.random.choice(straight_roads.shape[0], N, replace=False)]
    L = [[x[0][0], x[0][1], x[1][0], x[1][1]] for x in straight_roads]

    ## construct the set of lines from the segment endpoints
    L = SetOfLines([], [], [], [], L, True)

    config = ParameterConfig()
    config.a_b_approx_minimum_number_of_lines = int(N * 0.01)  # cf. constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.01)  # |S| >= m, line 3 of algo 2
    # note: there'll be an O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 4.0 / 11  # complement of 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper; chosen heuristically here
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # see line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 50  # for outliers in coresets

    coreset = CorsetForKMeansForLines(config)

    ## statistical analysis
    ### MEAN AND VAR EVALUATION
    ITER = 2
    errors = np.array([coreset.coreset(L, k, m, True)[2] for _ in range(ITER)])
    print(f"mean: {errors.mean()}")
    print(f"var: {errors.var()}")
    ## larger tau => more variance
    ## larger max_sensitivity_multiply_factor => less variance
    ## median_sample_size is kept small, ~5% of N, because the coreset candidate set shrinks progressively

    # note: the size of B will be ~ O(log(n) * m^2),
    # and of course it is not a k-center solution
    _, B, _ = coreset.coreset(L, k, m, True)

    # this is also `b`, line 1, algo 2, other paper;
    # value copied from the `recursive_robust_median` method
    config.number_of_remains = int(math.log(B.get_size()))
    cwc = CoresetForWeightedCenters(config)

    ### TIME EVALUATION
    X = []
    ITER = 2
    for _ in range(ITER):
        st = timeit.default_timer()
        cwc.coreset(B, k, m)
        X.append(timeit.default_timer() - st)
    X = np.array(X)
    print(f"Mean time taken for {ITER} calls is {X.mean()}s")
    assert X.mean() < 10

    import cProfile
    pr = cProfile.Profile()
    pr.enable()
    cwc.coreset(B, k, m)
    pr.disable()
    pr.print_stats(sort='cumtime')
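# For reference, the derived configuration values in test_speed at N = 1000, k = 3,
# m = 70, tau = 1e-3 (worked out from the formulas above):
#   a_b_approx_minimum_number_of_lines = int(1000 * 0.01)    = 10
#   sample_size_for_a_b_approx         = int(70 * 1.01)      = 70
#   number_of_remains_multiply_factor  = int(log(1000)) // 3 = 2
#   closest_to_median_rate             = (1 - 1e-3) / 6      ≈ 0.1665
#   median_sample_size                 = int(1000 * 0.05)    = 50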
#################################### Klines specific code
if rank == MASTER:
    sendbuf = np.load('road_segments_china.npy')  # 8.7e5 entries
    sendbuf = sendbuf[:N_tot]
    sendbuf = np.array_split(sendbuf, p)
else:
    sendbuf = None

# distribute
recbuf = comm.scatter(sendbuf, root=MASTER)

# now every process (including MASTER) has an equal-sized L ~ N/p
L = [[x[0][0], x[0][1], x[1][0], x[1][1]] for x in recbuf]
L = SetOfLines([], [], [], [], L, True)

# define the streamer
SAMPLE_SIZE = 50
streamer = CoresetStreamer(SAMPLE_SIZE, N, k, config)
coreset = streamer.stream(L)
print(f"PID {rank}. Time taken: {coreset[2]-coreset[1]}s. Lines size: {coreset[0].get_size()}")

# now MASTER gathers a list of coresets
lines = comm.gather(coreset[0], root=MASTER)

if rank == MASTER:
    ckl = CorsetForKMeansForLines(config)
    cwc = CoresetForWeightedCenters(config)
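    # Sketch of the remaining MASTER-side reduction (an assumption, not the repository's
    # actual code): merge the gathered per-process coresets into one SetOfLines and run
    # the same two-stage reduction as in the serial tests. `add_set_of_lines` is a
    # hypothetical merge API, and `m` is an assumed final coreset size.
    m = 100  # assumed final coreset size (not defined in this fragment)
    merged_lines = lines[0]
    for other in lines[1:]:
        merged_lines.add_set_of_lines(other)  # hypothetical: SetOfLines may expose a different merge method
    _, B, _ = ckl.coreset(merged_lines, k, m, True)
    B = cwc.coreset(B, k, m)
    print(f"Final number of weighted centers: {B.get_size()}")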
def test_scores(self):
    data = load_iris()
    X = data.data
    y = data.target
    k = len(np.unique(y))

    # X is the complete data matrix;
    # X_incomplete has the same values as X, except that a subset has been replaced with NaN
    X_incomplete = create_incomplete_matrix(X)

    labels, _, X_hat = kmeans_missing(X_incomplete, k)
    metrics.homogeneity_completeness_v_measure(labels, y)
    sklearn_mse = ((X - X_hat)**2).mean()

    ## Clustering using KLines
    displacements = np.nan_to_num(X_incomplete)
    N, _ = X_incomplete.shape
    spans = np.nan_to_num(X_incomplete)
    spans[spans == 0] = 1
    spans[spans != 1] = 0
    L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

    config = ParameterConfig()

    ## data
    k = 3
    m = min(int(N * 0.1), 100)  # coreset size ~ reduction ratio
    tau = 1e-3

    config.a_b_approx_minimum_number_of_lines = min(int(N * 0.1), 100)  # cf. constant 100, line 2, algo 2 BI-CRITERIA
    config.sample_size_for_a_b_approx = int(m * 1.05)  # |S| >= m, line 3 of algo 2
    # note: there'll be an O(|S|^2) cost while computing algo 1
    config.farthest_to_centers_rate_in_a_b_approx = 0.25  # cf. 7/11, line 6, algo 2 BI-CRITERIA
    config.number_of_remains_multiply_factor = int(math.log(N)) // k  # this is `b` in algo 2, other paper; chosen heuristically here
    config.closest_to_median_rate = (1 - tau) / (2 * k)  # see line 4, algo 1, other paper
    config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
    config.max_sensitivity_multiply_factor = 2  # for outliers in coresets
    config.number_of_remains = min(int(N * 0.05), 20)

    def util():
        _, B, _ = CorsetForKMeansForLines(config).coreset(L, k, m, True)
        # reduce the coreset of candidate centers
        cwc = CoresetForWeightedCenters(config)
        MAX_ITER = 5
        while MAX_ITER > 0:
            B = cwc.coreset(B, k, m)
            if B.get_size() <= k:
                MAX_ITER = 0
            MAX_ITER -= 1
        X_klines = L.get_projected_centers(B)
        kl_labels = L.get_indices_clusters(B)
        return X_klines, kl_labels

    klines_mse = []
    scores = []
    ITER = 10
    for i in range(ITER):
        X_klines, kl_labels = util()
        klines_mse.append(((X - X_klines)**2).mean())
        scores.append(metrics.homogeneity_completeness_v_measure(kl_labels, y))

    assert sklearn_mse / np.array(klines_mse).mean() < 0.6
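    # Minimal usage sketch (illustrative addition; X_imputed is not used elsewhere in the
    # repository): the projected centers can also fill in the NaN entries directly.
    X_imputed = np.where(np.isnan(X_incomplete), X_klines, X_incomplete)
    print(f"KLines imputation MSE: {((X - X_imputed)**2).mean()}")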