Example #1
0
    def test_synthetic_circles(self):
        """Compare KLines imputation with the k-means baseline on noisy circles.

        Generates points with ``make_circles``, removes entries with
        ``create_incomplete_matrix``, imputes them once with
        ``kmeans_missing`` and ``ITER`` times with ``customStreamer``, and
        asserts that the baseline MSE divided by the mean KLines MSE is
        greater than 0.5.
        """
        print('''
            two concentric circles
        ''')
        N = 10**3
        X, y = make_circles(n_samples=N, noise=1.0)
        k = len(np.unique(y))

        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        # Ground truth goes first: the metric's signature is
        # (labels_true, labels_pred); swapping them swaps the reported
        # homogeneity and completeness.
        score = metrics.homogeneity_completeness_v_measure(y, labels)
        print(f'sklearn mse: {sklearn_mse}')
        print(f'sklearn scores: {score}')

        # Displacement: the observed coordinates, with missing ones set to 0.
        displacements = np.nan_to_num(X_incomplete)

        # Span: 1 along every missing coordinate, 0 elsewhere.  Built from the
        # NaN mask so that observed values that happen to equal 0 or 1 are
        # not mistaken for missing entries.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 100  # coreset size ~ reduction ratio
        tau = 1e-2

        config.a_b_approx_minimum_number_of_lines = 100  # constant 100, line 2, algo 2 BI-CRITERIA
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m * 1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here
        # TODO: how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N)) // k
        # refer line 4, algo 1, other paper
        config.closest_to_median_rate = (1 - tau) / (2 * k)
        # size of q_i, line 3, algo 2, other paper
        config.median_sample_size = int(N * 0.05)
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50  # keep it < 100, works fast

        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = []
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores.append(
                metrics.homogeneity_completeness_v_measure(y, kl_labels))

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Klines scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.5
    def test_benchmark_Chainlink(self):
        """Compare KLines imputation with the k-means baseline on Chainlink.

        Loads ``X`` and ``y`` from ``data/Chainlink.npz``, removes entries
        with ``create_incomplete_matrix``, imputes them once with
        ``kmeans_missing`` and ``ITER`` times with ``customStreamer``, and
        asserts that the baseline MSE divided by the mean KLines MSE is
        greater than 0.8.
        """
        print('Clustering Chainlink.npz')
        npzfile = np.load('data/Chainlink.npz')
        X, y = npzfile['X'], npzfile['y']
        (N, _), k = X.shape, np.unique(y).shape[0]
        print(f'#Datapoints {N}')

        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        # Ground truth goes first: the metric's signature is
        # (labels_true, labels_pred); swapping them swaps the reported
        # homogeneity and completeness.
        score = metrics.homogeneity_completeness_v_measure(y, labels)
        print(f'MSE sklearn: {sklearn_mse}')
        print(f'MSE scores/measures: {score}')

        # Displacement: the observed coordinates, with missing ones set to 0.
        displacements = np.nan_to_num(X_incomplete)

        # Span: 1 along every missing coordinate, 0 elsewhere.  Built from the
        # NaN mask so that observed values that happen to equal 0 or 1 are
        # not mistaken for missing entries.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 60  # coreset size ~ reduction ratio
        tau = 1e-2

        # line 2, algo 2 BI-CRITERIA; the original note quotes a constant of
        # 100, this benchmark uses 40
        config.a_b_approx_minimum_number_of_lines = 40
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m * 1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here
        # TODO: how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N)) // k
        # refer line 4, algo 1, other paper
        config.closest_to_median_rate = (1 - tau) / (2 * k)
        # size of q_i, line 3, algo 2, other paper
        config.median_sample_size = int(N * 0.05)
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50

        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = []
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores.append(
                metrics.homogeneity_completeness_v_measure(y, kl_labels))

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.8
Example #3
0
    def test_speed(self):
        """Time ``CoresetForWeightedCenters.coreset`` on sampled road segments.

        Samples ``N`` segments from ``data/road_segments_china.npy``, builds a
        lines coreset with ``CorsetForKMeansForLines``, then times repeated
        ``CoresetForWeightedCenters.coreset`` calls on the resulting ``B`` and
        asserts the mean wall-clock time is below 10 seconds.  One further
        call is run under ``cProfile`` and its stats are printed.
        """
        import cProfile

        ## data
        k = 3
        N = int(1e3)
        m = int(N*0.07)  # coreset size ~ reduction ratio
        tau = 1e-3

        straight_roads = np.load('data/road_segments_china.npy')
        straight_roads = straight_roads[np.random.choice(straight_roads.shape[0], N, replace=False)]
        # Flatten each segment's two endpoints into one 4-value row.
        L = [[x[0][0], x[0][1], x[1][0], x[1][1]] for x in straight_roads]

        ## construct set of lines
        L = SetOfLines([], [], [], [], L, True)

        config = ParameterConfig()
        config.a_b_approx_minimum_number_of_lines = int(N*0.01)  # constant 100, line 2, algo 2 BI-CRITERIA
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m*1.01)
        config.farthest_to_centers_rate_in_a_b_approx = 4.0/11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here
        # TODO: how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N))//k
        config.closest_to_median_rate = (1-tau)/(2*k)  # refer line 4, algo 1, other paper
        config.median_sample_size = int(N*0.05)    # size of q_i, line 3, algo 2, other paper
        config.max_sensitivity_multiply_factor = 50  # for outliers in coresets

        coreset = CorsetForKMeansForLines(config)

        ## statistical analysis: mean and variance of the third element
        ## returned by coreset.coreset over ITER runs
        ITER = 2
        errors = np.array([coreset.coreset(L, k, m, True)[2] for _ in range(ITER)])
        print(f"mean: {errors.mean()}")
        print(f"var: {errors.var()}")

        ## more tau => more variance
        ## more max_sensitivity_multiply_factor => less variance
        ## kept median_sample_size small, ~5% of N, coz coresets candidate set progressively reduces

        # note size of B will be ~ O(log(n) * m^2)
        # and ofcourse its not K-center
        _, B, _ = coreset.coreset(L, k, m, True)
        # this is also `b`, line 1, algo 2, other paper
        # value copied from `recursive_robust_median` method
        config.number_of_remains = int(math.log(B.get_size()))

        cwc = CoresetForWeightedCenters(config)

        ### FOR TIME EVALUATION
        elapsed = []
        for _ in range(ITER):
            st = timeit.default_timer()
            cwc.coreset(B, k, m)
            elapsed.append(timeit.default_timer() - st)

        elapsed = np.array(elapsed)
        print(f"Mean time taken for {ITER} calls is {elapsed.mean()}s")

        assert elapsed.mean() < 10

        # Profile one extra call.  The finally clause guarantees the profiler
        # is switched off even if the call raises, so it cannot stay active
        # for whatever runs after this test.
        pr = cProfile.Profile()
        pr.enable()
        try:
            cwc.coreset(B, k, m)
        finally:
            pr.disable()
        pr.print_stats(sort='cumtime')
Example #4
0
#################################### Klines specific code


# Only the MASTER rank loads the road-segment file, keeps its first N_tot
# rows, and splits them into p chunks; every other rank contributes nothing
# to the scatter.
if rank == MASTER:
    sendbuf = np.load('road_segments_china.npy') # 8.7e5 entries
    sendbuf = sendbuf[:N_tot]
    sendbuf = np.array_split(sendbuf, p)
else:
    sendbuf = None
        
# distribute: each rank receives one element of MASTER's chunk list
# NOTE(review): presumably p equals the communicator size - confirm
recbuf = comm.scatter(sendbuf, root=MASTER)

# now every process (including MASTER) holds its own chunk, roughly N_tot/p
# segments; each segment's two endpoints are flattened into one 4-value row
L = [[x[0][0], x[0][1], x[1][0], x[1][1]] for x in recbuf]
L = SetOfLines([], [], [], [], L, True)


# define the streamer
SAMPLE_SIZE = 50
streamer = CoresetStreamer(SAMPLE_SIZE, N, k, config)

# The result is indexed below: element 0 exposes get_size(); elements 1 and 2
# are presumably start and end timestamps, since their difference is printed
# as the time taken - verify against CoresetStreamer.stream.
coreset = streamer.stream(L)
print(f"PID {rank}. Time taken: {coreset[2]-coreset[1]}s. Lines size: {coreset[0].get_size()}")

# now MASTER gathers a list of coresets (one entry per rank)
lines = comm.gather(coreset[0], root=MASTER)

# Only MASTER builds the two coreset objects from the shared config.
if rank == MASTER:
    ckl = CorsetForKMeansForLines(config)
    cwc = CoresetForWeightedCenters(config)
Example #5
0
    def test_scores(self):
        """Compare KLines imputation with the k-means baseline on iris.

        Removes entries from the iris data with ``create_incomplete_matrix``,
        imputes them once with ``kmeans_missing`` and ``ITER`` times with the
        KLines coreset pipeline, and asserts that the baseline MSE divided by
        the mean KLines MSE is below 0.6.
        """
        data = load_iris()
        X = data.data
        y = data.target
        k = len(np.unique(y))

        # X is the complete data matrix; X_incomplete has the same values
        # except that a subset has been replaced with NaN.
        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        # Ground truth goes first: the metric's signature is
        # (labels_true, labels_pred).
        sklearn_scores = metrics.homogeneity_completeness_v_measure(y, labels)
        klines_mse_sklearn = ((X - X_hat)**2).mean()
        print(f'sklearn mse: {klines_mse_sklearn}')
        print(f'sklearn scores: {sklearn_scores}')

        ## Clustering using KLines
        # Displacement: the observed coordinates, with missing ones set to 0.
        displacements = np.nan_to_num(X_incomplete)

        N, _ = X_incomplete.shape
        # Span: 1 along every missing coordinate, 0 elsewhere.  Built from the
        # NaN mask so that observed values equal to 0 or 1 (iris contains
        # measurements of exactly 1.0) are not mistaken for missing entries.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

        config = ParameterConfig()

        ## data
        m = min(int(N*0.1), 100)  # coreset size ~ reduction ratio
        tau = 1e-3

        config.a_b_approx_minimum_number_of_lines = min(int(N*0.1), 100)  # constant 100, line 2, algo 2 BI-CRITERIA
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m*1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 0.25  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here
        # TODO: how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N))//k
        config.closest_to_median_rate = (1-tau)/(2*k)  # refer line 4, algo 1, other paper
        config.median_sample_size = int(N*0.05)    # size of q_i, line 3, algo 2, other paper
        config.max_sensitivity_multiply_factor = 2  # for outliers in coresets
        config.number_of_remains = min(int(N*0.05), 20)

        def util():
            """Run one KLines pass; return (projected centers, cluster labels)."""
            _, B, _ = CorsetForKMeansForLines(config).coreset(L, k, m, True)

            # Reduce the coreset of random centers: at most 5 rounds, stopping
            # early once no more than k centers remain.
            cwc = CoresetForWeightedCenters(config)
            for _ in range(5):
                B = cwc.coreset(B, k, m)
                if B.get_size() <= k:
                    break

            X_klines = L.get_projected_centers(B)
            kl_labels = L.get_indices_clusters(B)
            return X_klines, kl_labels

        klines_mse = []
        scores = []
        ITER = 10
        for _ in range(ITER):
            X_klines, kl_labels = util()
            klines_mse.append(((X - X_klines)**2).mean())
            scores.append(metrics.homogeneity_completeness_v_measure(y, kl_labels))

        print(f"Klines MSE: {np.array(klines_mse).mean()}")
        print(f"Klines scores: {np.array(scores).mean(axis=0)}")

        assert klines_mse_sklearn/np.array(klines_mse).mean() < 0.6