Esempio n. 1
0
    def test_synthetic_circles(self):
        """Compare k-lines clustering with the ``kmeans_missing`` baseline on noisy circles.

        A synthetic two-circle data set is generated, entries are knocked out
        with ``create_incomplete_matrix``, and the reconstruction error of
        ``kmeans_missing`` is compared with the mean reconstruction error of
        several ``customStreamer`` runs. The test passes when the baseline MSE
        is more than half of the mean k-lines MSE.
        """
        print('''
            two concentric circles
        ''')
        N = 10**3
        X, y = make_circles(n_samples=N, noise=1.0)
        k = len(np.unique(y))

        # Baseline: k-means that tolerates missing entries.
        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        # Ground truth goes first (labels_true, labels_pred); with the
        # arguments swapped, homogeneity and completeness trade places.
        # NOTE(review): assumes `metrics` is sklearn.metrics -- confirm.
        score = metrics.homogeneity_completeness_v_measure(y, labels)
        print(f'sklearn mse: {sklearn_mse}')
        print(f'sklearn scores: {score}')

        # Each row becomes a line: known coordinates are the displacement
        # (missing ones zero-filled) and the span is 1 along missing axes.
        displacements = np.nan_to_num(X_incomplete)
        # Build the mask from the NaNs themselves. Deriving it from the
        # zero-filled matrix would also flag genuine 0.0 / 1.0 observations.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 100  # coreset size ~ reduction ratio
        tau = 1e-2

        config.a_b_approx_minimum_number_of_lines = 100  # constant 100, line 2, algo 2 BI-CRITERIA
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m * 1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here -  how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N)) // k
        # refer line 4, algo 1, other paper
        config.closest_to_median_rate = (1 - tau) / (2 * k)
        # size of q_i, line 3, algo 2, other paper
        config.median_sample_size = int(N * 0.05)
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50  # keep it < 100, works fast

        # Average over several runs of customStreamer.
        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = []
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores.append(
                metrics.homogeneity_completeness_v_measure(y, kl_labels))

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Klines scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.5
    def test_benchmark_Chainlink(self):
        """Compare k-lines clustering with ``kmeans_missing`` on the Chainlink data.

        Loads ``data/Chainlink.npz`` (arrays ``X`` and ``y``), removes entries
        with ``create_incomplete_matrix``, and compares the reconstruction
        error of ``kmeans_missing`` with the mean reconstruction error of
        several ``customStreamer`` runs. The test passes when the baseline MSE
        is more than 0.8 of the mean k-lines MSE.
        """
        print('Clustering Chainlink.npz')
        npzfile = np.load('data/Chainlink.npz')
        X, y = npzfile['X'], npzfile['y']
        N = X.shape[0]
        k = np.unique(y).shape[0]
        print(f'#Datapoints {N}')

        # Baseline: k-means that tolerates missing entries.
        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        sklearn_mse = ((X - X_hat)**2).mean()
        # Ground truth goes first (labels_true, labels_pred); with the
        # arguments swapped, homogeneity and completeness trade places.
        # NOTE(review): assumes `metrics` is sklearn.metrics -- confirm.
        score = metrics.homogeneity_completeness_v_measure(y, labels)
        print(f'MSE sklearn: {sklearn_mse}')
        print(f'MSE scores/measures: {score}')

        # Each row becomes a line: known coordinates are the displacement
        # (missing ones zero-filled) and the span is 1 along missing axes.
        displacements = np.nan_to_num(X_incomplete)
        # Build the mask from the NaNs themselves. Deriving it from the
        # zero-filled matrix would also flag genuine 0.0 / 1.0 observations.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))
        config = ParameterConfig()

        ## data
        m = 60  # coreset size ~ reduction ratio
        tau = 1e-2

        # line 2, algo 2 BI-CRITERIA (the original note mentions a constant
        # of 100; this benchmark uses 40)
        config.a_b_approx_minimum_number_of_lines = 40
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m * 1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 4 / 11  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here -  how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N)) // k
        # refer line 4, algo 1, other paper
        config.closest_to_median_rate = (1 - tau) / (2 * k)
        # size of q_i, line 3, algo 2, other paper
        config.median_sample_size = int(N * 0.05)
        config.max_sensitivity_multiply_factor = 100  # for outliers in coresets
        config.number_of_remains = 20

        SAMPLE_SIZE = 50

        # Average over several runs of customStreamer.
        ITER = 5
        klines_mse = np.zeros(ITER)
        scores = []
        for i in range(ITER):
            print(f'Running KLines iter {i+1} of {ITER}')
            X_klines, kl_labels = customStreamer(L, k, m, SAMPLE_SIZE, config)
            klines_mse[i] = ((X - X_klines)**2).mean()
            scores.append(
                metrics.homogeneity_completeness_v_measure(y, kl_labels))

        print(f"Klines MSE: {klines_mse.mean()}")
        print(f"Scores: {np.array(scores).mean(axis=0)}")

        assert sklearn_mse / klines_mse.mean() > 0.8
Esempio n. 3
0
    def test_scores(self):
        """Compare k-lines reconstruction with ``kmeans_missing`` on the Iris data.

        Entries of the Iris matrix are removed with
        ``create_incomplete_matrix``. The reconstruction error of
        ``kmeans_missing`` is then compared with the mean error of ten
        coreset-based k-lines runs. The test passes when the baseline MSE is
        below 0.6 of the mean k-lines MSE.
        """
        data = load_iris()
        X = data.data
        y = data.target
        k = len(np.unique(y))  # number of clusters = number of distinct labels

        # X is the complete data matrix
        # X_incomplete has the same values as X except a subset have been replaced with NaN
        X_incomplete = create_incomplete_matrix(X)
        labels, _, X_hat = kmeans_missing(X_incomplete, k)

        # Smoke check only: the result is not asserted on. Ground truth goes
        # first (labels_true, labels_pred).
        # NOTE(review): assumes `metrics` is sklearn.metrics -- confirm.
        metrics.homogeneity_completeness_v_measure(y, labels)

        sklearn_mse = ((X - X_hat)**2).mean()

        # ## Clustering using KLines
        # Each row becomes a line: known coordinates are the displacement
        # (missing ones zero-filled) and the span is 1 along missing axes.
        displacements = np.nan_to_num(X_incomplete)
        N = X_incomplete.shape[0]
        # Build the mask from the NaNs themselves. Deriving it from the
        # zero-filled matrix would also flag genuine 0.0 / 1.0 measurements
        # as missing.
        spans = np.isnan(X_incomplete).astype(displacements.dtype)

        L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

        config = ParameterConfig()

        ## data
        m = min(int(N * 0.1), 100)  # coreset size ~ reduction ratio
        tau = 1e-3

        config.a_b_approx_minimum_number_of_lines = min(int(N * 0.1), 100)  # constant 100, line 2, algo 2 BI-CRITERIA
        # |S| >= m, line 3 of algo 2
        # note: there'll be a O(|S|^2) cost while computing algo 1
        config.sample_size_for_a_b_approx = int(m * 1.05)
        config.farthest_to_centers_rate_in_a_b_approx = 0.25  # opp of 7/11, line 6, algo 2 BI-CRITERIA
        # this is `b` in algo 2, other paper, set as random here -  how to calculate it?
        config.number_of_remains_multiply_factor = int(math.log(N)) // k
        config.closest_to_median_rate = (1 - tau) / (2 * k)  # refer line 4, algo 1, other paper
        config.median_sample_size = int(N * 0.05)  # size of q_i, line 3, algo 2, other paper
        config.max_sensitivity_multiply_factor = 2  # for outliers in coresets
        config.number_of_remains = min(int(N * 0.05), 20)

        def util():
            """Run one k-lines pass and return (projected centers, cluster labels)."""
            _, B, _ = CorsetForKMeansForLines(config).coreset(L, k, m, True)

            # reduce the coreset of random centers: at most five reduction
            # passes, stopping early once no more than k centers remain
            cwc = CoresetForWeightedCenters(config)
            for _ in range(5):
                B = cwc.coreset(B, k, m)
                if B.get_size() <= k:
                    break

            return L.get_projected_centers(B), L.get_indices_clusters(B)

        klines_mse = []
        scores = []  # collected per run; not asserted on
        ITER = 10
        for _ in range(ITER):
            X_klines, kl_labels = util()
            klines_mse.append(((X - X_klines)**2).mean())
            scores.append(metrics.homogeneity_completeness_v_measure(y, kl_labels))

        assert sklearn_mse / np.array(klines_mse).mean() < 0.6