Esempio n. 1
0
def test_compute_full_tree():
    """Check how many merges are recorded for small and large n_clusters."""
    rng = np.random.RandomState(0)

    def count_merges(data, n_neighbors, n_clusters):
        # Fit a connectivity-constrained model on ``data`` and report the
        # number of rows in ``children_`` (one row per recorded merge).
        graph = kneighbors_graph(data, n_neighbors, include_self=False)
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        connectivity=graph)
        model.fit(data)
        return model.children_.shape[0]

    # Few clusters requested: the complete tree is expected, i.e. exactly
    # n_samples - 1 merges.
    small = rng.randn(10, 2)
    assert_equal(count_merges(small, 5, 2), small.shape[0] - 1)

    # Many clusters requested (more than max(100, 0.02 * n_samples)):
    # merging is expected to stop once n_clusters remain.
    requested = 101
    large = rng.randn(200, 2)
    assert_equal(count_merges(large, 10, requested),
                 large.shape[0] - requested)
Esempio n. 2
0
def test_connectivity_ignores_diagonal():
    # The labels must be the same whether or not the kNN connectivity graph
    # lists every sample as one of its own neighbours.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    graphs = [kneighbors_graph(X, 3, include_self=with_self)
              for with_self in (False, True)]
    models = [AgglomerativeClustering(connectivity=graph) for graph in graphs]
    for model in models:
        model.fit(X)

    without_self, with_self = models
    assert_array_equal(without_self.labels_, with_self.labels_)
Esempio n. 3
0
def test_connectivity_propagation():
    """
    Fit Ward clustering on heavily duplicated points to make sure the
    connectivity information survives the successive merges.
    """
    # Same 15 rows as a flat literal would give; repeated rows are written
    # once and multiplied.
    samples = (
        [(.014, .120), (.014, .099), (.014, .097)]
        + 2 * [(.017, .153)]
        + 7 * [(.018, .153)]
        + [(.018, .152), (.018, .149), (.018, .144)]
    )
    X = np.array(samples)
    knn_graph = kneighbors_graph(X, 10)
    model = AgglomerativeClustering(linkage='ward',
                                    n_clusters=4,
                                    connectivity=knn_graph)
    # There is no explicit assertion: a propagation bug makes fit raise an
    # IndexError, so finishing the call is the check.
    model.fit(X)
def test_connectivity_propagation():
    # Ward clustering on a point set with many duplicates: connectivity must
    # be carried over correctly each time two clusters are merged.
    first_coord = [0.014] * 3 + [0.017] * 2 + [0.018] * 10
    second_coord = (
        [0.120, 0.099, 0.097]
        + [0.153] * 9
        + [0.152, 0.149, 0.144]
    )
    X = np.array(list(zip(first_coord, second_coord)))

    knn_graph = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(
        linkage="ward", connectivity=knn_graph, n_clusters=4
    )
    # A broken propagation step surfaces as an IndexError inside fit, so a
    # fit that returns normally is the passing condition.
    model.fit(X)
def test_connectivity_callable():
    # A callable ``connectivity`` must produce the same labels as the
    # precomputed kNN graph built with the same settings.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    graph_builder = partial(kneighbors_graph, n_neighbors=3, include_self=False)
    precomputed_graph = kneighbors_graph(X, 3, include_self=False)

    from_matrix = AgglomerativeClustering(connectivity=precomputed_graph)
    from_callable = AgglomerativeClustering(connectivity=graph_builder)
    for model in (from_matrix, from_callable):
        model.fit(X)

    assert_array_equal(from_matrix.labels_, from_callable.labels_)
Esempio n. 6
0
def test_connectivity_callable():
    # Supplying the kNN graph builder as a callable should cluster the data
    # exactly like supplying the precomputed graph.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    knn_builder = partial(kneighbors_graph, n_neighbors=3)
    with_matrix = AgglomerativeClustering(connectivity=kneighbors_graph(X, 3))
    with_callable = AgglomerativeClustering(connectivity=knn_builder)

    with_matrix.fit(X)
    with_callable.fit(X)
    assert_array_equal(with_matrix.labels_, with_callable.labels_)
Esempio n. 7
0
def test_connectivity_propagation():
    """
    Make sure the Ward tree keeps its connectivity consistent while merging
    clusters on data that contains many repeated points.
    """
    # (row, number of repetitions), in the order the rows appear in X.
    rows_with_counts = [
        ((.014, .120), 1), ((.014, .099), 1), ((.014, .097), 1),
        ((.017, .153), 2),
        ((.018, .153), 7),
        ((.018, .152), 1), ((.018, .149), 1), ((.018, .144), 1),
    ]
    X = np.array([row
                  for row, count in rows_with_counts
                  for _ in range(count)])

    knn_graph = kneighbors_graph(X, 10)
    model = AgglomerativeClustering(
        linkage='ward', connectivity=knn_graph, n_clusters=4)
    # No assertion needed: incorrect propagation makes fit crash with an
    # IndexError.
    model.fit(X)
Esempio n. 8
0
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2],
                  [2, 2, 2]])
    # Each pair of duplicated rows is expected to form its own cluster.
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Symmetrise the kNN graph (average of the matrix and its transpose).
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Only the returned connectivity is used; the second return value
    # (component count) is not needed by this test.
    connectivity, _ = _fix_connectivity(X, connectivity, 'euclidean')

    # The tuple previously listed 'average' twice, which re-ran the same
    # case and left 'complete' linkage untested; each linkage now runs once.
    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        # A normalized mutual information of 1 means the predicted partition
        # matches true_labels up to a relabelling.
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels), 1)
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    # One expected cluster per pair of duplicated rows.
    true_labels = np.array([0, 0, 1, 1, 2, 2])
    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Make the neighbour graph symmetric by averaging it with its transpose.
    connectivity = 0.5 * (connectivity + connectivity.T)
    # The second value returned by _fix_connectivity is unused here.
    connectivity, _ = _fix_connectivity(X,
                                        connectivity,
                                        'euclidean')

    # 'average' used to appear twice in this tuple, so one iteration was a
    # pure repeat and 'complete' linkage was never checked.
    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        # Score 1 <=> predicted labels agree with true_labels up to renaming.
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         true_labels), 1)
Esempio n. 10
0
def main():
    """Train and evaluate an ensemble of per-cluster embedding models.

    Steps performed, in order:

    1. Load the train/test splits with ``load_input``.
    2. Fit ``num_learners`` KMeans models on the training features.
    3. For every cluster of every KMeans model: build a cosine kNN
       similarity graph over the cluster's label rows, factorise it with
       implicit ALS to obtain embeddings ``Z``, regress ``Z`` on the
       cluster's features to obtain ``V``, and index the projected training
       points with a nearest-neighbour structure.
    4. Wrap the per-cluster results in ``Model`` / ``Ensemble``, predict on
       the test features, and print precision@k and the label ranking
       average precision.

    Everything is printed; nothing is returned.
    """
    # Local import so the block stays self-contained.
    from types import SimpleNamespace

    # parameters from paper
    # A plain namespace replaces the previous namedtuple *class* that was
    # never instantiated and whose declared fields ('num_learner',
    # 'out_dim', 'cost') did not match the attributes actually assigned.
    params = SimpleNamespace(
        num_learners=1,  # 1
        num_clusters=1,  # 1
        num_threads=32,
        SVP_neigh=250,
        out_dim=100,
        w_thresh=0.01,  # ?
        sp_thresh=0.01,  # ?
        NNtest=25,
        normalize=1,  # ?
        regressor_lambda1=1e-6,
        regressor_lambda2=1e-3,
        # determined automatically in WAltMin_asymm.m
        embedding_lambda=0.1,
    )

    # Switch between the ElasticNet regressor (True) and the custom
    # ``learn_V`` solver (False) when fitting V.
    use_elastic_net = True

    train_X, train_Y, test_X, test_Y = load_input()

    clusterings = []
    for _ in range(params.num_learners):
        # NOTE(review): newer scikit-learn releases no longer accept
        # ``n_jobs`` on KMeans -- confirm against the pinned version.
        model = KMeans(n_clusters=params.num_clusters,
                       n_jobs=-1,
                       n_init=8,
                       max_iter=100)
        model.fit(train_X)
        clusterings.append(model)

    learners = []
    for clus_model in tqdm(clusterings):
        models = []
        for i in range(clus_model.n_clusters):
            # for each cluster in each learner
            # learn a model

            # Row indices of the training samples assigned to cluster i.
            data_idx = np.nonzero(clus_model.labels_ == i)[0]
            X = train_X[data_idx, :]
            Y = train_Y[data_idx, :]

            print('embedding learning: building kNN graph')
            # build the kNN graph
            graph = kneighbors_graph(Y,
                                     params.SVP_neigh,
                                     mode='distance',
                                     metric='cosine',
                                     include_self=True,
                                     n_jobs=-1)
            graph.data = 1 - graph.data  # convert to similarity

            print('embedding learning: ALS')
            # learn the local embedding
            als_model = implicit.als.AlternatingLeastSquares(
                factors=params.out_dim, regularization=params.embedding_lambda)
            als_model.fit(graph)

            # the embedding
            # shape: #instances x embedding dim
            Z = als_model.item_factors

            print('linear regressor training')
            # learn the linear regressor
            if use_elastic_net:
                regressor = ElasticNet(alpha=0.1, l1_ratio=0.001)
                regressor.fit(X, Z)
                # shape: embedding dim x feature dim
                V = regressor.coef_
            else:
                # learn V with l2 on V and l1 on VX
                # note that X is sparse
                V = learn_V(X.toarray(),
                            Z,
                            lambda1=params.regressor_lambda1,
                            lambda2=params.regressor_lambda2,
                            iter_max=200,
                            print_log=True)
            # the nearest neighbour model
            # Project the cluster's training features through V.
            fitted_Z = X.toarray() @ V.T

            Z_neighbors = NearestNeighbors(n_neighbors=params.NNtest,
                                           algorithm='kd_tree').fit(fitted_Z)

            projected_center = project(V, clus_model.cluster_centers_[i])
            learned = {
                'center_z': projected_center,
                'V': V,
                'Z_neighbors': Z_neighbors,
                'data_idx': data_idx
            }
            models.append(learned)
        learners.append(models)

    models = [Model(learner, train_Y) for learner in learners]
    ensemble = Ensemble(models)

    # predict
    pred_Y = ensemble.predict_many(test_X)
    performance = precision_at_ks(test_Y, pred_Y)

    # evaluation
    # precision@k
    for k, s in performance.items():
        print('precision@{}: {:.4f}'.format(k, s))
    # LRAP
    print(label_ranking_average_precision_score(test_Y.toarray(), pred_Y))
Esempio n. 11
0
learners = []
for clus_model in tqdm(clusterings):
    models = []
    for i in range(clus_model.n_clusters):
        # for each cluster in each learner
        # learn a model
        
        data_idx = np.nonzero(clus_model.labels_ == i)[0]
        X = train_X[data_idx, :]
        Y = train_Y[data_idx, :]        

        print('embedding learning: building kNN graph')
        # build the kNN graph
        graph = kneighbors_graph(Y, params.SVP_neigh, mode='distance', metric='cosine',
                                 include_self=True,                        
                                 n_jobs=-1)
        graph.data = 1 - graph.data  # convert to similarity
        
        print('embedding learning: ALS')
        # learn the local embedding
        als_model = implicit.als.AlternatingLeastSquares(factors=params.out_Dim,
                                                         regularization=params.embedding_lambda)
        als_model.fit(graph) 
        
        # the embedding
        # shape: #instances x embedding dim
        Z = als_model.item_factors                
        
        print('linear regressor training')
        # learn the linear regressor