def test_compute_full_tree():
    """Check when the full merge tree is built and when merging stops early."""
    rng = np.random.RandomState(0)

    # Small n_clusters: the full tree should be built, that is the number
    # of merges should be n_samples - 1.
    X = rng.randn(10, 2)
    knn_graph = kneighbors_graph(X, 5, include_self=False)
    model = AgglomerativeClustering(n_clusters=2, connectivity=knn_graph)
    model.fit(X)
    n_merges = model.children_.shape[0]
    assert_equal(n_merges, X.shape[0] - 1)

    # Large n_clusters, greater than max of 100 and 0.02 * n_samples:
    # merging should stop as soon as n_clusters clusters remain.
    n_clusters = 101
    X = rng.randn(200, 2)
    knn_graph = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    connectivity=knn_graph)
    model.fit(X)
    n_merges = model.children_.shape[0]
    assert_equal(n_merges, X.shape[0] - n_clusters)
def test_connectivity_ignores_diagonal():
    # A k-neighbors graph built with self-loops and one built without them
    # must lead to the same cluster labels.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)
    graph_without_self = kneighbors_graph(X, 3, include_self=False)
    graph_with_self = kneighbors_graph(X, 3, include_self=True)

    fitted_labels = []
    for graph in (graph_without_self, graph_with_self):
        model = AgglomerativeClustering(connectivity=graph)
        model.fit(X)
        fitted_labels.append(model.labels_)

    assert_array_equal(fitted_labels[0], fitted_labels[1])
def test_connectivity_propagation():
    """
    Fitting a ward tree on this data must succeed, which requires the
    connectivity to be propagated correctly through the merges.
    """
    points = [
        (.014, .120), (.014, .099), (.014, .097),
        (.017, .153), (.017, .153),
        (.018, .153), (.018, .153), (.018, .153), (.018, .153),
        (.018, .153), (.018, .153), (.018, .153),
        (.018, .152), (.018, .149), (.018, .144),
    ]
    X = np.array(points)
    knn_graph = kneighbors_graph(X, 10)
    model = AgglomerativeClustering(
        n_clusters=4,
        connectivity=knn_graph,
        linkage='ward',
    )
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    model.fit(X)
def test_connectivity_propagation():
    # Fitting a ward tree on this data only succeeds when connectivity is
    # propagated correctly while clusters are merged.
    distinct_head = [(0.014, 0.120), (0.014, 0.099), (0.014, 0.097)]
    repeated = [(0.017, 0.153)] * 2 + [(0.018, 0.153)] * 7
    distinct_tail = [(0.018, 0.152), (0.018, 0.149), (0.018, 0.144)]
    X = np.array(distinct_head + repeated + distinct_tail)

    knn_graph = kneighbors_graph(X, 10, include_self=False)
    model = AgglomerativeClustering(
        n_clusters=4,
        connectivity=knn_graph,
        linkage="ward",
    )
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    model.fit(X)
def test_connectivity_callable():
    # Passing a precomputed graph and passing a callable that builds the
    # same graph must produce identical labels.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    precomputed_graph = kneighbors_graph(X, 3, include_self=False)
    graph_builder = partial(kneighbors_graph, n_neighbors=3,
                            include_self=False)

    from_matrix = AgglomerativeClustering(connectivity=precomputed_graph)
    from_callable = AgglomerativeClustering(connectivity=graph_builder)
    from_matrix.fit(X)
    from_callable.fit(X)

    assert_array_equal(from_matrix.labels_, from_callable.labels_)
def test_connectivity_callable():
    # A precomputed k-neighbors graph and a callable producing that graph
    # must give the same clustering.
    rng = np.random.RandomState(0)
    X = rng.rand(20, 5)

    precomputed_graph = kneighbors_graph(X, 3)
    graph_builder = partial(kneighbors_graph, n_neighbors=3)

    from_matrix = AgglomerativeClustering(connectivity=precomputed_graph)
    from_callable = AgglomerativeClustering(connectivity=graph_builder)
    from_matrix.fit(X)
    from_callable.fit(X)

    assert_array_equal(from_matrix.labels_, from_callable.labels_)
def test_connectivity_propagation():
    """
    Ward clustering with a k-neighbors connectivity must fit this data,
    which depends on connectivity being propagated correctly when merging.
    """
    samples = [
        (.014, .120),
        (.014, .099),
        (.014, .097),
        (.017, .153),
        (.017, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .153),
        (.018, .152),
        (.018, .149),
        (.018, .144),
    ]
    X = np.array(samples)
    neighbors = kneighbors_graph(X, 10)
    clusterer = AgglomerativeClustering(n_clusters=4,
                                        connectivity=neighbors,
                                        linkage='ward')
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    clusterer.fit(X)
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([[0, 0, 0], [0, 0, 0],
                  [1, 1, 1], [1, 1, 1],
                  [2, 2, 2], [2, 2, 2]])
    true_labels = np.array([0, 0, 1, 1, 2, 2])

    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Average the graph with its transpose so that it is symmetric.
    connectivity = 0.5 * (connectivity + connectivity.T)
    # Only the repaired connectivity is needed; the second return value
    # is not used by this test.
    connectivity, _ = _fix_connectivity(X, connectivity, 'euclidean')

    # Each linkage is exercised exactly once, including 'complete'.
    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(n_clusters=3,
                                             linkage=linkage,
                                             connectivity=connectivity)
        clustering.fit(X)

        # A normalized mutual information of 1 means the predicted
        # partition matches the expected one up to label permutation.
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, true_labels),
            1)
def test_identical_points():
    # Ensure identical points are handled correctly when using mst with
    # a sparse connectivity matrix
    X = np.array([
        [0, 0, 0], [0, 0, 0],
        [1, 1, 1], [1, 1, 1],
        [2, 2, 2], [2, 2, 2],
    ])
    true_labels = np.array([0, 0, 1, 1, 2, 2])

    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
    # Make the neighbors graph symmetric by averaging with its transpose.
    connectivity = 0.5 * (connectivity + connectivity.T)
    # The second value returned by _fix_connectivity is not needed here.
    connectivity, _ = _fix_connectivity(X, connectivity, 'euclidean')

    # Run every linkage once; 'complete' is included alongside the others.
    for linkage in ('single', 'average', 'complete', 'ward'):
        clustering = AgglomerativeClustering(
            n_clusters=3, linkage=linkage, connectivity=connectivity)
        clustering.fit(X)

        # The score is 1 when the predicted clusters coincide with the
        # expected grouping, regardless of how the labels are numbered.
        score = normalized_mutual_info_score(clustering.labels_, true_labels)
        assert_almost_equal(score, 1)
def main():
    """Train the clustered local-embedding ensemble and print test metrics.

    Steps, in order:

    1. Load the train/test split with ``load_input``.
    2. Fit ``params.num_learners`` KMeans clusterings on the training
       features.
    3. For every cluster of every clustering: build a cosine kNN graph on
       the cluster's labels, factorize it with ALS to get an embedding,
       regress the embedding on the features, and index the regressed
       embeddings with a nearest-neighbour model.
    4. Wrap the learners in an ``Ensemble``, predict on the test features,
       and print precision@k and the label ranking average precision.
    """
    # Local import: a plain attribute namespace is all that is needed for
    # the hyper-parameters.
    from types import SimpleNamespace

    # parameters from paper
    params = SimpleNamespace(
        num_learners=1,  # 1
        num_clusters=1,  # 1
        num_threads=32,
        SVP_neigh=250,
        out_Dim=100,
        w_thresh=0.01,  # ?
        sp_thresh=0.01,  # ?
        NNtest=25,
        normalize=1,  # ?
        regressor_lambda1=1e-6,
        regressor_lambda2=1e-3,
        embedding_lambda=0.1,  # determined automatically in WAltMin_asymm.m
    )

    train_X, train_Y, test_X, test_Y = load_input()

    # One KMeans clustering per learner.
    clusterings = []
    for _ in range(params.num_learners):
        model = KMeans(n_clusters=params.num_clusters,
                       n_jobs=-1,
                       n_init=8,
                       max_iter=100)
        model.fit(train_X)
        clusterings.append(model)

    # Switch between the ElasticNet regressor and the custom ``learn_V``
    # solver; the ElasticNet path is the one currently in use.
    use_elastic_net = True

    learners = []
    for clus_model in tqdm(clusterings):
        cluster_models = []
        for i in range(clus_model.n_clusters):
            # for each cluster in each learner
            # learn a model
            data_idx = np.nonzero(clus_model.labels_ == i)[0]
            X = train_X[data_idx, :]
            Y = train_Y[data_idx, :]

            print('embedding learning: building kNN graph')
            # build the kNN graph
            graph = kneighbors_graph(Y,
                                     params.SVP_neigh,
                                     mode='distance',
                                     metric='cosine',
                                     include_self=True,
                                     n_jobs=-1)
            graph.data = 1 - graph.data  # convert to similarity

            print('embedding learning: ALS')
            # learn the local embedding
            als_model = implicit.als.AlternatingLeastSquares(
                factors=params.out_Dim,
                regularization=params.embedding_lambda)
            als_model.fit(graph)

            # the embedding
            # shape: #instances x embedding dim
            Z = als_model.item_factors

            print('linear regressor training')
            # learn the linear regressor
            if use_elastic_net:
                regressor = ElasticNet(alpha=0.1, l1_ratio=0.001)
                regressor.fit(X, Z)
                # shape: embedding dim x feature dim
                V = regressor.coef_
            else:
                # learn V with l2 on V and l1 on VX
                # note that X is sparse
                V = learn_V(X.toarray(),
                            Z,
                            lambda1=params.regressor_lambda1,
                            lambda2=params.regressor_lambda2,
                            iter_max=200,
                            print_log=True)

            # the nearest neighbour model, fitted on the regressed
            # embeddings of the cluster's training instances
            fitted_Z = X.toarray() @ V.T
            Z_neighbors = NearestNeighbors(n_neighbors=params.NNtest,
                                           algorithm='kd_tree').fit(fitted_Z)

            projected_center = project(V, clus_model.cluster_centers_[i])
            cluster_models.append({
                'center_z': projected_center,
                'V': V,
                'Z_neighbors': Z_neighbors,
                'data_idx': data_idx,
            })
        learners.append(cluster_models)

    models = [Model(learner, train_Y) for learner in learners]
    ensemble = Ensemble(models)

    # predict
    pred_Y = ensemble.predict_many(test_X)
    performance = precision_at_ks(test_Y, pred_Y)

    # evaluation
    # precision@k
    for k, s in performance.items():
        print(f'precision@{k}: {s:.4f}')

    # LRAP
    print(label_ranking_average_precision_score(test_Y.toarray(), pred_Y))
learners = [] for clus_model in tqdm(clusterings): models = [] for i in range(clus_model.n_clusters): # for each cluster in each learner # learn a model data_idx = np.nonzero(clus_model.labels_ == i)[0] X = train_X[data_idx, :] Y = train_Y[data_idx, :] print('embedding learning: building kNN graph') # build the kNN graph graph = kneighbors_graph(Y, params.SVP_neigh, mode='distance', metric='cosine', include_self=True, n_jobs=-1) graph.data = 1 - graph.data # convert to similarity print('embedding learning: ALS') # learn the local embedding als_model = implicit.als.AlternatingLeastSquares(factors=params.out_Dim, regularization=params.embedding_lambda) als_model.fit(graph) # the embedding # shape: #instances x embedding dim Z = als_model.item_factors print('linear regressor training') # learn the linear regressor