def fit(self, X, y=None, features=None):
    """
    Constructs DAG according to `self.dag_method`.

    Parameters
    ----------
    X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
        Matrix with rows corresponding to all of the samples that
        define the DAG and columns corresponding to features.
    y
        Ignored
    features: `numpy.ndarray` of `str`
        A list of strings with feature labels. When omitted, the column
        indices ``0 .. X.shape[1] - 1`` are used as labels instead.

    Returns
    -------
    self
        The instance itself, so calls can be chained.

    Raises
    ------
    ValueError
        If `self.reduce_dim` is not `None`, a `numpy.ndarray` or an `int`,
        or if `self.dag_method` is neither ``'agg_ward'`` nor
        ``'louvain'``.

    Warns
    -----
    RuntimeWarning
        If `len(self.sample_idx)` differs from the number of rows of `X`
        after construction (samples were orphaned).
    """
    # Always record the labels, so `self.features` describes the columns
    # of this `X` whether or not the caller supplied them.
    if features is None:
        self.features = np.array(range(X.shape[1]))
    else:
        self.features = np.asarray(features)

    if self.reduce_dim is None:
        X_ = X
    elif isinstance(self.reduce_dim, np.ndarray):
        # An array-valued `reduce_dim` is used directly in place of `X`.
        X_ = self.reduce_dim
    elif isinstance(self.reduce_dim, int):
        X_ = reduce_dimensionality(X, dim_red_k=self.reduce_dim)
    else:
        raise ValueError('`reduce_dim` has invalid type {}'.format(
            type(self.reduce_dim)))

    X_ = self.check_and_sketch(X_)

    if self.verbose:
        tprint('Constructing DAG...')

    if self.dag_method == 'agg_ward':
        # Import from the public package path; the former
        # `sklearn.cluster.hierarchical` module is not available in newer
        # scikit-learn releases.
        from sklearn.cluster import ward_tree

        ret = ward_tree(X_, n_clusters=None, return_distance=True)
        children, n_components = ret[0], ret[1]
        # Internal invariant: the DAG builder expects a single tree.
        assert n_components == 1
        self.create_dag_agg(children, X_.shape[0])

    elif self.dag_method == 'louvain':
        self.create_dag_louvain(X_)

    else:
        raise ValueError('Invalid DAG construction method {}'.format(
            self.dag_method))

    if len(self.sample_idx) != X.shape[0]:
        warnings.warn(
            'Some samples have been orphaned during '
            'DAG construction, {} orphans detected'.format(
                X.shape[0] - len(self.sample_idx)),
            RuntimeWarning)

    return self
def cluster(clusterType, vectors, y):
    """Assign a label to every row of `vectors` with the chosen method.

    Parameters
    ----------
    clusterType : str
        One of "KMeans", "GMM", "T2VH" (unsupervised; `y` is unused) or
        "SVM", "RandomForest", "DecisionTree", "LogisticRegression"
        (supervised; labels come from `cross_validation`).
    vectors
        Feature vectors handed to the selected clusterer/classifier.
    y
        Target labels; only forwarded to `cross_validation` for the
        supervised types.

    Returns
    -------
    The labels produced by the selected method, or `None` (after printing
    a message) when `clusterType` is not one of the names above.
    """
    # Supervised types differ only in the estimator they build. Factories
    # keep construction lazy: only the requested estimator is created.
    supervised_factories = (
        ("SVM", lambda: SVC(kernel='rbf', gamma='auto', random_state=0)),
        ("RandomForest", lambda: RandomForestClassifier()),
        ("DecisionTree", lambda: DecisionTreeClassifier()),
        ("LogisticRegression",
         lambda: sklearn.linear_model.LogisticRegression()),
    )
    for type_name, make_classifier in supervised_factories:
        if clusterType == type_name:
            return cross_validation(make_classifier(), vectors, y)

    if clusterType == "KMeans":
        k_means = KMeansClusterer(
            NUM_CLUSTERS,
            distance=nltk.cluster.util.cosine_distance,
            repeats=25)
        return k_means.cluster(vectors, assign_clusters=True)

    if clusterType == "GMM":
        mixture = GaussianMixture(n_components=NUM_CLUSTERS)
        return mixture.fit_predict(vectors)

    if clusterType == "T2VH":
        # Build the Ward tree, then cut it into NUM_CLUSTERS flat clusters.
        tree = hierarchical.ward_tree(vectors, n_clusters=NUM_CLUSTERS)
        merge_children, leaf_count = tree[0], tree[2]
        return hierarchical._hc_cut(NUM_CLUSTERS, merge_children, leaf_count)

    print(clusterType, " is not a predefined cluster type.")
    return None
def compute_stability_fold(samples, train, test, method='ward',
                           max_k=None, stack=False,
                           stability=True, cv_likelihood=False,
                           corr_score=None,
                           ground_truth=None, n_neighbors=1, **kwargs):
    """
    Compute clustering-stability scores on one cross-validation fold.

    For every `k` from 2 to `max_k`, a clustering learned on the training
    data is used to predict labels for the test data, and that prediction
    is compared with a clustering computed directly on the test data
    (and, optionally, with a ground truth).

    Parameters:
    -----------
    samples : list of arrays
        Arrays with the samples to cluster, each of shape
        (n_samples, n_features) in PyMVPA terminology. The features
        (i.e., the nodes) are what gets clustered.
    train : list or array
        Indices into `samples` forming the training set.
    test : list or array
        Indices into `samples` forming the test set.
    method : {'complete', 'gmm', 'kmeans', 'ward'}
        Clustering method. Default is 'ward'.
    max_k : int or None
        Largest `k` to evaluate, starting from 2. When falsy, it is set to
        the number of features, `samples[0].shape[1]`.
    stack : bool
        If True the selected datasets are stacked vertically; otherwise
        (default) they are averaged.
    stability : bool
        Whether to compute the stability measure described in
        Lange et al., 2004. Default is True.
    cv_likelihood : bool
        Whether to compute the cross-validated likelihood of the mixture
        model; only valid with method 'gmm'. Default is False.
    corr_score : {'pearson','spearman'} or None
        Type of correlation score to compute, if any. Default is None.
    ground_truth : array or None
        Ground-truth clustering of the data, used to compare predictions
        against a known solution (e.g. in simulations).
    n_neighbors : int
        Number of neighbors for the K-nearest-neighbors classifier that
        predicts test labels. Only used for methods 'complete' and
        'ward'. Default is 1.
    kwargs : optional
        Keyword arguments forwarded to the clustering routine (only for
        'ward' and 'gmm').

    Returns:
    --------
    results : list
        Ten entries, in this order; each array has shape (max_k-1,) and
        entry `i` refers to `k = ks[i]`:

        ks : array
            The `k` used at each iteration.
        ari : array
            Adjusted Rand Index between the predicted test labels and the
            labels obtained by clustering the test set directly.
        ami : array
            Adjusted Mutual Information between the same two labelings.
        stab : array or None
            Un-normalized stability measure (Lange et al., 2004); None if
            `stability` is False.
        likelihood : array or None
            Cross-validated likelihood of the GMM solution; None unless
            `cv_likelihood` is True.
        ari_gt : array or None
            Adjusted Rand Index between the prediction and
            `ground_truth`; None if no ground truth was given.
        ami_gt : array or None
            Adjusted Mutual Information between the prediction and
            `ground_truth`; None if no ground truth was given.
        stab_gt : array or None
            Stability measure between the prediction and `ground_truth`;
            None unless `stability` is True and a ground truth was given.
        corr : array or None
            Result of `correlation_score` for the prediction versus the
            test-set labels; None if `corr_score` is None.
        corr_gt : array or None
            Result of `correlation_score` for the prediction versus
            `ground_truth`; None unless both `corr_score` and
            `ground_truth` were given.

    Raises:
    -------
    ValueError
        If `method` is not in `AVAILABLE_METHODS`, or if `cv_likelihood`
        is requested for a method other than 'gmm'.
    """
    if method not in AVAILABLE_METHODS:
        raise ValueError('Method {0} not implemented'.format(method))
    if cv_likelihood and method != 'gmm':
        raise ValueError(
            "Cross-validated likelihood is only available for 'gmm' method")

    # A falsy max_k means "evaluate up to the number of features".
    if not max_k:
        max_k = samples[0].shape[1]

    n_ks = max_k - 1
    with_gt = ground_truth is not None
    with_corr = corr_score is not None

    # Result buffers; the optional ones stay None when not requested so
    # they can be returned as-is at the end.
    ks = np.zeros(n_ks, dtype=int)
    ari = np.zeros(n_ks)
    ami = np.zeros(n_ks)
    stab = np.zeros(n_ks) if stability else None
    likelihood = np.zeros(n_ks) if cv_likelihood else None
    corr = np.zeros(n_ks) if with_corr else None
    ari_gt = np.zeros(n_ks) if with_gt else None
    ami_gt = np.zeros(n_ks) if with_gt else None
    stab_gt = np.zeros(n_ks) if (with_gt and stability) else None
    corr_gt = np.zeros(n_ks) if (with_gt and with_corr) else None

    # Select the fold's datasets, then either stack or average them.
    train_set = [samples[idx] for idx in train]
    test_set = [samples[idx] for idx in test]
    if stack:
        combine = np.vstack
    else:
        def combine(datasets):
            return np.mean(np.dstack(datasets), axis=2)
    train_ds = combine(train_set)
    test_ds = combine(test_set)

    # Features are the items being clustered, hence the transposes.
    train_feats = train_ds.T
    test_feats = test_ds.T

    # Hierarchical methods: build the full trees once and cut them per k
    # inside the loop, which is cheaper than re-clustering for every k.
    if method == 'complete':
        train_dist = pdist(train_feats, metric='correlation')
        test_dist = pdist(test_feats, metric='correlation')
        linkage_train = complete(train_dist)
        linkage_test = complete(test_dist)
    elif method == 'ward':
        children_train, _, n_leaves_train, _ = \
            ward_tree(train_feats, **kwargs)
        children_test, _, n_leaves_test, _ = \
            ward_tree(test_feats, **kwargs)
    elif method not in ('gmm', 'kmeans'):
        # 'gmm' and 'kmeans' are refit for each k below.
        raise ValueError("We shouldn't get here")

    for pos, k in enumerate(range(2, max_k + 1)):
        if method in ('complete', 'ward'):
            # Cut both trees at k clusters.
            if method == 'complete':
                train_label = cut_tree_scipy(linkage_train, k)
                test_label = cut_tree_scipy(linkage_test, k)
            else:
                train_label = _hc_cut(k, children_train, n_leaves_train)
                test_label = _hc_cut(k, children_test, n_leaves_test)
            # Learn the training clustering with KNN and transfer it to
            # the test set.
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(train_feats, train_label)
            prediction_label = knn.predict(test_feats)
        elif method == 'gmm':
            gmm = GMM(n_components=k, **kwargs)
            # Fit on train, predict test.
            gmm.fit(train_feats)
            prediction_label = gmm.predict(test_feats)
            if cv_likelihood:
                log_prob = np.sum(gmm.score(test_feats))
            # Refit on test to obtain the reference labels.
            gmm.fit(test_feats)
            test_label = gmm.predict(test_feats)
        elif method == 'kmeans':
            kmeans = KMeans(n_clusters=k)
            # Fit on train, predict test.
            kmeans.fit(train_feats)
            prediction_label = kmeans.predict(test_feats)
            # Refit on test to obtain the reference labels.
            kmeans.fit(test_feats)
            test_label = kmeans.predict(test_feats)
        else:
            raise ValueError("We shouldn't get here")

        # Record the scores for this k.
        ks[pos] = k
        ari[pos] = adjusted_rand_score(prediction_label, test_label)
        ami[pos] = adjusted_mutual_info_score(prediction_label, test_label)
        if stability:
            stab[pos] = stability_score(prediction_label, test_label, k)
        if cv_likelihood:
            likelihood[pos] = log_prob
        if with_corr:
            corr[pos] = correlation_score(prediction_label, test_label,
                                          test_ds, corr_score)

        if with_gt:
            ari_gt[pos] = adjusted_rand_score(prediction_label,
                                              ground_truth)
            ami_gt[pos] = adjusted_mutual_info_score(prediction_label,
                                                     ground_truth)
            if stability:
                stab_gt[pos] = stability_score(prediction_label,
                                               ground_truth, k)
            if with_corr:
                corr_gt[pos] = correlation_score(prediction_label,
                                                 ground_truth,
                                                 test_ds, corr_score)

    return [ks, ari, ami, stab, likelihood,
            ari_gt, ami_gt, stab_gt, corr, corr_gt]
#a = a[order]
#b = b[order]
#height = height[order]

# Visual sanity check: plot the flat 4-cluster cut of the merge tree built
# above (`a`, `b`) next to the cuts obtained from scipy's complete linkage
# and, optionally, scikit-learn's Ward tree. Relies on `X`, `a`, `b`, `N`
# and `np` already being defined earlier in the script.
import pylab as pl
from sklearn.cluster import ward_tree

# `_hc_cut` is a private helper whose module was renamed between
# scikit-learn releases; try the newer location first.
try:
    from sklearn.cluster._agglomerative import _hc_cut
except ImportError:
    from sklearn.cluster.hierarchical import _hc_cut

# The `spectral` colormap is called `nipy_spectral` in newer matplotlib;
# use whichever one this installation provides.
spectral_cmap = getattr(pl.cm, 'nipy_spectral', None)
if spectral_cmap is None:
    spectral_cmap = pl.cm.spectral

if 1:
    # `np.int` was only an alias of the builtin `int` and no longer exists
    # in current NumPy, so use `int` directly.
    children = np.c_[a, b].astype(int)
    labels = _hc_cut(n_clusters=4, children=children, n_leaves=N)
    pl.figure(1)
    pl.clf()
    pl.scatter(X[:, 0], X[:, 1], c=labels, cmap=spectral_cmap)
    pl.title('Complete linkage')

if 1:
    from scipy.cluster import hierarchy
    # The first two columns of scipy's linkage matrix are the merged nodes.
    children_s = hierarchy.complete(X)[:, :2].astype(int)
    labels_s = _hc_cut(n_clusters=4, children=children_s, n_leaves=N)
    pl.figure(0)
    pl.clf()
    pl.scatter(X[:, 0], X[:, 1], c=labels_s, cmap=spectral_cmap)
    pl.title('Complete linkage (scipy)')

if 0:
    pl.figure(2)
    pl.clf()
    # Index rather than unpack: only the children array is needed, and
    # this does not depend on how many values `ward_tree` returns.
    children_w = ward_tree(X)[0]
    labels_w = _hc_cut(n_clusters=4, children=children_w, n_leaves=N)
    pl.scatter(X[:, 0], X[:, 1], c=labels_w, cmap=spectral_cmap)
    pl.title('Ward')

pl.show()