def linkage(KL, H, toM, SHOW=True):
    """Draw a complete-linkage dendrogram for ``KL``.

    Parameters
    ----------
    KL : array-like
        Data handed unchanged to ``linkage_scipy`` with the "complete" method.
        NOTE(review): if this is a square distance matrix, scipy will treat
        its rows as observation vectors -- confirm against the caller.
    H : object
        Not used by this function; kept for interface compatibility.
    toM : mapping
        Iterated for its keys; the values, ordered by key, become the leaf
        labels of the dendrogram.
    SHOW : bool, (default: True)
        When true, tighten the layout and display the figure.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.

    """
    tree = linkage_scipy(KL, "complete")

    # Leaf labels are the values of ``toM`` taken in ascending key order.
    keyed_names = sorted((key, toM[key]) for key in toM)
    leaf_names = [name for _, name in keyed_names]

    ddata = dendrogram(
        tree,
        labels=leaf_names,
        color_threshold=0,
        leaf_rotation=85,
        leaf_font_size=9,
    )

    if not SHOW:
        return ddata

    plt.tight_layout()
    plt.show()
    return ddata
def cluster_PSSMS(D):
    """Plot a complete-linkage dendrogram for each group of same-sized matrices.

    The matrices in ``D`` are grouped by their first dimension. Within every
    group that holds at least two matrices, the pairwise distance is the sum
    of absolute element-wise differences, and the resulting dendrogram is
    shown with the group's keys as leaf labels.

    Parameters
    ----------
    D : dict
        Maps an identifier to a numpy array. Arrays are compared only with
        other arrays that have the same ``shape[0]``.

    Returns
    -------
    None
        The function only draws figures.

    Notes
    -----
    * Every size present in ``D`` is processed, including the largest one
      (the previous ``range(min, max)`` silently skipped it, so nothing was
      drawn when all matrices had the same size).
    * Distances are passed to scipy in condensed form. A square matrix would
      be interpreted by scipy as a set of observation vectors rather than as
      precomputed distances.
    * Groups with a single matrix are skipped because a hierarchy needs at
      least two observations.

    """
    # Bucket the entries by their first dimension, keeping insertion order
    # inside each bucket so the labels line up with the distances.
    groups = {}
    for name, matrix in D.items():
        groups.setdefault(matrix.shape[0], []).append((name, matrix))

    for dim in sorted(groups):
        members = groups[dim]
        if len(members) < 2:
            continue

        ids = [name for name, _ in members]
        mats = [matrix for _, matrix in members]
        count = len(mats)

        # Condensed distance vector in scipy's order:
        # (0,1), (0,2), ..., (0,n-1), (1,2), ...
        condensed = np.array(
            [
                np.sum(abs(mats[i] - mats[j]))
                for i in range(count)
                for j in range(i + 1, count)
            ],
            dtype=float,
        )

        linkage_matrix = linkage_scipy(condensed, "complete")
        dendrogram(
            linkage_matrix,
            labels=ids,
            color_threshold=0,
            leaf_rotation=85,
            leaf_font_size=14,
        )
        plt.tight_layout()
        plt.show()
def fit(X, cluster='agglomerative', metric='euclidean', linkage='ward', min_clust=2, max_clust=25, Z=None, verbose=3):
    """Determine the optimal number of clusters using the derivative method.

    Description
    -----------
    The merge distances of the last (at most ten) steps of a hierarchical
    clustering are examined. The number of clusters ``k`` is chosen where the
    second derivative (acceleration) of these distances is largest, restricted
    to ``min_clust <= k <= max_clust``. The cluster labels for that cut-off
    are returned.

    Parameters
    ----------
    X : Numpy-array.
        The rows are the samples and the columns are the features.
    cluster : str, (default: 'agglomerative')
        Clustering method type for clustering.
            * 'agglomerative'
            * 'kmeans' (not supported by this method; empty results are returned)
    metric : str, (default: 'euclidean').
        Distance measure for the clustering, such as 'euclidean','hamming', etc.
    linkage : str, (default: 'ward')
        Linkage type for the clustering.
        'ward','single','complete','average','weighted','centroid','median'.
    min_clust : int, (default: 2)
        Number of clusters that is evaluated greater or equals to min_clust.
    max_clust : int, (default: 25)
        Number of clusters that is evaluated smaller or equals to max_clust.
    Z : Numpy-array, (default: None).
        Precomputed linkage matrix. This will speed-up computation if you
        readily have Z. e.g., Z=linkage(X, method='ward', metric='euclidean').
        It is not modified by this function.
    verbose : int, optional (default: 3)
        Print message to screen [1-5]. The larger the number, the more information.

    Returns
    -------
    dict. with the keys:

    method: str
        Method name that is used for cluster evaluation ('derivative').
    score: None
        Nothing in here but included for consistency.
    labx: Numpy-array or None
        Cluster labels (None when cluster='kmeans').
    fig: dict
        Relevant information to make the plot: 'last_rev' and 'acceleration_rev'.

    Raises
    ------
    ValueError
        When the linkage holds fewer than three merges, because no second
        derivative can be computed.

    Notes
    -----
    Only the last ten merges are inspected, so the selected number of clusters
    lies between 2 and 9 regardless of a larger ``max_clust``.

    Examples
    --------
    >>> # Import library
    >>> import clusteval.derivative as derivative
    >>> from sklearn.datasets import make_blobs
    >>> # Generate demo data
    >>> X, labels_true = make_blobs(n_samples=750, centers=6, n_features=10)
    >>> # Fit with default parameters
    >>> results = derivative.fit(X)
    >>> # plot
    >>> derivative.plot(results)

    """
    if verbose >= 3:
        print('[clusteval] >Evaluate using derivatives.')

    if cluster == 'kmeans':
        if verbose >= 3:
            print('[clusteval] >Does not work with Kmeans! <return>')
        return {
            'method': 'derivative',
            'labx': None,
            'score': None,
            'fig': {'last_rev': None, 'acceleration_rev': None},
        }

    # Cluster hierarchical using on metric/linkage
    if Z is None:
        Z = linkage_scipy(X, method=linkage, metric=metric)

    if verbose >= 3:
        print('[clusteval] >Determining optimal clustering by derivatives..')

    # Work on a private copy: a plain slice is a view into Z, and masking it
    # would zero the top merge heights of Z itself, corrupting the fcluster
    # call below as well as a Z supplied by the caller.
    last = Z[-10:, 2].copy()

    # 2nd derivative of the distances, ordered from the final merge backwards.
    acceleration_rev = np.diff(last, 2)[::-1]

    # Index i of acceleration_rev stands for k = i + 2 clusters.
    candidate_k = np.arange(acceleration_rev.size) + 2
    out_of_range = (candidate_k < min_clust) | (candidate_k > max_clust)
    acceleration_rev[out_of_range] = 0

    # The distance curve is only returned for plotting; it is masked the same
    # way as before, but now on the private copy.
    last_rev = last[::-1]
    last_rev[:min_clust] = 0
    last_rev[max_clust:] = 0

    # Pick the best in-range k. Excluded positions are ranked as -inf so that
    # a zeroed out-of-range entry cannot beat negative in-range values. If
    # nothing is in range this falls back to k = 2.
    ranking = np.where(out_of_range, -np.inf, acceleration_rev)
    k = int(ranking.argmax()) + 2
    if verbose >= 3:
        print('[clusteval] >Clusters: %d' % k)

    # Now use the optimal cluster cut-off for the selection of clusters
    clustlabx = np.array(fcluster(Z, k, criterion='maxclust'))

    # Store results
    results = {
        'method': 'derivative',
        'labx': clustlabx,
        'score': None,
        'fig': {
            'last_rev': last_rev,
            'acceleration_rev': acceleration_rev,
        },
    }
    return results