def _get_optimal_number_of_clusters(self, correlation, asset_returns, linkage, num_reference_datasets=5): """ Find the optimal number of clusters for hierarchical clustering using the Gap statistic. :param correlation: (np.array) Matrix of asset correlations. :param asset_returns: (pd.DataFrame) Historical asset returns. :param linkage: (str) The type of linkage method to use for clustering. :param num_reference_datasets: (int) The number of reference datasets to generate for calculating expected inertia. :return: (int) The optimal number of clusters. """ original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5)) gap_values = [] num_clusters = 1 max_number_of_clusters = float("-inf") while True: # Calculate inertia from original data original_clusters = scipy_linkage(squareform(original_distance_matrix), method=linkage) original_cluster_assignments = fcluster(original_clusters, num_clusters, criterion='maxclust') if max(original_cluster_assignments) == max_number_of_clusters or max(original_cluster_assignments) > 10: break max_number_of_clusters = max(original_cluster_assignments) inertia = self._compute_cluster_inertia(original_cluster_assignments, asset_returns.values) # Calculate expected inertia from reference datasets expected_inertia = self._calculate_expected_inertia(num_reference_datasets, asset_returns, num_clusters, linkage) # Calculate the gap statistic gap = expected_inertia - inertia gap_values.append(gap) num_clusters += 1 return 1 + np.argmax(gap_values)
def cluster_results(self, method_name, dist_matrix):
    """
    Cut the hierarchical tree built from ``dist_matrix`` into flat clusters.

    :param method_name: (str) Linkage method; falls back to "ward" when None or empty.
    :param dist_matrix: (np.array) Square, symmetric distance matrix.
    :return: (np.array) Flat cluster labels produced by fcluster, cut at
        ``self.epsilon`` (scipy's default 'inconsistent' criterion applies).
    """
    # An unset linkage method defaults to Ward.
    if method_name in (None, ""):
        method_name = "ward"
    # Condense the square matrix into the 1-D form scipy expects.
    condensed = squareform(dist_matrix)
    tree = scipy_linkage(condensed, method=method_name)
    # `depth` controls how many levels the inconsistency statistic looks down.
    return fcluster(tree, t=self.epsilon, depth=5)
def _calculate_expected_inertia(self, num_reference_datasets, asset_returns, num_clusters, linkage): """ Calculate the expected inertia by generating clusters from a uniform distribution. :param num_reference_datasets: (int) The number of reference datasets to generate from the distribution. :param asset_returns: (pd.DataFrame) Historical asset returns. :param num_clusters: (int) The number of clusters to generate. :param linkage: (str) The type of linkage criterion to use for hierarchical clustering. :return: (float) The expected inertia from the reference datasets. """ reference_inertias = [] for _ in range(num_reference_datasets): # Generate reference returns from uniform distribution and calculate the distance matrix. reference_asset_returns = pd.DataFrame( np.random.rand(*asset_returns.shape)) reference_correlation = np.array(reference_asset_returns.corr()) reference_distance_matrix = np.sqrt( 2 * (1 - reference_correlation).round(5)) reference_clusters = scipy_linkage( squareform(reference_distance_matrix), method=linkage) reference_cluster_assignments = fcluster(reference_clusters, num_clusters, criterion='maxclust') inertia = self._compute_cluster_inertia( reference_cluster_assignments, reference_asset_returns.values) reference_inertias.append(inertia) return np.mean(reference_inertias)
def linkage(path_to_submissions_directory, method):
    """
    Build and persist a hierarchical-clustering linkage matrix from the
    previously computed edit-distance table.

    :param path_to_submissions_directory: Directory whose submissions were compared.
    :param method: Linkage method forwarded to scipy's linkage.
    """
    output_directory = get_output_directory(path_to_submissions_directory)
    # Load the square edit-distance table and condense it for scipy.
    distances = np.load(os.path.join(output_directory, EDITDISTANCE_NAME))
    condensed = squareform(distances)
    linkage_matrix = scipy_linkage(condensed, method, optimal_ordering=True)
    # Persist the result next to the distance table.
    output_file = os.path.join(output_directory, LINKAGE_NAME)
    np.save(output_file, linkage_matrix)
    click.echo(f'Saved as \'{output_file}\'.')
def _tree_clustering(distance, method='single'): """ Perform the traditional heirarchical tree clustering. :param correlation: (np.array) Correlation matrix of the assets :param method: (str) The type of clustering to be done :return: (np.array) Distance matrix and clusters """ clusters = scipy_linkage(squareform(distance.values), method=method) return clusters
def _tree_clustering(self, correlation, linkage): """ Perform agglomerative clustering on the current portfolio. :param correlation: (np.array) Matrix of asset correlations. :param linkage (str): The type of linkage method to use for clustering. :return: (list) Structure of hierarchical tree. """ distance_matrix = np.sqrt(2 * (1 - correlation).round(5)) clusters = scipy_linkage(squareform(distance_matrix.values), method=linkage) clustering_inds = fcluster(clusters, self.optimal_num_clusters, criterion='maxclust') cluster_children = {index - 1: [] for index in range(min(clustering_inds), max(clustering_inds) + 1)} for index, cluster_index in enumerate(clustering_inds): cluster_children[cluster_index - 1].append(index) return clusters, cluster_children
def _check_max_number_of_clusters(num_clusters, linkage, correlation): """ In some cases, the optimal number of clusters value given by the users is greater than the maximum number of clusters possible with the given data. This function checks this and assigns the proper value to the number of clusters when the given value exceeds maximum possible clusters. :param num_clusters: (int) The number of clusters. :param linkage (str): The type of linkage method to use for clustering. :param correlation: (np.array) Matrix of asset correlations. :return: (int) New value for number of clusters. """ distance_matrix = np.sqrt(2 * (1 - correlation).round(5)) clusters = scipy_linkage(squareform(distance_matrix.values), method=linkage) clustering_inds = fcluster(clusters, num_clusters, criterion='maxclust') max_number_of_clusters_possible = max(clustering_inds) num_clusters = min(max_number_of_clusters_possible, num_clusters) return num_clusters
def dendrogram(self, X=None, labels=None, leaf_rotation=90, leaf_font_size=12, orientation='top', show_contracted=True, max_d=None, showfig=True, metric=None, linkage=None, truncate_mode=None, figsize=(15, 10)):
    """Plot Dendrogram.

    Parameters
    ----------
    X : numpy-array (default : None)
        Input data.
    labels : list, (default: None)
        Plot the labels. When None: the index of the original observation is used to label the leaf nodes.
    leaf_rotation : int, (default: 90)
        Rotation of the labels [0-360].
    leaf_font_size : int, (default: 12)
        Font size labels.
    orientation : string, (default: 'top')
        Direction of the dendrogram: 'top', 'bottom', 'left' or 'right'
    show_contracted : bool, (default: True)
        The heights of non-singleton nodes contracted into a leaf node are plotted as crosses along the link connecting that leaf node.
    max_d : Float, (default: None)
        Height of the dendrogram to make a horizontal cut-off line.
    showfig : bool, (default = True)
        Plot the dendrogram.
    metric : str, (default: 'euclidean').
        Distance measure for the clustering, such as 'euclidean','hamming', etc.
    linkage : str, (default: 'ward')
        Linkage type for the clustering. 'ward','single',',complete','average','weighted','centroid','median'.
    truncate_mode : string, (default: None)
        Truncation is used to condense the dendrogram, which can be based on: 'level', 'lastp' or None
    figsize : tuple, (default: (15, 10).
        Size of the figure (height, width).

    Returns
    -------
    results : dict
        * labx : int : Cluster labels based on the input-ordering.
        * order_rows : string : Order of the cluster labels as presented in the dendrogram (left-to-right).
        * max_d : float : maximum distance to set the horizontal threshold line.
        * max_d_lower : float : maximum distance lower bound.
        * max_d_upper : float : maximum distance upper bound.

    """
    # Nothing to plot without fitted results.
    if (self.results is None) or (self.results['labx'] is None):
        if self.verbose>=3: print('[clusteval] >No results to plot. Tip: try the .fit() function first.')
        return None

    # Set parameters
    no_plot = not showfig
    max_d_lower, max_d_upper = None, None
    # BUGFIX: `fig` and `ax` were previously only bound inside the
    # `if showfig:` branch below, so showfig=False raised a NameError when
    # `ax` was passed to plot_dendrogram. Default them up front.
    fig, ax = None, None

    # Decide where the linkage matrix Z comes from.
    if (metric is not None) and (linkage is not None) and (X is not None):
        # Recompute Z from the provided data and settings.
        if self.verbose>=2: print('[clusteval] >Compute dendrogram using metric=%s, linkage=%s' %(metric, linkage))
        Z = scipy_linkage(X, method=linkage, metric=metric)
    elif (metric is not None) and (linkage is not None) and (X is None):
        if self.verbose>=2: print('[clusteval] >To compute the dendrogram, also provide the data: X=data <return>')
        return None
    elif (not hasattr(self, 'Z')):
        # Return if Z is not computed.
        if self.verbose>=3: print('[clusteval] >No results to plot. Tip: try the .fit() function (no kmeans) <return>')
        return None
    else:
        # Reuse the linkage matrix and settings stored by the last fit.
        if self.verbose>=3: print('[clusteval] >Plotting the dendrogram with optimized settings: metric=%s, linkage=%s, max_d=%.3f. Be patient now..' %(self.metric, self.linkage, self.results['max_d']))
        Z = self.Z
        metric = self.metric
        linkage = self.linkage

    # kmeans has no linkage matrix, so no dendrogram can be drawn.
    if self.cluster=='kmeans':
        if self.verbose>=3: print('[clusteval] >No results to plot. Tip: try the .fit() function with metric that is different than kmeans <return>')
        return None

    # Fall back to the threshold computed during fit.
    if max_d is None:
        max_d = self.results['max_d']
        max_d_lower = self.results['max_d_lower']
        max_d_upper = self.results['max_d_upper']

    # Make the dendrogram
    if showfig:
        fig, ax = plt.subplots(figsize=figsize)
    annotate_above = max_d
    results = plot_dendrogram(Z, labels=labels, leaf_rotation=leaf_rotation, leaf_font_size=leaf_font_size, orientation=orientation, show_contracted=show_contracted, annotate_above=annotate_above, max_d=max_d, truncate_mode=truncate_mode, ax=ax, no_plot=no_plot)

    # Compute cluster labels by cutting Z at height max_d.
    if self.verbose>=3: print('[clusteval] >Compute cluster labels.')
    labx = fcluster(Z, max_d, criterion='distance')

    # Store results
    results['order_rows'] = np.array(results['ivl'])
    results['labx'] = labx
    results['max_d'] = max_d
    results['max_d_lower'] = max_d_lower
    results['max_d_upper'] = max_d_upper
    results['ax'] = ax
    return results
def fit(self, X):
    """Cluster validation.

    Parameters
    ----------
    X : Numpy-array.
        The rows are the features and the colums are the samples.

    Returns
    -------
    dict. with various keys. Note that the underneath keys can change based on the used methodtype.
    method: str
        Method name that is used for cluster evaluation.
    score: pd.DataFrame()
        The scoring values per clusters. The methods [silhouette, dbindex] provide this information.
    labx: list
        Cluster labels.
    fig: list
        Relevant information to make the plot.

    """
    # Only numpy arrays are accepted; reject other containers up front.
    if 'array' not in str(type(X)): raise ValueError('Input data must be of type numpy array')
    max_d, max_d_lower, max_d_upper = None, None, None
    self.Z = []

    # Cluster using on metric/linkage
    if self.verbose>=3: print('\n[clusteval] >Fit using %s with metric: %s, and linkage: %s' %(self.cluster, self.metric, self.linkage))

    # Compute linkages (kmeans has no hierarchical linkage matrix).
    if self.cluster!='kmeans':
        self.Z = scipy_linkage(X, method=self.linkage, metric=self.metric)

    # Dispatch to the requested evaluation method.
    if (self.cluster=='agglomerative') or (self.cluster=='kmeans'):
        if self.method=='silhouette':
            self.results = silhouette.fit(X, Z=self.Z, cluster=self.cluster, metric=self.metric, min_clust=self.min_clust, max_clust=self.max_clust, savemem=self.savemem, verbose=self.verbose)
        elif self.method=='dbindex':
            self.results = dbindex.fit(X, Z=self.Z, metric=self.metric, min_clust=self.min_clust, max_clust=self.max_clust, savemem=self.savemem, verbose=self.verbose)
        elif self.method=='derivative':
            self.results = derivative.fit(X, Z=self.Z, cluster=self.cluster, metric=self.metric, min_clust=self.min_clust, max_clust=self.max_clust, verbose=self.verbose)
    elif (self.cluster=='dbscan') and (self.method=='silhouette'):
        self.results = dbscan.fit(X, eps=None, epsres=50, min_samples=0.01, metric=self.metric, norm=True, n_jobs=-1, min_clust=self.min_clust, max_clust=self.max_clust, verbose=self.verbose)
    elif self.cluster=='hdbscan':
        # BUGFIX: guard only the optional import with a narrow except. The
        # original wrapped both the import and hdbscan.fit in a bare
        # `except:`, so any runtime error inside hdbscan.fit was misreported
        # as a missing installation.
        try:
            import clusteval.hdbscan as hdbscan
        except ImportError as err:
            raise ValueError('hdbscan must be installed manually. Try to: <pip install hdbscan> or <conda install -c conda-forge hdbscan>') from err
        self.results = hdbscan.fit(X, min_samples=0.01, metric=self.metric, norm=True, n_jobs=-1, min_clust=self.min_clust, verbose=self.verbose)
    else:
        raise ValueError('[clusteval] >The combination cluster"%s", method="%s" is not implemented.' %(self.cluster, self.method))

    # Compute the dendrogram threshold (only meaningful with a linkage
    # matrix and more than one detected cluster).
    if (self.cluster!='kmeans') and (len(np.unique(self.results['labx']))>1):
        max_d, max_d_lower, max_d_upper = _compute_dendrogram_threshold(self.Z, self.results['labx'], verbose=self.verbose)

    # Report and return
    if self.results['labx'] is not None:
        if self.verbose>=3: print('[clusteval] >Optimal number clusters detected: [%.0d].' %(len(np.unique(self.results['labx']))))
    if self.verbose>=3: print('[clusteval] >Fin.')

    # Attach the threshold info to the results.
    self.results['max_d'] = max_d
    self.results['max_d_lower'] = max_d_lower
    self.results['max_d_upper'] = max_d_upper
    return self.results
def fit(X, cluster='agglomerative', metric='euclidean', linkage='ward', min_clust=2, max_clust=25, Z=None, savemem=False, verbose=3):
    """Return the cluster labels for the optimal cut-off based on the chosen hierarchical clustering method.

    Parameters
    ----------
    X : Numpy-array,
        Where rows is features and colums are samples.
    cluster : str, (default: 'agglomerative')
        Clustering method type for clustering.
            * 'agglomerative'
            * 'kmeans'
    metric : str, (default: 'euclidean').
        Distance measure for the clustering, such as 'euclidean','hamming', etc.
    linkage : str, (default: 'ward')
        Linkage type for the clustering. 'ward','single',',complete','average','weighted','centroid','median'.
    min_clust : int, (default: 2)
        Number of clusters that is evaluated greater or equals to min_clust.
    max_clust : int, (default: 25)
        Number of clusters that is evaluated smaller or equals to max_clust.
    savemem : bool, (default: False)
        Save memory when working with large datasets. Note that this option is only in case of KMeans.
    Z : Object, (default: None).
        This will speed-up computation if you readily have Z. e.g., Z=linkage(X, method='ward', metric='euclidean').
    verbose : int, optional (default: 3)
        Print message to screen [1-5]. The larger the number, the more information is returned.

    Returns
    -------
    dict. with various keys. Note that the underneath keys can change based on the used methodtype.
    method: str
        Method name that is used for cluster evaluation.
    score: pd.DataFrame()
        The scoring values per clusters.
    labx: list
        Cluster labels.
    fig: list
        Relevant information to make the plot.

    Examples
    --------
    >>> # Import library
    >>> import clusteval.silhouette as silhouette
    >>> from sklearn.datasets import make_blobs
    >>> # Generate demo data
    >>> X, labels_true = make_blobs(n_samples=750, centers=5, n_features=10)
    >>> # Fit with default parameters
    >>> results = silhouette.fit(X)
    >>> # plot
    >>> silhouette.scatter(results, X)
    >>> silhouette.plot(results)

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    """
    # Collect the run parameters in one dictionary.
    Param = {}
    Param['verbose'] = verbose
    Param['cluster'] = cluster
    Param['metric'] = metric
    Param['linkage'] = linkage
    Param['min_clust'] = min_clust
    Param['max_clust'] = max_clust
    Param['savemem'] = savemem
    if verbose >= 3: print('[clusteval] >Evaluate using silhouette.')

    # Memory-saving variant only applies to kmeans.
    if Param['cluster'] == 'kmeans':
        if Param['savemem']:
            kmeansmodel = MiniBatchKMeans
            if Param['verbose'] >= 3: print('[clusteval] >Save memory enabled for kmeans with method silhouette.')
        else:
            kmeansmodel = KMeans

    # Compute the linkage matrix once, unless the caller supplied it.
    if (Z is None) and (Param['cluster'] != 'kmeans'):
        Z = scipy_linkage(X, method=Param['linkage'], metric=Param['metric'])

    # Candidate cluster counts and per-candidate storage.
    # NOTE(review): np.arange excludes max_clust itself, while the docstring
    # says "smaller or equals to max_clust" — confirm whether the upper bound
    # should be max_clust + 1.
    clustcutt = np.arange(Param['min_clust'], Param['max_clust'])
    silscores = np.zeros((len(clustcutt))) * np.nan
    sillclust = np.zeros((len(clustcutt))) * np.nan
    clustlabx = []

    # Evaluate every candidate cut of the dendrogram (or kmeans fit).
    for i in tqdm(range(len(clustcutt))):
        # Cut the dendrogram for i clusters
        if Param['cluster'] == 'kmeans':
            labx = kmeansmodel(n_clusters=clustcutt[i], verbose=0).fit(X).labels_
        else:
            labx = fcluster(Z, clustcutt[i], criterion='maxclust')
        # Store labx for cluster-cut
        clustlabx.append(labx)
        # Store number of unique clusters
        sillclust[i] = len(np.unique(labx))
        # Silhouette is only defined for more than one cluster.
        if sillclust[i] > 1:
            silscores[i] = silhouette_score(X, labx)

    # Convert to array
    clustlabx = np.array(clustlabx)

    # Keep only candidates with a valid score inside the requested range.
    I1 = np.isnan(silscores) == False
    I2 = sillclust >= Param['min_clust']
    I3 = sillclust <= Param['max_clust']
    Iloc = I1 & I2 & I3

    if verbose >= 5:
        print(clustlabx)
        print('Iloc: %s' % (str(Iloc)))
        print('silscores: %s' % (str(silscores)))
        print('sillclust: %s' % (str(sillclust)))
        print('clustlabx: %s' % (str(clustlabx)))

    # BUGFIX: the original guard was `len(Iloc) > 0`, which is True whenever
    # any candidate was evaluated (Iloc is a full-length boolean mask), so
    # np.argmax could run on an empty selection. np.any(Iloc) checks whether
    # any candidate actually survived the filter.
    if np.any(Iloc):
        # Get only clusters of interest
        silscores = silscores[Iloc]
        sillclust = sillclust[Iloc]
        clustlabx = clustlabx[Iloc, :]
        clustcutt = clustcutt[Iloc]
        # Best candidate = highest silhouette score; shift labels to start at 0.
        idx = np.argmax(silscores)
        clustlabx = clustlabx[idx, :] - 1
    else:
        if verbose >= 3: print('[clusteval] >No clusters detected.')

    # Store results
    results = {}
    results['method'] = 'silhouette'
    results['score'] = pd.DataFrame(np.array([sillclust, silscores]).T, columns=['clusters', 'score'])
    results['score']['clusters'] = results['score']['clusters'].astype(int)
    results['labx'] = clustlabx
    results['fig'] = {}
    results['fig']['silscores'] = silscores
    results['fig']['sillclust'] = sillclust
    results['fig']['clustcutt'] = clustcutt
    # Return
    return (results)
def fit(X, cluster='agglomerative', metric='euclidean', linkage='ward', min_clust=2, max_clust=25, Z=None, savemem=False, verbose=3):
    """
    Determine optimal number of clusters using dbindex.

    Description
    -----------
    This function returns the cluster labels for the optimal cut-off based on the chosen hierarchical clustering method.

    Parameters
    ----------
    X : Numpy-array.
        The rows are the features and the colums are the samples.
    cluster : str, (default: 'agglomerative')
        Clustering method type for clustering.
            * 'agglomerative'
            * 'kmeans'
    metric : str, (default: 'euclidean').
        Distance measure for the clustering, such as 'euclidean','hamming', etc.
    linkage : str, (default: 'ward')
        Linkage type for the clustering. 'ward','single',',complete','average','weighted','centroid','median'.
    min_clust : int, (default: 2)
        Minimum number of clusters (>=).
    max_clust : int, (default: 25)
        Maximum number of clusters (<=).
    Z : Object, (default: None).
        This will speed-up computation if you readily have Z. e.g., Z=linkage(X, method='ward', metric='euclidean').
    savemem : bool, (default: False)
        Save memory when working with large datasets. Note that this option is only in case of KMeans.
    verbose : int, optional (default: 3)
        Print message to screen [1-5]. The larger the number, the more information.

    Returns
    -------
    dict. with various keys. Note that the underneath keys can change based on the used methodtype.
    method: str
        Method name that is used for cluster evaluation.
    score: pd.DataFrame()
        The scoring values per clusters.
    labx: list
        Cluster labels.
    fig: list
        Relevant information to make the plot.

    Examples
    --------
    >>> # Import library
    >>> import clusteval.dbindex as dbindex
    >>> from sklearn.datasets import make_blobs
    >>> # Generate demo data
    >>> X, labels_true = make_blobs(n_samples=750, centers=6, n_features=10)
    >>> # Fit with default parameters
    >>> results = dbindex.fit(X)
    >>> # plot
    >>> dbindex.plot(results)

    """
    # Collect the run parameters in one dictionary.
    Param = {}
    Param['verbose'] = verbose
    Param['cluster'] = cluster
    Param['metric'] = metric
    Param['linkage'] = linkage
    Param['min_clust'] = min_clust
    Param['max_clust'] = max_clust
    Param['savemem'] = savemem
    if verbose>=3: print('[clusteval] >Evaluate using dbindex.')

    # Memory-saving variant only applies to kmeans.
    if Param['cluster']=='kmeans':
        if Param['savemem']:
            kmeansmodel=MiniBatchKMeans
            print('[clusteval] >Save memory enabled for kmeans.')
        else:
            kmeansmodel=KMeans

    # Compute the linkage matrix once, unless the caller supplied it.
    if (Z is None) and (Param['cluster']!='kmeans'):
        Z = scipy_linkage(X, method=Param['linkage'], metric=Param['metric'])

    # Candidate cluster counts and per-candidate storage.
    # NOTE(review): np.arange excludes max_clust itself, while the docstring
    # says "Maximum number of clusters (<=)" — confirm whether the upper
    # bound should be max_clust + 1.
    clustcutt = np.arange(Param['min_clust'], Param['max_clust'])
    scores = np.zeros((len(clustcutt))) * np.nan
    dbclust = np.zeros((len(clustcutt))) * np.nan
    clustlabx = []

    # Evaluate every candidate cut of the dendrogram (or kmeans fit).
    for i in tqdm(range(len(clustcutt))):
        # Cut the dendrogram for i clusters
        if Param['cluster']=='kmeans':
            labx = kmeansmodel(n_clusters=clustcutt[i], verbose=0).fit(X).labels_
        else:
            labx = fcluster(Z, clustcutt[i], criterion='maxclust')
        # Store labx for cluster-cut
        clustlabx.append(labx)
        # Store number of unique clusters
        dbclust[i] = len(np.unique(labx))
        # The Davies-Bouldin index needs more than one cluster.
        if dbclust[i] > 1:
            scores[i] = _dbindex_score(X, labx)

    # Convert to array
    clustlabx = np.array(clustlabx)

    # Keep only candidates with a valid score inside the requested range.
    I1 = np.isnan(scores)==False
    I2 = dbclust>=Param['min_clust']
    I3 = dbclust<=Param['max_clust']
    Iloc = I1 & I2 & I3

    # BUGFIX: the original guard was `len(Iloc) > 0`, which is True whenever
    # any candidate was evaluated (Iloc is a full-length boolean mask), so
    # np.argmin could run on an empty selection. np.any(Iloc) checks whether
    # any candidate actually survived the filter.
    if np.any(Iloc):
        # Get only clusters of interest
        scores = scores[Iloc]
        dbclust = dbclust[Iloc]
        clustlabx = clustlabx[Iloc, :]
        clustcutt = clustcutt[Iloc]
        # Best candidate = lowest Davies-Bouldin index; shift labels to start at 0.
        idx = np.argmin(scores)
        clustlabx = clustlabx[idx, :] - 1
    else:
        if verbose>=3: print('[clusteval] >No clusters detected.')

    # Store results
    results = {}
    results['method'] = 'dbindex'
    results['score'] = pd.DataFrame(np.array([dbclust, scores]).T, columns=['clusters', 'score'])
    # Bracket-style column assignment, consistent with the silhouette sibling.
    results['score']['clusters'] = results['score']['clusters'].astype(int)
    results['labx'] = clustlabx
    results['fig'] = {}
    results['fig']['dbclust'] = dbclust
    results['fig']['scores'] = scores
    results['fig']['clustcutt'] = clustcutt
    # Return
    return(results)