class ClusterizationModel:
    """Thin wrapper that fits one of several clustering back-ends and plots the result.

    Parameters
    ----------
    n_clusters : int, optional
        Requested number of clusters. It is forwarded to every back-end
        except DBSCAN, which is built from ``kwargs`` only.
    model : str, optional
        Back-end selector: ``"hierarchy"``, ``"KMeans"``, ``"agglomerative"``
        or ``"dbscan"``. Any other value selects spectral clustering.
    **kwargs
        Extra keyword arguments passed to the back-end constructor.

    Attributes
    ----------
    n_clusters : int
        Number of clusters; replaced by the observed count after ``fit``.
    labels : sequence of int
        Label of each sample as reported by the back-end (empty before ``fit``).
    X : array-like or pandas.DataFrame
        The samples given to the last ``fit`` call, stored unmodified.
    model_name : str
        Name used in the default plot title.
    base_model : object
        The wrapped clustering estimator.
    """

    def __init__(self, n_clusters=8, model="agglomerative", **kwargs):
        self.n_clusters = n_clusters
        self.labels = []
        self.X = []
        self.model_name = model
        if model == "hierarchy":
            self.base_model = HierarchyModel(n_clusters, **kwargs)
        elif model == "KMeans":
            self.base_model = KMeans(n_clusters=n_clusters, **kwargs)
        elif model == "agglomerative":
            self.base_model = AgglomerativeClustering(
                linkage='ward', n_clusters=n_clusters, **kwargs)
        elif model == "dbscan":
            # DBSCAN discovers the number of clusters itself.
            self.base_model = DBSCAN(**kwargs)
        else:
            # Unknown names fall back to spectral clustering.
            self.base_model = cluster.SpectralClustering(
                n_clusters=n_clusters, **kwargs)
            self.model_name = "SpectralClustering"

    def _preproc_data(self, X):
        """Return the underlying array of a DataFrame; pass anything else through."""
        if isinstance(X, pd.DataFrame):
            # ``DataFrame.as_matrix()`` no longer exists in current pandas;
            # ``.values`` is available in every version.
            return X.values
        return X

    def fit(self, x, y=None):
        """Fit the wrapped clustering model and store the resulting labels.

        Parameters
        ----------
        x : array-like or pandas.DataFrame
            The input samples, shape = [n_samples, n_features].
        y : optional
            Forwarded unchanged to the back-end's ``fit``.

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = x
        self.base_model.fit(self._preproc_data(x), y)
        self.labels = self.base_model.labels_

        # Label -1 marks noise and is not counted as a cluster.
        found = len(set(np.unique(self.labels).tolist()) - {-1})
        if found != self.n_clusters:
            logging.warning(
                "Clustering model provides different cluster count than expected: %s instead of %s",
                found, self.n_clusters)
        self.n_clusters = found
        return self

    def get_mean_values(self):
        """Return per-cluster feature means plus a ``'Cluster size'`` column.

        The result is a DataFrame indexed by cluster label.
        """
        # Work on a copy so the caller's data never gains a 'label' column.
        data = pd.DataFrame(self.X).copy()
        # Assign positionally: wrapping the labels in a Series would align on
        # the index and yield NaN labels for a non-default DataFrame index.
        data['label'] = np.asarray(self.labels)
        grouped = data.groupby('label')
        result = grouped.mean()
        result['Cluster size'] = grouped.size()
        return result

    def get_labels(self):
        """Return the labels stored by the last ``fit`` call."""
        return self.labels

    def get_silhouette_score(self):
        """Return the euclidean silhouette score of the stored samples and labels."""
        from sklearn.metrics import silhouette_score
        return silhouette_score(self.X, self.labels, metric='euclidean')

    def draw_clusters(self, method=None, title=None, axis=None, show=True, **kwargs):
        """Plot the fitted clustering.

        Parameters
        ----------
        method : str, optional
            ``"areas"`` colours the 2-D PCA plane by nearest-neighbour label,
            ``"dendrogram"`` draws the linkage tree of the back-end (which
            must provide ``Z`` and ``get_max_distance()``); anything else
            draws a scatter plot of the 2-D PCA projection.
        title : str, optional
            Plot title; defaults to the model name and cluster count.
        axis : matplotlib axes, optional
            Axes to draw on; the current pyplot axes are used when omitted.
        show : bool, optional
            Call ``plt.show()`` at the end.
        **kwargs
            ``parts_n`` (default 10): number of mesh steps per dimension in
            ``"areas"`` mode.
        """
        draw_obj = plt if axis is None else axis
        if title is None:
            title = self.model_name + " %s clusters total" % self.n_clusters

        if method == "dendrogram":
            # The tree is drawn from the linkage matrix; no projection needed.
            self._draw_dendrogram(plt.gca() if axis is None else axis)
        else:
            projector = PCA(n_components=2)
            reduced_data = projector.fit_transform(self._preproc_data(self.X))
            labels = np.asarray(self.labels)
            if method == "areas":
                self._draw_areas(draw_obj, projector, reduced_data, labels,
                                 kwargs.pop("parts_n", 10))
            else:
                self._draw_scatter(draw_obj, reduced_data, labels)

        # Zero-sized patch used only to carry the score text into the legend.
        patch = mpatches.Rectangle(
            [0, 0], 0, 0, color="black",
            label="score = " + str(self.get_silhouette_score()))
        draw_obj.legend(handles=[patch])

        if axis is None:
            plt.title(title)
        else:
            axis.set_title(title)
        if show:
            plt.show()

    def _draw_areas(self, draw_obj, projector, reduced_data, labels, parts_n):
        """Colour the projected plane by k-NN predicted label and overlay the samples."""
        x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
        y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
        # Mesh step sizes; a larger parts_n gives a finer mesh.
        h_x = (x_max - x_min) / parts_n
        h_y = (y_max - y_min) / parts_n
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x),
                             np.arange(y_min, y_max, h_y))

        # Label every mesh point with a classifier trained on the projection.
        neighbors_classifier = KNeighborsClassifier().fit(reduced_data, labels)
        Z = neighbors_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        draw_obj.imshow(Z, interpolation='nearest',
                        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                        cmap=plt.cm.Paired, aspect='auto', origin='lower')
        draw_obj.plot(reduced_data[:, 0], reduced_data[:, 1], 'o', markersize=9)

        centroids = getattr(self.base_model, "cluster_centers_", None)
        if centroids is not None:
            # Centroids live in the original feature space, so they must go
            # through the same PCA projection as the samples before plotting.
            centroids_2d = projector.transform(centroids)
            draw_obj.scatter(centroids_2d[:, 0], centroids_2d[:, 1],
                             marker='x', s=169, linewidths=9, color='w',
                             zorder=10)

    def _draw_dendrogram(self, ax, annotate_above=3):
        """Draw the truncated, annotated dendrogram of the back-end on ``ax``."""
        max_d = self.base_model.get_max_distance()
        options = dict(
            truncate_mode='lastp',  # show only the last p merged clusters
            p=400,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=8.,  # font size for the x axis labels
            show_contracted=True,  # distribution impression in truncated branches
            ax=ax,
        )
        if max_d:
            options['color_threshold'] = max_d
        ddata = dendrogram(self.base_model.Z, **options)

        ax.set_xlabel('sample index or (cluster size)')
        ax.set_ylabel('distance')
        for icoord, dcoord, color in zip(ddata['icoord'], ddata['dcoord'],
                                         ddata['color_list']):
            x = 0.5 * sum(icoord[1:3])
            y = dcoord[1]
            # Skip low merges so annotations do not overlap in small plots.
            if y > annotate_above:
                ax.plot(x, y, 'o', c=color)
                ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                            textcoords='offset points', va='top', ha='center')
        if max_d:
            ax.axhline(y=max_d, c='k')
        return ddata

    def _draw_scatter(self, draw_obj, reduced_data, labels):
        """Scatter the projected samples per cluster; core samples get larger markers."""
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_indices = getattr(self.base_model, 'core_sample_indices_', None)
        if core_indices is not None:
            core_samples_mask[core_indices] = True

        # np.unique yields sorted labels, making colour assignment deterministic.
        unique_labels = np.unique(labels)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black is used for noise.
                col = 'k'
            class_member_mask = (labels == k)
            core_points = reduced_data[class_member_mask & core_samples_mask]
            other_points = reduced_data[class_member_mask & ~core_samples_mask]
            draw_obj.plot(core_points[:, 0], core_points[:, 1], 'o',
                          markerfacecolor=col, markeredgecolor='k',
                          markersize=14)
            draw_obj.plot(other_points[:, 0], other_points[:, 1], 'o',
                          markerfacecolor=col, markeredgecolor='k',
                          markersize=9)
class ClusterizationModel:
    """Thin wrapper that fits one of several clustering back-ends and plots the result.

    Parameters
    ----------
    n_clusters : int, optional
        Requested number of clusters. It is forwarded to every back-end
        except DBSCAN, which is built from ``kwargs`` only.
    model : str, optional
        Back-end selector: ``"hierarchy"``, ``"KMeans"``, ``"agglomerative"``
        or ``"dbscan"``. Any other value selects spectral clustering.
    **kwargs
        Extra keyword arguments passed to the back-end constructor.

    Attributes
    ----------
    n_clusters : int
        Number of clusters; replaced by the observed count after ``fit``.
    labels : sequence of int
        Label of each sample as reported by the back-end (empty before ``fit``).
    X : array-like or pandas.DataFrame
        The samples given to the last ``fit`` call, stored unmodified.
    model_name : str
        Name used in the default plot title.
    base_model : object
        The wrapped clustering estimator.
    """

    def __init__(self, n_clusters=8, model="agglomerative", **kwargs):
        self.n_clusters = n_clusters
        self.labels = []
        self.X = []
        self.model_name = model
        if model == "hierarchy":
            self.base_model = HierarchyModel(n_clusters, **kwargs)
        elif model == "KMeans":
            self.base_model = KMeans(n_clusters=n_clusters, **kwargs)
        elif model == "agglomerative":
            self.base_model = AgglomerativeClustering(
                linkage='ward', n_clusters=n_clusters, **kwargs)
        elif model == "dbscan":
            # DBSCAN discovers the number of clusters itself.
            self.base_model = DBSCAN(**kwargs)
        else:
            # Unknown names fall back to spectral clustering.
            self.base_model = cluster.SpectralClustering(
                n_clusters=n_clusters, **kwargs)
            self.model_name = "SpectralClustering"

    def _preproc_data(self, X):
        """Return the underlying array of a DataFrame; pass anything else through."""
        if isinstance(X, pd.DataFrame):
            # ``DataFrame.as_matrix()`` no longer exists in current pandas;
            # ``.values`` is available in every version.
            return X.values
        return X

    def fit(self, x, y=None):
        """Fit the wrapped clustering model and store the resulting labels.

        Parameters
        ----------
        x : array-like or pandas.DataFrame
            The input samples, shape = [n_samples, n_features].
        y : optional
            Forwarded unchanged to the back-end's ``fit``.

        Returns
        -------
        self : object
            Returns self.
        """
        self.X = x
        self.base_model.fit(self._preproc_data(x), y)
        self.labels = self.base_model.labels_

        # Label -1 marks noise and is not counted as a cluster.
        found = len(set(np.unique(self.labels).tolist()) - {-1})
        if found != self.n_clusters:
            logging.warning(
                "Clustering model provides different cluster count than expected: %s instead of %s",
                found, self.n_clusters)
        self.n_clusters = found
        return self

    def get_mean_values(self):
        """Return per-cluster feature means plus a ``'Cluster size'`` column.

        The result is a DataFrame indexed by cluster label.
        """
        # Work on a copy so the caller's data never gains a 'label' column.
        data = pd.DataFrame(self.X).copy()
        # Assign positionally: wrapping the labels in a Series would align on
        # the index and yield NaN labels for a non-default DataFrame index.
        data['label'] = np.asarray(self.labels)
        grouped = data.groupby('label')
        result = grouped.mean()
        result['Cluster size'] = grouped.size()
        return result

    def get_labels(self):
        """Return the labels stored by the last ``fit`` call."""
        return self.labels

    def get_silhouette_score(self):
        """Return the euclidean silhouette score of the stored samples and labels."""
        from sklearn.metrics import silhouette_score
        return silhouette_score(self.X, self.labels, metric='euclidean')

    def draw_clusters(self, method=None, title=None, axis=None, show=True, **kwargs):
        """Plot the fitted clustering.

        Parameters
        ----------
        method : str, optional
            ``"areas"`` colours the 2-D PCA plane by nearest-neighbour label,
            ``"dendrogram"`` draws the linkage tree of the back-end (which
            must provide ``Z`` and ``get_max_distance()``); anything else
            draws a scatter plot of the 2-D PCA projection.
        title : str, optional
            Plot title; defaults to the model name and cluster count.
        axis : matplotlib axes, optional
            Axes to draw on; the current pyplot axes are used when omitted.
        show : bool, optional
            Call ``plt.show()`` at the end.
        **kwargs
            ``parts_n`` (default 10): number of mesh steps per dimension in
            ``"areas"`` mode.
        """
        draw_obj = plt if axis is None else axis
        if title is None:
            title = self.model_name + " %s clusters total" % self.n_clusters

        if method == "dendrogram":
            # The tree is drawn from the linkage matrix; no projection needed.
            self._draw_dendrogram(plt.gca() if axis is None else axis)
        else:
            projector = PCA(n_components=2)
            reduced_data = projector.fit_transform(self._preproc_data(self.X))
            labels = np.asarray(self.labels)
            if method == "areas":
                self._draw_areas(draw_obj, projector, reduced_data, labels,
                                 kwargs.pop("parts_n", 10))
            else:
                self._draw_scatter(draw_obj, reduced_data, labels)

        # Zero-sized patch used only to carry the score text into the legend.
        patch = mpatches.Rectangle(
            [0, 0], 0, 0, color="black",
            label="score = " + str(self.get_silhouette_score()))
        draw_obj.legend(handles=[patch])

        if axis is None:
            plt.title(title)
        else:
            axis.set_title(title)
        if show:
            plt.show()

    def _draw_areas(self, draw_obj, projector, reduced_data, labels, parts_n):
        """Colour the projected plane by k-NN predicted label and overlay the samples."""
        x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
        y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
        # Mesh step sizes; a larger parts_n gives a finer mesh.
        h_x = (x_max - x_min) / parts_n
        h_y = (y_max - y_min) / parts_n
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x),
                             np.arange(y_min, y_max, h_y))

        # Label every mesh point with a classifier trained on the projection.
        neighbors_classifier = KNeighborsClassifier().fit(reduced_data, labels)
        Z = neighbors_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        draw_obj.imshow(Z, interpolation='nearest',
                        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                        cmap=plt.cm.Paired, aspect='auto', origin='lower')
        draw_obj.plot(reduced_data[:, 0], reduced_data[:, 1], 'o', markersize=9)

        centroids = getattr(self.base_model, "cluster_centers_", None)
        if centroids is not None:
            # Centroids live in the original feature space, so they must go
            # through the same PCA projection as the samples before plotting.
            centroids_2d = projector.transform(centroids)
            draw_obj.scatter(centroids_2d[:, 0], centroids_2d[:, 1],
                             marker='x', s=169, linewidths=9, color='w',
                             zorder=10)

    def _draw_dendrogram(self, ax, annotate_above=3):
        """Draw the truncated, annotated dendrogram of the back-end on ``ax``."""
        max_d = self.base_model.get_max_distance()
        options = dict(
            truncate_mode='lastp',  # show only the last p merged clusters
            p=400,
            leaf_rotation=90.,  # rotates the x axis labels
            leaf_font_size=8.,  # font size for the x axis labels
            show_contracted=True,  # distribution impression in truncated branches
            ax=ax,
        )
        if max_d:
            options['color_threshold'] = max_d
        ddata = dendrogram(self.base_model.Z, **options)

        ax.set_xlabel('sample index or (cluster size)')
        ax.set_ylabel('distance')
        for icoord, dcoord, color in zip(ddata['icoord'], ddata['dcoord'],
                                         ddata['color_list']):
            x = 0.5 * sum(icoord[1:3])
            y = dcoord[1]
            # Skip low merges so annotations do not overlap in small plots.
            if y > annotate_above:
                ax.plot(x, y, 'o', c=color)
                ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                            textcoords='offset points', va='top', ha='center')
        if max_d:
            ax.axhline(y=max_d, c='k')
        return ddata

    def _draw_scatter(self, draw_obj, reduced_data, labels):
        """Scatter the projected samples per cluster; core samples get larger markers."""
        core_samples_mask = np.zeros_like(labels, dtype=bool)
        core_indices = getattr(self.base_model, 'core_sample_indices_', None)
        if core_indices is not None:
            core_samples_mask[core_indices] = True

        # np.unique yields sorted labels, making colour assignment deterministic.
        unique_labels = np.unique(labels)
        colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black is used for noise.
                col = 'k'
            class_member_mask = (labels == k)
            core_points = reduced_data[class_member_mask & core_samples_mask]
            other_points = reduced_data[class_member_mask & ~core_samples_mask]
            draw_obj.plot(core_points[:, 0], core_points[:, 1], 'o',
                          markerfacecolor=col, markeredgecolor='k',
                          markersize=14)
            draw_obj.plot(other_points[:, 0], other_points[:, 1], 'o',
                          markerfacecolor=col, markeredgecolor='k',
                          markersize=9)