def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row",
                 column_distance="euclidean", column_linkage="ward"):
    """Cluster the rows of ``self.data`` and, optionally, its columns.

    The result of the row clustering is stored in ``self.clustering``.

    @row_distance/column_distance - see. DISTANCES variable
    @row_linkage/column_linkage - see. LINKAGES variable
    @axis - row/both

    The data type is not a parameter; it is read from ``self.datatype``
    (the validation below handles "numeric", "binary" and "nominal").

    Raises ``Exception`` when ``row_linkage`` is not one of RAW_LINKAGES and
    ``row_distance`` is not listed in ``DISTANCES[self.datatype]``.
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        # These linkages are computed by fastcluster directly from the raw data.
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        # Validate the distance measure *before* computing the distance
        # vector, so an unsupported measure fails fast with the descriptive
        # message instead of after (or inside) the pdist computation.
        if self.datatype == "numeric" and row_distance not in DISTANCES[self.datatype]:
            raise Exception(
                "When clustering numeric data you must choose from these distance measures: "
                + ", ".join(DISTANCES[self.datatype])
            )
        elif self.datatype in ("binary", "nominal") and row_distance not in DISTANCES[self.datatype]:
            raise Exception(
                "When clustering binary or nominal data you must choose from these distance measures: "
                + ", ".join(DISTANCES[self.datatype])
            )

        self.distance_vector = fastcluster.pdist(self.data, row_distance)
        self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

    # ``False`` is the sentinel for "no missing values"; any other value
    # means the data is passed through __return_missing_values__.
    if self.missing_values is not False:
        self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)

    self.column_clustering = []
    # Columns are clustered only on request and only with more than two columns.
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original or self.datatype == "nominal":
        self.data = self.original_data
def test_basic_clustering(self):
    """Check the linkage fastcluster produces for five 2-D points.

    Each expected row lists the two merged cluster ids, the merge distance
    and the size of the new cluster (the SciPy-style linkage row layout).
    """
    points = np.array([
        [1.0, 2.0],
        [2.0, 1.0],
        [2.1, 1.1],
        [2, 1.1],
        [1.0, 2.1],
    ])
    merges = fastcluster.linkage(fastcluster.pdist(points)).tolist()

    expected_rows = (
        (0, 4, 0.1, 2),
        (1, 3, 0.1, 2),
        (2, 6, 0.1, 3),
        (5, 7, 1.34536, 5),
    )
    for step, (first, second, height, size) in enumerate(expected_rows):
        row = merges[step]
        assert_that(int(row[0])).is_equal_to(first)
        assert_that(int(row[1])).is_equal_to(second)
        assert_that(row[2]).is_close_to(height, 0.00001)
        assert_that(int(row[3])).is_equal_to(size)
def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row",
                 column_distance="euclidean", column_linkage="ward"):
    """Cluster the rows of ``self.data`` and, optionally, its columns.

    The result of the row clustering is stored in ``self.clustering``.

    @row_distance/column_distance - see. DISTANCES variable
    @row_linkage/column_linkage - see. LINKAGES variable
    @axis - row/both

    The data type is not a parameter; it is read from ``self.datatype``
    (the validation below handles "numeric", "binary" and "nominal").

    Raises ``Exception`` when ``row_linkage`` is not one of RAW_LINKAGES and
    ``row_distance`` is not listed in ``DISTANCES[self.datatype]``.
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        # These linkages are computed by fastcluster directly from the raw data.
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        # Check the distance measure first: validating after pdist would
        # either waste the whole distance computation or let pdist fail
        # with a less helpful error before the check is reached.
        if self.datatype == "numeric" and row_distance not in DISTANCES[self.datatype]:
            raise Exception(
                "When clustering numeric data you must choose from these distance measures: "
                + ", ".join(DISTANCES[self.datatype])
            )
        elif self.datatype in ("binary", "nominal") and row_distance not in DISTANCES[self.datatype]:
            raise Exception(
                "When clustering binary or nominal data you must choose from these distance measures: "
                + ", ".join(DISTANCES[self.datatype])
            )

        self.distance_vector = fastcluster.pdist(self.data, row_distance)
        self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

    # ``False`` is the sentinel for "no missing value configured"; any other
    # value means the data is passed through __return_missing_values__.
    if self.missing_value is not False:
        self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)

    self.column_clustering = []
    # Columns are clustered only on request and only with more than two columns.
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original or self.datatype == "nominal":
        self.data = self.original_data
def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single",
                 axis="row", column_distance="euclidean", column_linkage="ward"):
    """Cluster the rows of ``self.data`` and, optionally, its columns.

    The result of the row clustering is stored in ``self.clustering``.

    @data_type - numeric/binary (must be a key of the DISTANCES variable)
    @row_distance/column_distance - see. DISTANCES variable
    @row_linkage/column_linkage - see. LINKAGES variable
    @axis - row/both

    Raises ``Exception`` when ``row_linkage`` is not one of RAW_LINKAGES and
    either ``data_type`` is not a key of DISTANCES or ``row_distance`` is not
    listed in ``DISTANCES[data_type]``.
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.data_type = data_type
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        # These linkages are computed by fastcluster directly from the raw data.
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        # Validate before computing distances so bad arguments fail fast
        # with the descriptive message rather than after (or inside) pdist.
        if data_type not in DISTANCES:
            raise Exception("You can choose only from data types: " + ", ".join(DISTANCES.keys()))
        if row_distance not in DISTANCES[data_type]:
            # The pieces used to be joined without separators, which
            # produced "When clusteringnumericdata you must ...".
            raise Exception(
                "When clustering " + data_type
                + " data you must choose from these distance measures: "
                + ", ".join(DISTANCES[data_type])
            )

        self.distance_vector = fastcluster.pdist(self.data, row_distance)
        self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

    self.column_clustering = []
    # Columns are clustered only on request and only with more than two columns.
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original:
        self.data = self.original_data
def optimal_linkage(data, rows=True, method='ward', metric='euclidean'):
    """Return a hierarchical clustering of ``data`` after leaf reordering.

    With ``rows`` true the data is clustered as given; otherwise its
    transpose (``data.T``) is clustered. Pairwise distances use ``metric``,
    the linkage uses ``method``, and the linkage is then passed, together
    with the distances, through ``polo.optimal_leaf_ordering``.
    """
    observations = data if rows else data.T
    condensed = fastcluster.pdist(observations, metric=metric)
    tree = fastcluster.linkage(condensed, method=method)
    return polo.optimal_leaf_ordering(tree, condensed)
def cal_cophenetic(C):
    """
    calculate cophenetic correlation coefficient

    Clusters the observations in ``C`` with ``fc.linkage_vector`` (default
    settings) and returns the correlation, taken from ``np.corrcoef``,
    between the original pairwise distances and the cophenetic distances
    of that clustering.
    """
    print("=== calculate cophenetic correlation coefficient ===")
    tree = fc.linkage_vector(C)            # clustering of the observations
    original_distances = fc.pdist(C)       # distances between observations
    cophenetic_distances = cophenet(tree)  # cophenetic distances from the tree
    correlation_matrix = np.corrcoef(original_distances, cophenetic_distances)
    return correlation_matrix[0, 1]
def cophenetic_correlation(consensus_matrix):
    """Calculates the cophenetic correlation coefficient from a consensus matrix.

    Pairwise distances between the rows of the matrix are clustered with
    average linkage, and the resulting tree is compared against those same
    distances by ``cophenet``.

    Arguments:
        consensus_matrix (np.array): the unordered consensus matrix

    Returns:
        the cophenetic correlation coefficient, i.e. the first of the two
        values returned by ``cophenet``
    """
    pairwise = fc.pdist(consensus_matrix)
    tree = fc.linkage(pairwise, method="average")
    # cophenet also returns a second value, which is not needed here.
    correlation, _ = cophenet(tree, pairwise)
    return correlation
def _cluster_peaks(mzs: Sequence[float],
                   ppm: float,
                   distype: str = 'euclidean',
                   linkmode: str = 'centroid'):
    """Cluster m/z values hierarchically and cut the tree at ``ppm``.

    The pairwise distance between two peaks (computed with ``distype``) is
    divided by ``1e-6`` times the mean of the two m/z values, so distances
    are expressed relative to the pair's mean m/z in parts per million
    before linkage. Intermediate arrays are updated in place and deleted
    as early as possible to keep peak memory use down.

    Args:
        mzs: m/z values; integer input is accepted and converted to float.
        ppm: height at which the cluster tree is cut.
        distype: metric passed to ``fc.pdist``.
        linkmode: linkage method passed to ``fc.linkage``.

    Returns:
        An empty array for empty input, a ``(1, 1)`` array holding ``0`` for
        a single value, otherwise the result of
        ``cluster.hierarchy.cut_tree`` for the computed linkage.
    """
    if len(mzs) == 0:
        return np.array([])
    if len(mzs) == 1:
        return np.zeros_like(mzs, dtype=int).reshape((-1, 1))

    # The in-place division below cannot be applied to an integer array
    # (NumPy refuses to cast the float result back), so coerce any
    # non-floating input once. Floating inputs keep their dtype.
    mz_values = np.asarray(mzs)
    if not np.issubdtype(mz_values.dtype, np.floating):
        mz_values = mz_values.astype(float)

    # Mean m/z of every pair, in condensed form.
    outer_mzs = np.add.outer(mz_values, mz_values)
    np.fill_diagonal(outer_mzs, 0)  # squareform expects a zero diagonal
    outer_mzs /= 2  # inplace operation to reduce memory usage
    mdist_mz_pair = squareform(outer_mzs)
    del outer_mzs  # reduce memory use

    m = np.column_stack([mz_values])
    mdist = fc.pdist(m, metric=distype)
    del m

    # Size of one ppm for each pair.
    mdist_mz_pair *= 1e-6  # inplace operation to reduce memory usage
    # using errstate context to avoid seterr side effects
    with np.errstate(divide='ignore', invalid='ignore'):
        mdist /= mdist_mz_pair  # inplace operation to reduce memory usage
    # 0/0 (both values zero) produces NaN; treat those pairs as identical.
    mdist[np.isnan(mdist)] = 0.0

    z = fc.linkage(mdist, method=linkmode)
    del mdist, mdist_mz_pair

    # cut tree at ppm threshold
    return cluster.hierarchy.cut_tree(z, height=ppm)
def _cluster_peaks(mzs, ppm, distype='euclidean', linkmode='centroid'):
    """Cluster m/z values hierarchically and cut the tree at ``ppm``.

    The pairwise distance between two peaks (computed with ``distype``) is
    divided by ``1e-6`` times the mean of the two m/z values before
    linkage with ``linkmode``.

    Returns an empty array for empty input, a ``(1, 1)`` array holding
    ``0`` for a single value, otherwise the result of
    ``cluster.hierarchy.cut_tree`` for the computed linkage.
    """
    n_peaks = len(mzs)
    if n_peaks == 0:
        return np.array([])
    if n_peaks == 1:
        return np.zeros_like(mzs, dtype=int).reshape((-1, 1))

    raw_dist = fc.pdist(np.column_stack([mzs]), metric=distype)

    # Mean m/z of every pair, condensed to line up with raw_dist.
    pair_sums = np.add.outer(mzs, mzs)
    np.fill_diagonal(pair_sums, 0)
    pair_means = squareform(pair_sums / 2)
    one_ppm = pair_means * 1e-6

    # using errstate context to avoid seterr side effects
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled_dist = raw_dist / one_ppm
    # 0/0 produces NaN; treat those pairs as having zero distance.
    scaled_dist[np.isnan(scaled_dist)] = 0.0

    tree = fc.linkage(scaled_dist, method=linkmode)
    # cut tree at ppm threshold & order matches the order of mzs
    return cluster.hierarchy.cut_tree(tree, height=ppm)