Beispiel #1
0
    def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Cluster the rows of self.data and, when requested, its columns.

        The row linkage is stored in self.clustering. For linkages that are
        not in RAW_LINKAGES the pairwise distances are also kept in
        self.distance_vector. self.column_clustering is reset to an empty
        list before the optional column clustering runs.

        @row_distance/column_distance - see. DISTANCES variable
        @row_linkage/column_linkage - see. LINKAGES variable
        @axis - row/both

        The data type is not a parameter; it is read from self.datatype.
        Raises Exception when row_distance is not listed in DISTANCES for
        self.datatype (checked only for linkages outside RAW_LINKAGES).
        """
        print("Clustering rows:", row_distance, row_linkage)
        self.clustering_axis = axis
        row_linkage = str(row_linkage)

        if row_linkage in RAW_LINKAGES:
            # These linkages are computed from the observations themselves,
            # so the metric is handed straight to fastcluster.
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
        else:
            # Validate the metric BEFORE computing the pairwise distances, so
            # that an unsupported choice fails fast with the intended message
            # instead of after (or inside) the pdist call.
            if self.datatype == "numeric":
                datatype_label = "numeric"
            elif self.datatype == "binary" or self.datatype == "nominal":
                datatype_label = "binary or nominal"
            else:
                # Other data types are not validated here.
                datatype_label = None

            if datatype_label is not None and row_distance not in DISTANCES[self.datatype]:
                raise Exception(
                    "When clustering {} data you must choose from these distance measures: {}".format(
                        datatype_label, ", ".join(DISTANCES[self.datatype])
                    )
                )

            self.distance_vector = fastcluster.pdist(self.data, row_distance)
            self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

        if self.missing_values is not False:
            self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)
        self.column_clustering = []

        # Columns are clustered only when a row holds more than two values.
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original or self.datatype == "nominal":
            self.data = self.original_data
    def test_basic_clustering(self):
        """Check the linkage fastcluster produces for five 2-D points.

        Each expected row is (first cluster, second cluster, merge distance,
        size of the merged cluster), compared against the linkage computed
        from the condensed pairwise distances with default settings.
        """
        points = np.array([
            [1.0, 2.0],
            [2.0, 1.0],
            [2.1, 1.1],
            [2, 1.1],
            [1.0, 2.1],
        ])

        merges = fastcluster.linkage(fastcluster.pdist(points)).tolist()

        expected_merges = (
            (0, 4, 0.1, 2),
            (1, 3, 0.1, 2),
            (2, 6, 0.1, 3),
            (5, 7, 1.34536, 5),
        )
        for step, (left, right, height, size) in enumerate(expected_merges):
            row = merges[step]
            assert_that(int(row[0])).is_equal_to(left)
            assert_that(int(row[1])).is_equal_to(right)
            assert_that(row[2]).is_close_to(height, 0.00001)
            assert_that(int(row[3])).is_equal_to(size)
    def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Cluster the rows of self.data and, when requested, its columns.

        The row linkage is stored in self.clustering. For linkages that are
        not in RAW_LINKAGES the pairwise distances are also kept in
        self.distance_vector. self.column_clustering is reset to an empty
        list before the optional column clustering runs.

        @row_distance/column_distance - see. DISTANCES variable
        @row_linkage/column_linkage - see. LINKAGES variable
        @axis - row/both

        The data type is not a parameter; it is read from self.datatype.
        Raises Exception when row_distance is not listed in DISTANCES for
        self.datatype (checked only for linkages outside RAW_LINKAGES).
        """
        print("Clustering rows:", row_distance, row_linkage)
        self.clustering_axis = axis
        row_linkage = str(row_linkage)

        if row_linkage in RAW_LINKAGES:
            # These linkages are computed from the observations themselves,
            # so the metric is handed straight to fastcluster.
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
        else:
            # Validate the metric BEFORE computing the pairwise distances, so
            # that an unsupported choice fails fast with the intended message
            # instead of after (or inside) the pdist call.
            if self.datatype == "numeric":
                datatype_label = "numeric"
            elif self.datatype == "binary" or self.datatype == "nominal":
                datatype_label = "binary or nominal"
            else:
                # Other data types are not validated here.
                datatype_label = None

            if datatype_label is not None and row_distance not in DISTANCES[self.datatype]:
                raise Exception(
                    "When clustering {} data you must choose from these distance measures: {}".format(
                        datatype_label, ", ".join(DISTANCES[self.datatype])
                    )
                )

            self.distance_vector = fastcluster.pdist(self.data, row_distance)
            self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

        # NOTE(review): this reads `missing_value` (singular) while the index
        # attribute is `missing_values_indexes` - confirm the attribute name
        # against where the instance sets it.
        if self.missing_value is not False:
            self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)
        self.column_clustering = []

        # Columns are clustered only when a row holds more than two values.
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original or self.datatype == "nominal":
            self.data = self.original_data
Beispiel #4
0
    def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Cluster the rows of self.data and, when requested, its columns.

        The row linkage is stored in self.clustering. For linkages that are
        not in RAW_LINKAGES the pairwise distances are also kept in
        self.distance_vector. self.column_clustering is reset to an empty
        list before the optional column clustering runs.

        @data_type - numeric/binary (must be a key of DISTANCES)
        @row_distance/column_distance - see. DISTANCES variable
        @row_linkage/column_linkage - see. LINKAGES variable
        @axis - row/both

        Raises Exception when, for a linkage outside RAW_LINKAGES, data_type
        is not a key of DISTANCES or row_distance is not listed for it.
        """
        print("Clustering rows:", row_distance, row_linkage)
        self.data_type = data_type
        self.clustering_axis = axis
        row_linkage = str(row_linkage)

        if row_linkage in RAW_LINKAGES:
            # These linkages are computed from the observations themselves,
            # so the metric is handed straight to fastcluster.
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
        else:
            # Validate BEFORE computing the pairwise distances so that a bad
            # argument fails fast with the intended message.
            if data_type not in DISTANCES:
                raise Exception(
                    "You can choose only from data types: {}".format(", ".join(DISTANCES.keys()))
                )
            if row_distance not in DISTANCES[data_type]:
                # The pieces used to be joined without separators, which
                # produced "When clusteringnumericdata you must ...".
                raise Exception(
                    "When clustering {} data you must choose from these distance measures: {}".format(
                        data_type, ", ".join(DISTANCES[data_type])
                    )
                )

            self.distance_vector = fastcluster.pdist(self.data, row_distance)
            self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

        self.column_clustering = []
        # Columns are clustered only when a row holds more than two values.
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original:
            self.data = self.original_data
Beispiel #5
0
def optimal_linkage(data, rows=True, method='ward', metric='euclidean'):
    """Build a linkage for *data* and reorder its leaves with polo.

    When *rows* is false the transpose of *data* is clustered instead.
    The condensed distances come from ``fastcluster.pdist`` using *metric*,
    the linkage from ``fastcluster.linkage`` using *method*, and the result
    is whatever ``polo.optimal_leaf_ordering`` returns for that pair.
    """
    observations = data if rows else data.T
    condensed = fastcluster.pdist(observations, metric=metric)
    tree = fastcluster.linkage(condensed, method=method)
    return polo.optimal_leaf_ordering(tree, condensed)
Beispiel #6
0
def cal_cophenetic(C):
    """Calculate the cophenetic correlation coefficient for observations C.

    C is clustered with ``fc.linkage_vector`` (default settings). The
    returned value is the off-diagonal entry of ``np.corrcoef`` between the
    original pairwise distances of C and the cophenetic distances of the
    resulting linkage.
    """
    print("=== calculate cophenetic correlation coefficient ===")
    tree = fc.linkage_vector(C)
    original_distances = fc.pdist(C)
    cophenetic_distances = cophenet(tree)
    # corrcoef yields a 2x2 matrix; [0, 1] is the cross-correlation term.
    return np.corrcoef(original_distances, cophenetic_distances)[0, 1]
Beispiel #7
0
    def cophenetic_correlation(consensus_matrix):
        """Calculate the cophenetic correlation coefficient of a consensus matrix.

        The rows of the matrix are treated as observations: their pairwise
        distances are computed, clustered with average linkage, and compared
        with the cophenetic distances of that linkage.

        Arguments:
                consensus_matrix (np.array): the unordered consensus matrix

        Returns:
                the cophenetic correlation coefficient, i.e. the first value
                returned by ``cophenet`` (a correlation, not an integer)
        """
        pairwise = fc.pdist(consensus_matrix)
        tree = fc.linkage(pairwise, method="average")
        # cophenet also returns the cophenetic distances; only the
        # coefficient is needed here.
        coefficient, _ = cophenet(tree, pairwise)
        return coefficient
Beispiel #8
0
def _cluster_peaks(mzs: Sequence[float],
                   ppm: float,
                   distype: str = 'euclidean',
                   linkmode: str = 'centroid'):
    """Cluster m/z values hierarchically and cut the tree at a ppm height.

    The distance between two values is their ``distype`` distance divided by
    (mean of the pair * 1e-6), i.e. expressed in parts per million of the
    pair mean. The linkage is built with ``linkmode`` and cut at ``ppm``.

    Intermediate arrays are updated in place and deleted as early as
    possible to keep peak memory low.

    Args:
        mzs: the m/z values to cluster.
        ppm: height at which the cluster tree is cut.
        distype: metric passed to ``fc.pdist``.
        linkmode: method passed to ``fc.linkage``.

    Returns:
        The result of ``cluster.hierarchy.cut_tree``. For empty input an
        empty 1-D array is returned; for a single value a (1, 1) array
        holding 0.
    """
    if len(mzs) == 0:
        return np.array([])
    if len(mzs) == 1:
        return np.zeros_like(mzs, dtype=int).reshape((-1, 1))

    pair_means = np.add.outer(mzs, mzs)
    if pair_means.dtype.kind in "iu":
        # Integer-valued input: the in-place true division below cannot
        # store its float result in an integer array and would raise.
        pair_means = pair_means.astype(float)
    # squareform expects a zero diagonal; self-pairs are not needed anyway.
    np.fill_diagonal(pair_means, 0)
    pair_means /= 2  # inplace operation to reduce memory usage

    # Condensed vector of pair means, aligned with the output of pdist.
    ppm_scale = squareform(pair_means)
    del pair_means  # reduce memory use

    observations = np.column_stack([mzs])
    mdist = fc.pdist(observations, metric=distype)
    del observations

    ppm_scale *= 1e-6  # inplace operation to reduce memory usage

    # errstate context avoids the global side effects of np.seterr
    with np.errstate(divide='ignore', invalid='ignore'):
        mdist /= ppm_scale  # inplace operation to reduce memory usage
        # 0/0 (e.g. a pair of zeros) yields NaN; treat it as zero distance.
        mdist[np.isnan(mdist)] = 0.0

    z = fc.linkage(mdist, method=linkmode)
    del mdist, ppm_scale

    # cut tree at ppm threshold
    return cluster.hierarchy.cut_tree(z, height=ppm)
Beispiel #9
0
def _cluster_peaks(mzs, ppm, distype='euclidean', linkmode='centroid'):
    """Cluster m/z values hierarchically and cut the tree at a ppm height.

    Pairwise ``distype`` distances are divided by (mean of the pair * 1e-6),
    which expresses them in parts per million of the pair mean. The result
    is linked with ``linkmode`` and the tree is cut at ``ppm``.

    Returns the output of ``cluster.hierarchy.cut_tree``. Empty input gives
    an empty 1-D array, and a single value gives a (1, 1) array holding 0.
    """
    n_peaks = len(mzs)
    if n_peaks == 0:
        return np.array([])
    if n_peaks == 1:
        return np.zeros_like(mzs, dtype=int).reshape((-1, 1))

    # Condensed pairwise distances between the values (one column each).
    raw_distances = fc.pdist(np.column_stack([mzs]), metric=distype)

    # Condensed pairwise means, scaled so that a division yields ppm.
    pair_sums = np.add.outer(mzs, mzs)
    np.fill_diagonal(pair_sums, 0)  # squareform expects a zero diagonal
    per_ppm = squareform(pair_sums / 2) * 1e-6

    # errstate context avoids the global side effects of np.seterr
    with np.errstate(divide='ignore', invalid='ignore'):
        ppm_distances = raw_distances / per_ppm
        ppm_distances[np.isnan(ppm_distances)] = 0.0

    tree = fc.linkage(ppm_distances, method=linkmode)

    # cut tree at ppm threshold & order matches the order of mzs
    return cluster.hierarchy.cut_tree(tree, height=ppm)