Beispiel #1
0
 def check_maxRstat_one_cluster_linkage(self, i):
     """Check ``maxRstat(Z, R, i)`` on a linkage that contains a single merge.

     ``i`` is the column of the inconsistency matrix ``R`` whose running
     maximum is computed. The result is compared against the reference
     helper ``calculate_maximum_inconsistencies`` for the same column.
     """
     Z = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
     R = np.asarray([[0, 0, 0, 0.3]], dtype=np.double)
     # Pass ``i`` through so that each column requested by the caller is
     # actually exercised instead of always testing column 1.
     MD = maxRstat(Z, R, i)
     expectedMD = calculate_maximum_inconsistencies(Z, R, i)
     assert_allclose(MD, expectedMD, atol=1e-15)
Beispiel #2
0
 def check_maxRstat_one_cluster_linkage(self, i):
     """Check ``maxRstat(Z, R, i)`` on a linkage that contains a single merge.

     ``i`` is the column of the inconsistency matrix ``R`` whose running
     maximum is computed. The result is compared against the reference
     helper ``calculate_maximum_inconsistencies`` for the same column.
     """
     Z = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
     R = np.asarray([[0, 0, 0, 0.3]], dtype=np.double)
     # Pass ``i`` through so that each column requested by the caller is
     # actually exercised instead of always testing column 1.
     MD = maxRstat(Z, R, i)
     expectedMD = calculate_maximum_inconsistencies(Z, R, i)
     assert_allclose(MD, expectedMD, atol=1e-15)
Beispiel #3
0
 def check_maxRstat_Q_linkage(self, method, i):
     """Check ``maxRstat(Z, R, i)`` on the Q data set.

     ``method`` is the linkage method used to build ``Z`` and ``i`` is the
     column of the inconsistency matrix ``R`` whose running maximum is
     computed. The result is compared against the reference helper
     ``calculate_maximum_inconsistencies`` for the same column.
     """
     X = hierarchy_test_data.Q_X
     Z = linkage(X, method)
     R = inconsistent(Z)
     # Pass ``i`` through so that each column requested by the caller is
     # actually exercised instead of always testing column 1.
     MD = maxRstat(Z, R, i)
     expectedMD = calculate_maximum_inconsistencies(Z, R, i)
     assert_allclose(MD, expectedMD, atol=1e-15)
Beispiel #4
0
 def check_maxRstat_Q_linkage(self, method, i):
     """Check ``maxRstat(Z, R, i)`` on the Q data set.

     ``method`` is the linkage method used to build ``Z`` and ``i`` is the
     column of the inconsistency matrix ``R`` whose running maximum is
     computed. The result is compared against the reference helper
     ``calculate_maximum_inconsistencies`` for the same column.
     """
     X = hierarchy_test_data.Q_X
     Z = linkage(X, method)
     R = inconsistent(Z)
     # Pass ``i`` through so that each column requested by the caller is
     # actually exercised instead of always testing column 1.
     MD = maxRstat(Z, R, i)
     expectedMD = calculate_maximum_inconsistencies(Z, R, i)
     assert_allclose(MD, expectedMD, atol=1e-15)
Beispiel #5
0
 def check_maxRstat_Q_linkage(self, method, i):
     """Check ``maxRstat(Z, R, i)`` on the Q data set.

     ``method`` is the linkage method used to build ``Z`` and ``i`` is the
     column of the inconsistency matrix ``R`` whose running maximum is
     computed. The result is compared against the reference helper
     ``calculate_maximum_inconsistencies`` for the same column.
     """
     X = eo['Q-X']
     # The linkage is built straight from the observations; the condensed
     # distance vector that used to be computed here was never used.
     Z = linkage(X, method)
     R = inconsistent(Z)
     # Pass ``i`` through so that each column requested by the caller is
     # actually exercised instead of always testing column 1.
     MD = maxRstat(Z, R, i)
     expectedMD = calculate_maximum_inconsistencies(Z, R, i)
     assert_allclose(MD, expectedMD, atol=1e-15)
Beispiel #6
0
    def process(self, **kwargs) -> Dict[str, Dict[np.ndarray, str]]:
        """Compute the maximum of one inconsistency statistic per cluster.

        Reads the linkage matrix from ``kwargs['Linkage']`` and the
        inconsistency matrix from ``kwargs['IncM']['R']``, looks up which
        statistic is selected in ``self.ctrls['Stat']`` and feeds the matching
        column index to ``hierarchy.maxRstat``.

        Returns:
            ``{'MaxStat': {'MR': <maxRstat result>, 'stat': <selected name>}}``

        Raises:
            KeyError: if ``'Linkage'``, ``'IncM'`` or ``'R'`` is missing.
            ValueError: if the selected statistic is not one of ``'mean'``,
                ``'stdev'``, ``'num_links'`` or ``'inc_coef'``.
        """
        # NOTE(review): the return annotation does not match the returned
        # mapping (string keys, mixed values) -- confirm and correct upstream.
        Z = kwargs['Linkage']
        R = kwargs['IncM']['R']

        stat = self.ctrls['Stat'].currentText()

        # Statistic name -> column of R passed to maxRstat.
        stat_columns = {'mean': 0, 'stdev': 1, 'num_links': 2, 'inc_coef': 3}
        try:
            i = stat_columns[stat]
        except KeyError:
            # Fail with a clear message rather than an unbound local later on.
            raise ValueError(
                'Unknown statistic %r; expected one of %s'
                % (stat, sorted(stat_columns))
            ) from None

        MR = hierarchy.maxRstat(Z, R, i)

        return {'MaxStat': {'MR': MR, 'stat': stat}}
Beispiel #7
0
def _truncate_to_two_decimals(value):
    """Cut ``value`` to two decimal places by slicing its string form.

    The slice width is the length of ``'%.2f' % value``. When ``str(value)``
    uses exponent notation (e.g. ``1e-05``) the slice would cut through the
    exponent and give a wrong number or a ``ValueError``, so a fixed-point
    rendering is sliced instead in that case.
    """
    width = len('%.2f' % value)
    text = str(value)
    if 'e' in text or 'E' in text:
        text = '%f' % value
    return float(text[:width])


def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate the silhouette score based on hierarchical clustering.

    The clustering criterion and the threshold are read from
    ``request.form['criterion']`` and ``request.form['threshold']``.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        A 10-tuple, in this order:
        silhouetteScore: string, message containing the silhouette score
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, the rounded silhouette score, or a string explaining
            why no score could be computed
        inconsistentMax: float, upper bound of the threshold for the
            'inconsistent' criterion
        maxclustMax: integer, upper bound of the threshold for the
            'maxclust' criterion
        distanceMax: float, upper bound of the threshold for the
            'distance' criterion
        distanceMin: float, lower bound of the threshold for the
            'distance' criterion
        monocritMax: float, upper bound of the threshold for the
            'monocrit' criterion
        monocritMin: float, lower bound of the threshold for the
            'monocrit' criterion
        threshold: float/integer, the threshold (t) that was actually used;
            a blank or out-of-range entry is replaced by the criterion's
            default bound
        The last seven items are all the string 'N/A' when no score could be
        computed.

    Raises:
        ValueError: if the submitted criterion is not 'maxclust', 'distance',
            'inconsistent' or 'monocrit'.
    """
    not_applicable = 'N/A'

    # since "number of labels should be more than 2 and less than n_samples - 1"
    activeFiles = len(labels) - 1
    if activeFiles <= 2:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        return (silhouetteScore, silhouetteAnnotation, score,
                not_applicable, not_applicable, not_applicable,
                not_applicable, not_applicable, not_applicable,
                not_applicable)

    Y = metrics.pairwise.pairwise_distances(dendroMatrix,
                                            metric=distance_metric)
    # NOTE(review): Y looks like a square distance matrix; scipy's linkage
    # expects a condensed distance vector or raw observations -- confirm
    # this is the intended input before changing it.
    Z = hierarchy.linkage(Y, method=linkage_method)

    # 'maxclust' range
    maxclustMax = len(labels) - 1

    # 'inconsistent' range
    R = hierarchy.inconsistent(Z, 2)
    inconsistentMax = _truncate_to_two_decimals(R[-1][-1])

    # 'distance' range
    d = hierarchy.cophenet(Z)
    distanceMax = _truncate_to_two_decimals(d.max())
    distanceMin = _truncate_to_two_decimals(d.min() + 0.01)

    # 'monocrit' range
    MR = hierarchy.maxRstat(Z, R, 0)
    monocritMax = _truncate_to_two_decimals(MR.max())
    monocritMin = _truncate_to_two_decimals(MR.min() + 0.01)

    threshold = request.form['threshold']
    blank = threshold == ''
    if not blank:
        threshold = float(threshold)

    # Replace a blank or out-of-range threshold by the criterion's default.
    criterion = request.form['criterion']
    monocrit = None
    if criterion == 'maxclust':
        if blank or threshold > maxclustMax:
            threshold = maxclustMax
        else:
            threshold = round(threshold)
    elif criterion == 'distance':
        if blank or threshold > distanceMax or threshold < distanceMin:
            threshold = distanceMax
    elif criterion == 'inconsistent':
        if blank or threshold > inconsistentMax:
            threshold = inconsistentMax
    elif criterion == 'monocrit':
        monocrit = MR
        if blank or threshold > monocritMax or threshold < monocritMin:
            threshold = monocritMax
    else:
        # Fail clearly instead of hitting an unbound local further down.
        raise ValueError('Unknown clustering criterion: %r' % (criterion,))

    scoreLabel = hierarchy.fcluster(Z,
                                    t=threshold,
                                    criterion=criterion,
                                    monocrit=monocrit)

    # All files ended up in a single cluster: the score is undefined.
    if len(set(scoreLabel)) <= 1:
        silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
        silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
        score = 'invalid for only 1 cluster'
        return (silhouetteScore, silhouetteAnnotation, score,
                not_applicable, not_applicable, not_applicable,
                not_applicable, not_applicable, not_applicable,
                not_applicable)

    score = metrics.silhouette_score(Y,
                                     labels=scoreLabel,
                                     metric='precomputed')
    score = round(score, constants.ROUND_DIGIT)
    # Unicode escape for the "less than or equal" sign; calling .decode() on
    # a str literal fails on Python 3.
    inequality = u'\u2264'
    silhouetteScore = "Silhouette Score: " + str(
        score
    ) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
    silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
Beispiel #8
0
def _truncate_to_two_decimals(value):
    """Cut ``value`` to two decimal places by slicing its string form.

    The slice width is the length of ``'%.2f' % value``. When ``str(value)``
    uses exponent notation (e.g. ``1e-05``) the slice would cut through the
    exponent and give a wrong number or a ``ValueError``, so a fixed-point
    rendering is sliced instead in that case.
    """
    width = len('%.2f' % value)
    text = str(value)
    if 'e' in text or 'E' in text:
        text = '%f' % value
    return float(text[:width])


def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate the silhouette score based on hierarchical clustering.

    The clustering criterion and the threshold are read from
    ``request.form['criterion']`` and ``request.form['threshold']``.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        A 10-tuple, in this order:
        silhouetteScore: string, message containing the silhouette score
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, the rounded silhouette score, or a string explaining
            why no score could be computed
        inconsistentMax: float, upper bound of the threshold for the
            'inconsistent' criterion
        maxclustMax: integer, upper bound of the threshold for the
            'maxclust' criterion
        distanceMax: float, upper bound of the threshold for the
            'distance' criterion
        distanceMin: float, lower bound of the threshold for the
            'distance' criterion
        monocritMax: float, upper bound of the threshold for the
            'monocrit' criterion
        monocritMin: float, lower bound of the threshold for the
            'monocrit' criterion
        threshold: float/integer, the threshold (t) that was actually used;
            a blank or out-of-range entry is replaced by the criterion's
            default bound
        The last seven items are all the string 'N/A' when no score could be
        computed.

    Raises:
        ValueError: if the submitted criterion is not 'maxclust', 'distance',
            'inconsistent' or 'monocrit'.
    """
    not_applicable = 'N/A'

    # since "number of labels should be more than 2 and less than n_samples - 1"
    activeFiles = len(labels) - 1
    if activeFiles <= 2:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        return (silhouetteScore, silhouetteAnnotation, score,
                not_applicable, not_applicable, not_applicable,
                not_applicable, not_applicable, not_applicable,
                not_applicable)

    Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
    # NOTE(review): Y looks like a square distance matrix; scipy's linkage
    # expects a condensed distance vector or raw observations -- confirm
    # this is the intended input before changing it.
    Z = hierarchy.linkage(Y, method=linkage_method)

    # 'maxclust' range
    maxclustMax = len(labels) - 1

    # 'inconsistent' range
    R = hierarchy.inconsistent(Z, 2)
    inconsistentMax = _truncate_to_two_decimals(R[-1][-1])

    # 'distance' range
    d = hierarchy.cophenet(Z)
    distanceMax = _truncate_to_two_decimals(d.max())
    distanceMin = _truncate_to_two_decimals(d.min() + 0.01)

    # 'monocrit' range
    MR = hierarchy.maxRstat(Z, R, 0)
    monocritMax = _truncate_to_two_decimals(MR.max())
    monocritMin = _truncate_to_two_decimals(MR.min() + 0.01)

    threshold = request.form['threshold']
    blank = threshold == ''
    if not blank:
        threshold = float(threshold)

    # Replace a blank or out-of-range threshold by the criterion's default.
    criterion = request.form['criterion']
    monocrit = None
    if criterion == 'maxclust':
        if blank or threshold > maxclustMax:
            threshold = maxclustMax
        else:
            threshold = round(threshold)
    elif criterion == 'distance':
        if blank or threshold > distanceMax or threshold < distanceMin:
            threshold = distanceMax
    elif criterion == 'inconsistent':
        if blank or threshold > inconsistentMax:
            threshold = inconsistentMax
    elif criterion == 'monocrit':
        monocrit = MR
        if blank or threshold > monocritMax or threshold < monocritMin:
            threshold = monocritMax
    else:
        # Fail clearly instead of hitting an unbound local further down.
        raise ValueError('Unknown clustering criterion: %r' % (criterion,))

    scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

    # All files ended up in a single cluster: the score is undefined.
    if len(set(scoreLabel)) <= 1:
        silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
        silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
        score = 'invalid for only 1 cluster'
        return (silhouetteScore, silhouetteAnnotation, score,
                not_applicable, not_applicable, not_applicable,
                not_applicable, not_applicable, not_applicable,
                not_applicable)

    score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
    score = round(score, constants.ROUND_DIGIT)
    # Unicode escape for the "less than or equal" sign; calling .decode() on
    # a str literal fails on Python 3.
    inequality = u'\u2264'
    silhouetteScore = "Silhouette Score: " + str(
        score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
    silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
Beispiel #9
0
def get_clusters_Hierarchy_clustering(x, hier_dict):
    """Build a hierarchical linkage of ``x`` and return flat cluster labels.

    Options are read from ``hier_dict``; any key that is absent falls back to
    the default shown:

    * ``'L_method'`` ('single'): linkage method; can be 'single', 'complete',
      'average', 'weighted', 'centroid', 'median' or 'ward'.
    * ``'L_metric'`` ('euclidean'): distance metric passed to ``linkage``.
      It is reset to 'euclidean' (with a printed note) when the method is
      'centroid', 'median' or 'ward'.
    * ``'optimal_ordering'`` (False): forwarded to ``linkage``.
    * ``'criterionH'`` ('inconsistent'): criterion passed to ``fcluster``.
    * ``'depth'`` (2): depth for the inconsistency calculation.
    * ``'t'``: threshold; defaults to 20 for 'maxclust' and
      'maxclust_monocrit', otherwise 0.9.
    * ``'R'`` (None): inconsistency matrix; computed when needed and absent.

    Returns whatever ``fcluster`` returns for the chosen criterion.
    """
    # Column of the inconsistency matrix used as the 'monocrit' statistic.
    stat_column = 3

    metric_name = hier_dict.get('L_metric', 'euclidean')
    method_name = hier_dict.get('L_method', 'single')
    needs_euclidean = method_name in ('centroid', 'median', 'ward')
    if needs_euclidean and metric_name != 'euclidean':
        metric_name = 'euclidean'
        print('\n')
        print('*************Note:**************')
        print('Method ' + str(method_name) +
              ' requires the distance metric to be Euclidean')

    ordering_flag = hier_dict.get('optimal_ordering', False)
    tree = linkage(x,
                   method=method_name,
                   metric=metric_name,
                   optimal_ordering=ordering_flag)

    criterionH = hier_dict.get('criterionH', 'inconsistent')
    depth = hier_dict.get('depth', 2)

    if 't' in hier_dict:
        t = hier_dict['t']
    elif criterionH in ('maxclust_monocrit', 'maxclust'):
        # For these criteria t is the maximum number of clusters requested.
        t = 20
    else:
        t = 0.9

    R = hier_dict.get('R')
    if criterionH in ('inconsistent', 'maxclust_monocrit'):
        # Use the supplied inconsistency matrix, or compute one if missing.
        if R is None:
            R = inconsistent(tree, d=depth)
        else:
            R = np.asarray(R, order='c')

    if criterionH == 'monocrit':
        if R is None:
            R = inconsistent(tree, d=depth)
        per_node_max = maxRstat(tree, R, stat_column)
        return fcluster(tree, criterion='monocrit', t=t, monocrit=per_node_max)

    if criterionH == 'maxclust_monocrit':
        per_node_max = maxinconsts(tree, R)
        return fcluster(tree,
                        criterion='maxclust_monocrit',
                        t=t,
                        monocrit=per_node_max)

    return fcluster(tree, criterion=criterionH, depth=depth, R=R, t=t)
0
	def _cluster_by_monocrit(linkage_table: numpy.ndarray, cutoff: float, inconsistent: pandas.DataFrame) -> numpy.ndarray:
		MR = hierarchy.maxRstat(linkage_table, inconsistent.values, 1)
		clusters = hierarchy.fcluster(linkage_table, t = cutoff, criterion = 'monocrit', monocrit = MR)
		return clusters