Example #1
def flatcluster(
    dRow, runLogs, interClusterDistance="complete", plotDendrogram=True, cMethod="inconsistent", cValue=2.5
):
    #    if 'inter-cluster distance' in clusterSetup.keys():
    #        method = clusterSetup['inter-cluster distance']
    #    else:
    #        method = 'complete'

    z = linkage(dRow, interClusterDistance)
    inc = inconsistent(z)
    # print inc

    if plotDendrogram:
        plotdendrogram(z)

    clusters = fcluster(z, cValue, cMethod)

    noClusters = max(clusters)
    print("Total number of clusters:", noClusters)
    for i in range(noClusters):
        counter = 0
        for j in range(len(clusters)):
            if clusters[j] == (i + 1):
                counter += 1
        print("Cluster", str(i + 1), ":", str(counter))

    global clusterCount
    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]["Cluster"] = str(clusters[i])

    return z, clusters, runLogs
Example #2
    def _run_hier_clust_on_centroids(self, method='average'):
        '''
        Runs hierarchical clustering on the per-label centroids of the data, using scipy's methods.
        '''

        uniqueLabels = np.sort(np.unique(self.templateLabels))
        centroids = np.array([self.templateMat[np.where(self.templateLabels == i)[0],:].mean(axis=0) for i in uniqueLabels])
               
        self.y = pdist(centroids)
        self.z = hierarchy.linkage(self.y,method)
        r2 = hierarchy.inconsistent(self.z,2)

        ## rank the averages of linkage heights by standard deviation, then report the averages
        meanHeights = r2[:,0]
        stdHeights = r2[:,1]
        rankedInds = np.argsort(stdHeights)[::-1]
        bestCutPoints = meanHeights[rankedInds]

        ## save centroid labels for all cuts of the dendrogram
        allCentroidLabels = {}
        rankedK = []
        for cp in bestCutPoints:
            centroidLabels = hierarchy.fcluster(self.z,t=cp,criterion='distance')
            k = len(np.unique(centroidLabels))
            if str(k) in allCentroidLabels:
                continue
            
            allCentroidLabels[str(k)] = centroidLabels 
            rankedK.append(k)
        
        centroidLabels = allCentroidLabels[str(rankedK[0])]
    
        ## save the top 25 modes
        self.bestModeLabels = []
        print('doing ranking...')
       
        for rk in rankedK[:25]:
            centroidLabels = allCentroidLabels[str(rk)]
            modeLabels = self._get_mode_labels(self.templateLabels,centroidLabels,uniqueLabels)
            self.bestModeLabels.append(modeLabels)
        
        ## provide silhouette-value ranks in case we wish to reorder the top modes by silhouette value
        self.modeSilValues = []
        self.modeSizes = []
        allEvents = [self.templateData]

        for count in range(len(self.bestModeLabels)):
            numClusters = np.unique(self.bestModeLabels[count]).size
            silValues = get_silhouette_values(allEvents,[self.bestModeLabels[count]],subsample=self.noiseSample,
                                              minNumEvents=5000,resultsType='raw')
            silMean = silValues['0'].mean()
            self.modeSilValues.append(silMean)
            self.modeSizes.append(numClusters)

        silValues = get_silhouette_values(allEvents,[self.templateLabels],subsample=self.noiseSample,
                                          minNumEvents=5000,resultsType='raw')
        self.clusterSilValues = silValues['0'].mean()
        self.modeSilValues = np.array(self.modeSilValues)
        self.modeSizes = np.array(self.modeSizes)
Example #3
 def check_maxRstat_Q_linkage(self, method, i):
     # Tests maxRstat(Z, R, i) on the Q data set
     X = hierarchy_test_data.Q_X
     Z = linkage(X, method)
     R = inconsistent(Z)
     MD = maxRstat(Z, R, 1)
     expectedMD = calculate_maximum_inconsistencies(Z, R, 1)
     assert_allclose(MD, expectedMD, atol=1e-15)
Example #4
 def test_is_valid_im_4_and_up(self):
     # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
     # (step size 3).
     for i in range(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         R = inconsistent(Z)
         assert_(is_valid_im(R) == True)
Example #5
 def check_maxRstat_Q_linkage(self, method, i):
     # Tests maxRstat(Z, R, i) on the Q data set
     X = eo['Q-X']
     Y = pdist(X)
     Z = linkage(X, method)
     R = inconsistent(Z)
     MD = maxRstat(Z, R, 1)
     expectedMD = calculate_maximum_inconsistencies(Z, R, 1)
     assert_allclose(MD, expectedMD, atol=1e-15)
Example #6
 def test_is_valid_im_4_and_up_neg_dist(self):
     # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
     # (step size 3) with negative link counts.
     for i in range(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         R = inconsistent(Z)
         R[i//2,2] = -0.5
         assert_(is_valid_im(R) == False)
         assert_raises(ValueError, is_valid_im, R, throw=True)
Example #7
 def test_is_valid_im_4_and_up_neg_dist(self):
     # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
     # (step size 3) with negative link counts.
     for i in range(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         R = inconsistent(Z)
         R[i//2,2] = -0.5
         assert_(is_valid_im(R) == False)
         assert_raises(ValueError, is_valid_im, R, throw=True)
Example #8
 def test_is_valid_im_4_and_up_neg_index_right(self):
     # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15
     # (step size 3) with negative link height standard deviations.
     for i in range(4, 15, 3):
         y = np.random.rand(i * (i - 1) // 2)
         Z = linkage(y)
         R = inconsistent(Z)
         R[i // 2, 1] = -2.0
         assert_(is_valid_im(R) == False)
         assert_raises(ValueError, is_valid_im, R, throw=True)
Example #9
def inconsist(Z, depth):
    from scipy.cluster.hierarchy import inconsistent

    incons = inconsistent(Z, depth)  # the depth argument is optional
    # link statistics: column 3 holds the inconsistency coefficient
    print(incons[:, 3])
    print(max(incons[:, 3]))
    print(min(incons[:, 3]))
Example #10
def Hierarchy(V, **kwargs):
    """Performs hierarchical clustering on *V*. The function essentially uses two scipy functions: ``linkage`` and 
    ``fcluster``. See :func:`scipy.cluster.hierarchy.linkage` and :func:`scipy.cluster.hierarchy.fcluster` for the 
    explanation of the arguments. Arguments that differ from scipy's are listed here.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg inconsistent_percentile: if the clustering *criterion* for :func:`scipy.cluster.hierarchy.fcluster`
    is ``inconsistent`` and threshold *t* is not given (default), then the function will use the percentile specified 
    by this argument as the threshold.
    :type inconsistent_percentile: double

    :arg n_clusters: specifies the maximal number of clusters. If this argument is given, then the function will 
    automatically set *criterion* to ``maxclust`` and *t* equal to *n_clusters*.
    :type n_clusters: int
    """

    try:
        from scipy.cluster.hierarchy import linkage, fcluster, inconsistent
    except ImportError:
        raise ImportError('Use of this function (Hierarchy) requires the '
                          'installation of scipy.')

    method = kwargs.pop('method', 'single')
    metric = kwargs.pop('metric', 'euclidean')
    Z = linkage(V, method=method, metric=metric)

    criterion = kwargs.pop('criterion', 'inconsistent')
    t = kwargs.get('t', None)
    ip = kwargs.pop('inconsistent_percentile', 99.9)
    i = None  # avoid a NameError below when the percentile threshold is not computed
    if t is None and criterion == 'inconsistent':
        I = inconsistent(Z)
        i = np.percentile(I[:, 3], ip)

    t = kwargs.pop('t', i)
    depth = kwargs.pop('depth', 2)
    R = kwargs.pop('R', None)
    monocrit = kwargs.pop('monocrit', None)

    n_clusters = kwargs.pop('n_clusters', None)
    if n_clusters is not None:
        criterion = 'maxclust'
        t = n_clusters
    labels = fcluster(Z,
                      t,
                      criterion=criterion,
                      depth=depth,
                      R=R,
                      monocrit=monocrit)
    return labels.flatten()
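A minimal usage sketch for the Hierarchy function above (illustrative only: V is random stand-in data here, not real row-normalized eigenvectors):

import numpy as np

V = np.random.rand(50, 5)  # hypothetical input; a real V would hold row-normalized eigenvectors
labels = Hierarchy(V, method='average', n_clusters=4)  # n_clusters forces criterion='maxclust', t=4
print(labels)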
Example #11
def flatcluster(dMatrix, clusterSetup):
    dRow = prepareDRow(dMatrix)
    print(dRow)
    
    # Check user-specified options, if there are any; otherwise assign the
    # default values
    method = clusterSetup.get('inter-cluster distance', 'complete')
    z = linkage(dRow, method)
    inc = inconsistent(z)
    print(inc)
    
    if clusterSetup.get('plotDendrogram?', True):
        plotdendrogram(z)
    
    cmethod = clusterSetup.get('cutoff criteria', 'inconsistent')
    cvalue = clusterSetup.get('cutoff criteria value', 2.5)
    
    clusters = fcluster(z, cvalue, cmethod)
    
    noClusters = max(clusters)
    print('Total number of clusters:', noClusters)
    for i in range(noClusters):
        counter = 0
        for j in range(len(clusters)):
            if clusters[j] == (i + 1):
                counter += 1
        print("Cluster", str(i + 1), ":", str(counter))
    
    global runLogs, clusterCount
    for runIndex in range(len(clusters)):
        runLogs[runIndex][0]['Cluster'] = str(clusters[runIndex])
        if clusters[runIndex] > clusterCount:
            clusterCount = clusters[runIndex]
    return clusters
Example #12
def do_hcluster(linkMe,
                metric=['euclidean', 'cityblock', 'correlation'],
                method=['single', 'complete', 'average', 'centroid', 'median', 'ward']):

    # linkMe: m x n DataFrame; m: sample no. n: time point
    # metric: metrics to try
    # method: methods to try
    
    writeMe = []
    # do linkage
    coph = {}
    inconsist = {}
    links = {}
    for mtc in metric:
        for mtd in method:
            if mtd in ('centroid', 'median', 'ward') and mtc != 'euclidean':
                continue
            
            # write to file
            writeMe.append('\n\n---------------------------------------------------------------------------------------------------------------------\n' + 
                           mtc + ' ' + mtd + '\n---------------------------------------------------------------------------------------------------------------------\n')
            
            if mtc not in links:
                links[mtc] = {mtd:hier.linkage(linkMe,method=mtd,metric=mtc,optimal_ordering=True)}
                coph[mtc] = {mtd: hier.cophenet(links[mtc][mtd],pdist(linkMe))}
                inconsist[mtc] = {mtd: hier.inconsistent(links[mtc][mtd])}
            else:
                links[mtc].update({mtd:hier.linkage(linkMe,method=mtd,metric=mtc,optimal_ordering=True)})
                coph[mtc].update({mtd: hier.cophenet(links[mtc][mtd],pdist(linkMe))})
                inconsist[mtc].update({mtd: hier.inconsistent(links[mtc][mtd])})
            
            # append diagnostics
            writeMe.append('Inconsistency Matrix\n')
            writeMe.append(np.array2string(inconsist[mtc][mtd]))
            writeMe.append('\n\nCophenetic distance\n')
            writeMe.append(np.array2string(squareform(coph[mtc][mtd][1])))
            writeMe.append('\n\nCophenetic Correlation Coefficient: ')
            writeMe.append(np.array2string(coph[mtc][mtd][0]))
    return links, coph, inconsist, writeMe
Example #13
def Hierarchy(V, **kwargs):
    """Performs hierarchical clustering on *V*. The function essentially uses two scipy functions: ``linkage`` and 
    ``fcluster``. See :func:`scipy.cluster.hierarchy.linkage` and :func:`scipy.cluster.hierarchy.fcluster` for the 
    explanation of the arguments. Arguments that differ from scipy's are listed here.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg inconsistent_percentile: if the clustering *criterion* for :func:`scipy.cluster.hierarchy.fcluster`
    is ``inconsistent`` and threshold *t* is not given (default), then the function will use the percentile specified 
    by this argument as the threshold.
    :type inconsistent_percentile: double

    :arg n_clusters: specifies the maximal number of clusters. If this argument is given, then the function will 
    automatically set *criterion* to ``maxclust`` and *t* equal to *n_clusters*.
    :type n_clusters: int
    """

    from scipy.cluster.hierarchy import linkage, fcluster, inconsistent
    
    method = kwargs.pop('method', 'single')
    metric = kwargs.pop('metric', 'euclidean')
    Z = linkage(V, method=method, metric=metric)
    
    criterion = kwargs.pop('criterion', 'inconsistent')
    t = kwargs.get('t', None)
    ip = kwargs.pop('inconsistent_percentile', 99.9)
    i = None  # avoid a NameError below when the percentile threshold is not computed
    if t is None and criterion == 'inconsistent':
        I = inconsistent(Z)
        i = np.percentile(I[:,3], ip)

    t = kwargs.pop('t', i)
    depth = kwargs.pop('depth', 2)
    R = kwargs.pop('R', None)
    monocrit = kwargs.pop('monocrit', None)

    n_clusters = kwargs.pop('n_clusters', None)
    if n_clusters is not None:
        criterion = 'maxclust'
        t = n_clusters
    labels = fcluster(Z, t, criterion=criterion, depth=depth, R=R, monocrit=monocrit)
    return labels.flatten()
Example #14
def fclusterdata(X, t, criterion='distance',
                     metric='euclidean', depth=2, method='single', R=None):
    """
    This is adapted from scipy fclusterdata.
    https://github.com/scipy/scipy/blob/v1.0.0/scipy/cluster/hierarchy.py#L1809-L1878
    """
    X = np.asarray(X, order='c', dtype=np.double)

    if X.ndim != 2:
        raise TypeError('The observation matrix X must be an n by m numpy '
                        'array.')

    Y = distance.pdist(X, metric=metric)
    Z = linkage(Y, method=method)
    if R is None:
        R = inconsistent(Z, d=depth)
    else:
        R = np.asarray(R, order='c')
    T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
    return T, Z
Example #15
def sax_hcluster(linkMe, method=['single', 'complete', 'average']):
    # linkMe is a compressed vector of minimum distances
    
    writeMe = []
    coph = {}
    inconsist = {}
    links = {}
    for mtd in method:
        writeMe.append('\n\n---------------------------------------------------------------------------------------------------------------------\n' + 
                       mtd + '\n---------------------------------------------------------------------------------------------------------------------\n')
        links[mtd] = hier.linkage(linkMe,method=mtd,optimal_ordering=True)
        coph[mtd]  = hier.cophenet(links[mtd],linkMe)
        inconsist[mtd] = hier.inconsistent(links[mtd])
        
        # append diagnostics
        writeMe.append('Inconsistency Matrix\n')
        writeMe.append(np.array2string(inconsist[mtd]))
        writeMe.append('\n\nCophenetic distance\n')
        writeMe.append(np.array2string(squareform(coph[mtd][1])))
        writeMe.append('\n\nCophenetic Correlation Coefficient: ')
        writeMe.append(np.array2string(coph[mtd][0]))
        
    return links, coph, inconsist, writeMe
Example #16
def flatcluster(dRow,
                runLogs,
                interClusterDistance='complete',
                plotDendrogram=True,
                cMethod='inconsistent',
                cValue=2.5):
    #    if 'inter-cluster distance' in clusterSetup.keys():
    #        method = clusterSetup['inter-cluster distance']
    #    else:
    #        method = 'complete'

    z = linkage(dRow, interClusterDistance)
    inc = inconsistent(z)
    #print inc

    if plotDendrogram:
        plotdendrogram(z)

    clusters = fcluster(z, cValue, cMethod)

    noClusters = max(clusters)
    print('Total number of clusters:', noClusters)
    for i in range(noClusters):
        counter = 0
        for j in range(len(clusters)):
            if clusters[j] == (i + 1):
                counter += 1
        print("Cluster", str(i + 1), ":", str(counter))

    global clusterCount
    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]['Cluster'] = str(clusters[i])

    return z, clusters, runLogs
Example #17
 def check_inconsistent_tdist(self, depth):
     Z = hierarchy_test_data.linkage_ytdist_single
     assert_allclose(inconsistent(Z, depth),
                     hierarchy_test_data.inconsistent_ytdist[depth])
Example #18
from scipy.cluster.hierarchy import inconsistent, linkage
X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
Z = linkage(X, 'ward')
print(Z)
# [[ 5.          6.          0.          2.        ]
# [ 2.          7.          0.          2.        ]
# [ 0.          4.          1.          2.        ]
# [ 1.          8.          1.15470054  3.        ]
# [ 9.         10.          2.12132034  4.        ]
# [ 3.         12.          4.11096096  5.        ]
# [11.         13.         14.07183949  8.        ]]
inconsistent(Z)
# array([[ 0.        ,  0.        ,  1.        ,  0.        ],
# [ 0.        ,  0.        ,  1.        ,  0.        ],
# [ 1.        ,  0.        ,  1.        ,  0.        ],
# [ 0.57735027,  0.81649658,  2.        ,  0.70710678],
# [ 1.04044011,  1.06123822,  3.        ,  1.01850858],
# [ 3.11614065,  1.40688837,  2.        ,  0.70710678],
# [ 6.44583366,  6.76770586,  3.        ,  1.12682288]])
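As a possible follow-up (not part of the example above), the last column of the inconsistency matrix is what fcluster's default 'inconsistent' criterion thresholds on; the cutoff 1.2 below is purely illustrative:

from scipy.cluster.hierarchy import fcluster
labels = fcluster(Z, t=1.2, criterion='inconsistent')  # groups leaves under links whose coefficient stays <= t
print(labels)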
Example #19
	def _get_inconsistent(distances: numpy.ndarray) -> pandas.DataFrame:
		inconsistent = hierarchy.inconsistent(distances, 10)
		inc = pandas.DataFrame(inconsistent)
		inc.columns = ["mean", "std", "count", "coefficient"]
		return inc
Example #20
    centroids = np.array([case1[np.where(case1Labels == i)[0],:].mean(axis=0) for i in uniqueLabels])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ncluster = 27
    y = pdist(centroids)
    method = 'average'
    z = hierarchy.linkage(y, method)
    #t = hierarchy.fcluster(z, t=ncluster, criterion='maxclust')

    ## computes the max distance between any cluster and each non-singleton cluster
    print('max dists', hierarchy.maxdists(z))

    ## inconsistency
    r = hierarchy.inconsistent(z)
    print('r', r)
    #print('max inconsts', hierarchy.maxinconsts(z, r))
    print('z', z)
    print(z[:, 2])
    print(np.hstack([z[1:, 2], [0]]))
    levelDiffs = np.abs(z[:, 2] - np.hstack([z[1:, 2], [0]]))
    levelDiffMeans = z[:, 2]

    print('diffs', levelDiffs * 100)
    diffInds = np.argsort(levelDiffs)
    diffInds = diffInds[::-1]
    print('a', levelDiffMeans[diffInds[:6]])
    print('b', diffInds[:5])
    hierarchy.dendrogram(z)
Example #21
#     plt.title('Hierarchical Clustering Dendrogram (truncated)')
#     plt.xlabel('sample index or (cluster size)')
#     plt.ylabel('distance')
#     dendrogram(
#         Z,
#         truncate_mode='lastp',  # show only the last p merged clusters
#         p=12,  # show only the last p merged clusters
#         leaf_rotation=90.,
#         leaf_font_size=12.,
#         show_contracted=True,  # to get a distribution impression in truncated branches
#     )
#     plt.show()
#     c, coph_dists = cophenet(Z, BCD_matrix)
#     print "Cophenetic Correlation Coefficient:",c
#     return Z

def Elbow(Z,lastp=10):
    last = Z[-lastp:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    plt.plot(idxs, last_rev)
    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]
    plt.plot(idxs[:-2] + 1, acceleration_rev)
    plt.show()
    k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
    print "clusters:", k

Elbow(Z,lastp=10)
incons = inconsistent(Z, 5)
fcluster(Z, 6, criterion='maxclust')
# generate two clusters: a with 100 points, b with 50:
np.random.seed(4711)  # for repeatability of this tutorial
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])
X = np.concatenate((a, b),)
print(X.shape)  # 150 samples with 2 dimensions
#plt.scatter(X[:,0], X[:,1])
#plt.show()

# generate the linkage matrix
Z = linkage(X, 'ward')

print(Z.shape)

depth = 5
incons = inconsistent(Z, depth)
print(incons[-10:])

last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
print "clusters:", k

Example #23
def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate silhouette score based on hierarchical clustering.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        silhouetteScore: string, containing the result of silhouette score 
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, silhouette score
        inconsistentMax: float, upper bound of threshold to calculate silhouette score if using Inconsistent criterion 
        maxclustMax: integer, upper bound of threshold to calculate silhouette score if using Maxclust criterion
        distanceMax: float, upper bound of threshold to calculate silhouette score if using Distance criterion
        distanceMin: float, lower bound of threshold to calculate silhouette score if using Distance criterion
        monocritMax: float, upper bound of threshold to calculate silhouette score if using Monocrit criterion
        monocritMin: float, lower bound of threshold to calculate silhouette score if using Monocrit criterion
        threshold: float/integer/string, threshold (t) value that users entered; equals 'N/A' if users leave the field blank
    """
    activeFiles = len(labels) - 1
    # the number of labels should be more than 2 and less than n_samples - 1
    if activeFiles > 2:
        Y = metrics.pairwise.pairwise_distances(dendroMatrix,
                                                metric=distance_metric)
        Z = hierarchy.linkage(Y, method=linkage_method)

        monocrit = None

        # 'maxclust' range
        maxclustMax = len(labels) - 1

        # 'inconsistent' range
        R = hierarchy.inconsistent(Z, 2)
        inconsistentMax = R[-1][-1]
        slen = len('%.*f' % (2, inconsistentMax))
        inconsistentMax = float(str(inconsistentMax)[:slen])

        # 'distance' range
        d = hierarchy.cophenet(Z)
        distanceMax = d.max()
        slen = len('%.*f' % (2, distanceMax))
        distanceMax = float(str(distanceMax)[:slen])
        distanceMin = d.min() + 0.01
        slen = len('%.*f' % (2, distanceMin))
        distanceMin = float(str(distanceMin)[:slen])

        # 'monocrit' range
        MR = hierarchy.maxRstat(Z, R, 0)
        monocritMax = MR.max()
        slen = len('%.*f' % (2, monocritMax))
        monocritMax = float(str(monocritMax)[:slen])
        monocritMin = MR.min() + 0.01
        slen = len('%.*f' % (2, monocritMin))
        monocritMin = float(str(monocritMin)[:slen])

        threshold = request.form['threshold']
        if threshold == '':
            threshold = str(threshold)
        else:
            threshold = float(threshold)

        if request.form['criterion'] == 'maxclust':
            criterion = 'maxclust'
            if (threshold == '') or (threshold > maxclustMax):
                threshold = len(labels) - 1
            else:
                threshold = round(float(threshold))
        elif request.form['criterion'] == 'distance':
            criterion = 'distance'
            if (threshold == '') or (threshold > distanceMax) or (threshold <
                                                                  distanceMin):
                threshold = distanceMax
        elif request.form['criterion'] == 'inconsistent':
            criterion = 'inconsistent'
            if (threshold == '') or (threshold > inconsistentMax):
                threshold = inconsistentMax
        elif request.form['criterion'] == 'monocrit':
            criterion = 'monocrit'
            monocrit = MR
            if (threshold == '') or (threshold > monocritMax) or (threshold <
                                                                  monocritMin):
                threshold = monocritMax
        scoreLabel = hierarchy.fcluster(Z,
                                        t=threshold,
                                        criterion=criterion,
                                        monocrit=monocrit)

        if len(set(scoreLabel)) <= 1:  # all the files were assigned to a single cluster
            silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
            silhouetteAnnotation = "because your files are too similar to each other, the program classified all of them into the same cluster"
            score = 'invalid for only 1 cluster'
            inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
        else:
            score = metrics.silhouette_score(Y,
                                             labels=scoreLabel,
                                             metric='precomputed')
            score = round(score, constants.ROUND_DIGIT)
            inequality = '≤'  # already a str in Python 3; no decode needed
            silhouetteScore = "Silhouette Score: " + str(
                score
            ) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
            silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    else:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
Example #24
    def compute(self):

        # ------------------- hole
        self.output_h = {}

        # compute hole correlation matrix
        self.output_h['COR'] = self.calc_corr_matrix(self.OmFrag, userows=False, usecols=True)

        # compute distance matrix
        DIS = self.calc_dist_matrix(self.output_h['COR'])
        self.output_h['DIS'] = numpy.copy(DIS)

        # perform clustering
        Z = squareform(DIS, checks=False)
        Y = sch.linkage(Z, method=self.opt_dict['link_scheme'])
        cc, C = sch.cophenet(Y, Z)
        I = sch.inconsistent(Y)
        self.output_h['dendro'] = Y
        self.output_h['cophco'] = cc
        self.output_h['incons'] = I

        # analyze clusters
        self.output_h['cluster'], self.output_h['maxd'] = self.find_clustering(Y)



        # ------------------- electron
        self.output_e = {}

        # compute electron correlation matrix
        self.output_e['COR'] = self.calc_corr_matrix(self.OmFrag, userows=True, usecols=False)

        # compute distance matrix
        DIS = self.calc_dist_matrix(self.output_e['COR'])
        self.output_e['DIS'] = numpy.copy(DIS)

        # perform clustering
        Z = squareform(DIS, checks=False)
        Y = sch.linkage(Z, method=self.opt_dict['link_scheme'])
        cc, C = sch.cophenet(Y, Z)
        I = sch.inconsistent(Y)
        self.output_e['dendro'] = Y
        self.output_e['cophco'] = cc
        self.output_e['incons'] = I

        # analyze clusters
        self.output_e['cluster'], self.output_e['maxd'] = self.find_clustering(Y)



        # ------------------- mixed
        self.output_m = {}

        # compute mixed correlation matrix (maximum of hole and electron)
        self.output_m['COR'] = numpy.where(self.output_e['COR'] >= self.output_h['COR'],
                                           self.output_e['COR'],
                                           self.output_h['COR'])

        # compute distance matrix
        DIS = self.calc_dist_matrix(self.output_m['COR'])
        self.output_m['DIS'] = numpy.copy(DIS)

        # perform clustering
        Z = squareform(DIS, checks=False)
        Y = sch.linkage(Z, method=self.opt_dict['link_scheme'])
        cc, C = sch.cophenet(Y, Z)
        I = sch.inconsistent(Y)
        self.output_m['dendro'] = Y
        self.output_m['cophco'] = cc
        self.output_m['incons'] = I

        # analyze clusters
        self.output_m['cluster'], self.output_m['maxd'] = self.find_clustering(Y)
Example #25
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])
X = np.concatenate((a, b))
print(X.shape)  # 150 samples with 2 dimensions
#plt.scatter(X[:,0], X[:,1])
#plt.show()

# generate the linkage matrix
Z = linkage(X, 'ward')

print(Z.shape)

depth = 5
incons = inconsistent(Z, depth)
print(incons[-10:])

last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
print("clusters:", k)

Example #26
def hierarchical_clusters(X,
                          window_size,
                          t,
                          threshold=0.05,
                          method='single',
                          depth=2,
                          criterion='distance',
                          n_jobs=1):
    """
    Cluster M time series into hierarchical clusters using agglomerative
    approach. This function is more or less a convenience wrapper around 
    SciPy's scipy.cluster.hierarchy functions, but uses the MPDist algorithm
    to compute distances between each pair of time series.

    Note
    ----
    Memory usage could potentially be high depending on the length of your
    time series and how many distances are computed!
    
    Parameters
    ----------
    X : array_like
        An M x N matrix where M is the time series and N is the observations at
        a given time.
    window_size : int
        The window size used to compute the MPDist.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the 
        threshold to apply when forming flat clusters.
        For 'maxclust' criteria, this would be max number of clusters 
        requested.
    threshold : float, Default 0.05
        The percentile from which the MPDist is taken. By default it is
        set to 0.05 based on empirical research results from the paper.
        Generally, you should not change this unless you know what you are
        doing! This value must be a float greater than 0 and less than 1.
    method : str, Default single
        The linkage algorithm to use.
        Options: {single, complete, average, weighted}
    depth : int, Default 2
        An integer greater than 0 specifying the number of levels below
        a non-singleton cluster to allow.
    criterion : str, Default distance
        Options: {inconsistent, distance, maxclust, monocrit}
        The criterion to use in forming flat clusters.
          ``inconsistent`` :
              If a cluster node and all its
              descendants have an inconsistent value less than or equal
              to `t`, then all its leaf descendants belong to the
              same flat cluster. When no non-singleton cluster meets
              this criterion, every node is assigned to its own
              cluster. (Default)
          ``distance`` :
              Forms flat clusters so that the original
              observations in each flat cluster have no greater a
              cophenetic distance than `t`.
          ``maxclust`` :
              Finds a minimum threshold ``r`` so that
              the cophenetic distance between any two original
              observations in the same flat cluster is no more than
              ``r`` and no more than `t` flat clusters are formed.
          ``monocrit`` :
              Forms a flat cluster from a cluster node c
              with index i when ``monocrit[j] <= t``.
              For example, to threshold on the maximum mean distance
              as computed in the inconsistency matrix R with a
              threshold of 0.8 do::
                  MR = maxRstat(Z, R, 3)
                  cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
    n_jobs : int, Default 1
        The number of cpu cores used to compute the MPDist.
    
    Returns
    -------
    clusters : dict
        Clustering statistics, distances and labels.
        
        >>> {
        >>>     pairwise_distances: MPDist between pairs of time series as 
        >>>                         np.ndarray,
        >>>     linkage_matrix: clustering linkage matrix as np.ndarray,
        >>>     inconsistency_statistics: inconsistency stats as np.ndarray,
        >>>     assignments: cluster label associated with input X location as
        >>>                  np.ndarray,
        >>>     cophenet: float the cophenet statistic,
        >>>     cophenet_distances: cophenet distances between pairs of time 
        >>>                         series as np.ndarray
        >>>     class: hclusters
        >>> }
    """
    # valid SciPy clustering options to work with custom distance metric
    valid_methods = {'single', 'complete', 'average', 'weighted'}
    valid_criterions = {'inconsistent', 'distance', 'monocrit', 'maxclust'}
    method = method.lower()
    criterion = criterion.lower()

    # error handling
    if not core.is_array_like(X):
        raise ValueError('X must be array like!')

    if not isinstance(t, (float, int)):
        raise ValueError('t must be a scalar (int or float)')

    if not isinstance(threshold, float) or threshold <= 0 or threshold >= 1:
        raise ValueError('threshold must be a float greater than 0 and less'\
            ' than 1')

    if not isinstance(depth, int) or depth < 1:
        raise ValueError('depth must be an integer greater than 0')

    if method not in valid_methods:
        opts_str = ', '.join(valid_methods)
        raise ValueError('method may only be one of: ' + opts_str)

    if criterion not in valid_criterions:
        opts_str = ', '.join(valid_criterions)
        raise ValueError('criterion may only be one of: ' + opts_str)

    Y = pairwise_dist(X, window_size, threshold=threshold, n_jobs=n_jobs)
    Z = linkage(Y, method=method)
    R = inconsistent(Z, d=depth)
    c, coph_dists = cophenet(Z, Y)
    T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)

    return {
        'pairwise_distances': Y,
        'linkage_matrix': Z,
        'inconsistency_statistics': R,
        'assignments': T,
        'cophenet': c,
        'cophenet_distances': coph_dists,
        'class': 'hclusters'
    }
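A hedged usage sketch for hierarchical_clusters (the series are synthetic, and pairwise_dist/core are assumed to be provided by the surrounding module):

import numpy as np

ts = np.cumsum(np.random.randn(5, 300), axis=1)  # five synthetic random-walk time series
result = hierarchical_clusters(ts, window_size=32, t=2, criterion='maxclust')
print(result['assignments'])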
Example #27
    def cluster_students_by_responses(self, cluster_by='correct_incorrect', min_joint_questions=10, fcluster_criteria=0.8,
                                          one_section_questions_only=True, load_raw_data=False,
                                          load_processed_data=True,
                                          processed_data_file_name='events_table_math_processed_first_attempt.csv',
                                          load_similarity_matrix=False, is_plot=True):

        """
        :param cluster_by:
            'sum_joint_questions' - cluster students by the questions they answered regardless of response
            'correct_incorrect' - cluster by the number of errors divided by the number of joint responses (questions answered by both students)
            'percent_joint_errors' - cluster by the percent of joint errors (questions both students answered incorrectly) which are joint for both students

        :param min_joint_questions: minimal number of questions both students answered in order to calculate distance
        :param load_raw_data:
        :param load_processed_data:
        :param processed_data_file_name:
        :param load_similarity_matrix:
        :param is_plot:
        :return:
        """
        # ---- load events data and meta data
        EA.load_data(load_raw_data=load_raw_data, load_processed_data=load_processed_data,
                     processed_data_file_name=processed_data_file_name)

        data = EA.processed_event_data
        md = EA.meta_data.set_index('sElementID')

        # ----- define similarity matrix properties:
        if cluster_by == 'sum_joint_questions':
            data_column = 'is_correct_response' # which columns from raw data to use
            mask_type = 'bool' # apply mask over data columns
            metric_type = 'hamming'  # type of distance for pairwise dist for linkage calculation
            linkage_type = 'complete' #linkage for pairwise clustering
            fillna = None

        elif cluster_by == 'correct_incorrect':
            data_column = 'is_correct_response'
            mask_type = 'nan_to_zero'
            metric_type = 'jaccard_intersection'  # todo: consider normalized measures / give a higher score to mistakes
            linkage_type = 'average'
            distance_normalization='min_max'
            fillna = 'max_and_std'

        elif cluster_by == 'percent_joint_errors': # todo - define this and check.
            data_column = 'is_correct_response'
            mask_type = 'nan_to_zero' #replace 0 to -1 and nan to zero
            metric_type = 'joint_minus_percent_drop0'
            linkage_type = 'average'
            fillna = 'max'
            distance_normalization = None

        elif cluster_by == 'num_joint_errors': # todo - define this and check.
            data_column = 'is_correct_response'
            mask_type = 'nan_to_zero'
            # calculates the mean correct/incorrect response over joint questions
            metric_type = 'joint_minus_count_drop0'
            linkage_type = 'average'

            distance_normalization='min_max'
            fillna = 'max'

        metric_name = metric_type
        if metric_type not in dir(distance):
            metric_type = data_handle.define_distance_metric(distance_name=metric_type,
                                                             min_intersection=min_joint_questions)

        #self.clustering_details = utils.make_dict_from_locals(locals(), keys=
            #['data_column', 'mask_type', 'metric_type', 'fillna', 'linkage_type'])  # the column used for clustering


        if one_section_questions_only:
            data = data.loc[data.n_sections == 1]
        # ---- load similarity matrix according to metric name
        if load_similarity_matrix:
            students_similarity_matrix = df.from_csv('temp_similarity_matrix_%s_%s_metric_normalized_%s.csv' % (mask_type, metric_name,str(distance_normalization)))
            if students_similarity_matrix.shape[1]==1:
                students_similarity_matrix=students_similarity_matrix.T.values[0]
            elif students_similarity_matrix.shape[1]>1:
                students_similarity_matrix=distance.squareform(students_similarity_matrix.values)
            processed_students_responses=df.from_csv('%s_students_responses.csv' % mask_type)


        else:

            process_for_figure = True

            students_responses=data_handle.pivot(data, index_col='student_id',columns_col='question_id', values_col=data_column,
                                   agg_function='first',convert_to_numeric=True)


            #-------------------------------
            processed_students_responses = data_handle.mask_data(students_responses, type=mask_type)
            print(processed_students_responses)
            #students_similarity_matrix10 = distance.pdist(processed_students_responses.head(10), metric=metric_type)
            students_similarity_matrix=distance.pdist(processed_students_responses, metric=metric_type)
            df(distance.squareform(students_similarity_matrix), index=processed_students_responses.index,
               columns=processed_students_responses.index).to_csv(
                'temp_similarity_matrix_%s_%s_metric_NOT_Normalized.csv' % (mask_type, metric_name))
            students_similarity_matrix=data_handle.normalize_data(students_similarity_matrix,min_value=0, by=distance_normalization, fillna=fillna)
            students_similarity_matrix = students_similarity_matrix[0]
            df(distance.squareform(students_similarity_matrix), index=processed_students_responses.index,
               columns=processed_students_responses.index).to_csv(
                'temp_similarity_matrix_%s_%s_metric_normalized_min_max.csv' % (mask_type, metric_name))
            df(processed_students_responses).to_csv('%s_students_responses.csv' % mask_type)



        #def hierarchical_clustering(similarity_matrix, linkage_method):
        #students_similarity_matrix=df.from_csv('temp_similarity_matrix_100.csv')
        Z=linkage(students_similarity_matrix, method=linkage_type)
        Z_df=df(Z,columns=['obs1','obs2','distance','n_in_cluster'])
        c=cophenet(Z,students_similarity_matrix)
        print('cophenet=%f for linkage==%s  and distance==%s' %(c[0],linkage_type, metric_name))
        inconsistency = df(inconsistent(Z), columns=['mean_link', 'std_link', 'n_links', 'inconsistency_coeff'])
        students_clusters=pd.Series(fcluster(Z, fcluster_criteria))#5, criterion='maxclust')
        print(students_clusters.value_counts())
        cluster_responses=processed_students_responses.copy()


        cluster_responses['cluster']=students_clusters
        cluster_responses.sort_values('cluster', inplace=True)
        questions_count_by_cluster=df(columns=set(students_clusters))

        #show the dendrogram next to the feature matrix to check if it makes sense
        if is_plot:
            cr=cluster_responses.T.apply(pd.to_numeric).drop('cluster')
            f,axes=plt.subplots(2,1,sharex=True)
            plt.subplot(211)
            plt.title('clustering by students response %s' %cluster_by)
            plt.ylabel('distance')

            dendrogram(Z,no_labels=True, color_threshold=0.7*max(Z[:,2]))
            plt.subplot(212)
            cr.index=md.loc[cr.index]['question_index'].drop_duplicates()
            cr.sort_index(inplace=True)
            #cr=cr.applymap(lambda x : np.nan if x==0 else x).dropna(how='all',axis=0)

            plt.pcolor(cr)

            plt.ylabel('question')
            plt.xlabel('student')
            #plt.plot(kind='bar')
            plt.savefig('heatmap_and_dendrogram_%s.png' % cluster_by)
            plt.close()

        self.students_clusters=students_clusters
        self.clustered_students_responses=cluster_responses
        self.students_similarity_matrix=students_similarity_matrix
        return students_clusters, cluster_responses, students_similarity_matrix
Example #28
def get_clusters_Hierarchy_clustering(x, hier_dict):
    #default value
    L_method = 'single'
    L_metric = 'euclidean'
    t = 0.9
    criterionH = 'inconsistent'
    depth = 2
    R = None
    colR = 3
    # L_metric can be 'braycurtis', 'canberra', 'chebyshev', 'cityblock',
    # 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard',
    # 'kulsinski', 'mahalanobis', 'matching', 'minkowski',
    # 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
    # 'sokalsneath', 'sqeuclidean'
    # Note that 'jensenshannon' and 'yule' may result in a condensed distance matrix containing infinite values
    if 'L_metric' in hier_dict.keys():
        L_metric = hier_dict['L_metric']

    # L_method can be 'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'
    if 'L_method' in hier_dict.keys():
        L_method = hier_dict['L_method']
    if L_method == 'centroid' or L_method == 'median' or L_method == 'ward':
        if L_metric != 'euclidean':
            L_metric = 'euclidean'
            print('\n')
            print('*************Note:**************')
            print('Method ' + str(L_method) +
                  ' requires the distance metric to be Euclidean')

    if 'optimal_ordering' in hier_dict.keys():
        optimal_ordering = hier_dict['optimal_ordering']
    else:
        optimal_ordering = False
    Z = linkage(x,
                method=L_method,
                metric=L_metric,
                optimal_ordering=optimal_ordering)
    # criterionH can be 'inconsistent', 'distance', 'maxclust', 'monocrit', or 'maxclust_monocrit'
    if 'criterionH' in hier_dict.keys():
        criterionH = hier_dict['criterionH']
    else:
        criterionH = 'inconsistent'
    if 'depth' in hier_dict.keys():
        depth = hier_dict['depth']
    else:
        depth = 2
    if 't' in hier_dict.keys():
        t = hier_dict['t']
        #for 'maxclust' or 'maxclust_monocrit' criteria,
        #t would be max number of clusters requested.
    elif criterionH == 'maxclust_monocrit' or criterionH == 'maxclust':
        t = 20

    if 'R' in hier_dict.keys():
        R = hier_dict['R']
    if criterionH == 'inconsistent' or criterionH == 'maxclust_monocrit':
        #The inconsistency matrix to use for the 'inconsistent' criterion.
        #R is computed if not provided.
        if R is None:
            R = inconsistent(Z, d=depth)
        else:
            R = np.asarray(R, order='c')
    if criterionH == 'monocrit':
        if R is None:
            R = inconsistent(Z, d=depth)
        # colR is the column of 'R' to use as the statistic
        return fcluster(Z,
                        criterion='monocrit',
                        t=t,
                        monocrit=maxRstat(Z, R, colR))
    elif criterionH == 'maxclust_monocrit':
        return fcluster(Z,
                        criterion='maxclust_monocrit',
                        t=t,
                        monocrit=maxinconsts(Z, R))
    else:
        return fcluster(Z, criterion=criterionH, depth=depth, R=R, t=t)
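A hedged call sketch for get_clusters_Hierarchy_clustering (x is synthetic; the hier_dict keys match those read by the function above):

import numpy as np

x = np.random.rand(30, 4)
hier_dict = {'L_method': 'average', 'criterionH': 'maxclust', 't': 3}
labels = get_clusters_Hierarchy_clustering(x, hier_dict)
print(labels)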
fancy_dendrogram(
    Z,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=16,
)
plt.show()

from scipy.cluster.hierarchy import inconsistent

depth = 5
incons = inconsistent(Z, depth)
print(incons[-10:])

depth = 3
incons = inconsistent(Z, depth)
print(incons[-10:])

last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
Example #30
# the vertical lines indicate the distance at which the labels were merged

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram (full)')
plt.xlabel('sample clusters')
plt.ylabel('distance')

dendrogram(Z, leaf_rotation=90., leaf_font_size=8.,)

plt.show()

# truncate the dendrogram for better visibility
# with only the last p=12 merged clusters,
# and labeled with cluster's inconsistency coefficient

R = inconsistent(Z)  # Question 4: params 1/2: ids of the merged clusters; param 3: distance; param 4: number of points

# truncate the dendrogram for better visibility
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',  # to explain
    p=12,  # to explain
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
    )
plt.show()
Example #31
 def inconsistency(self):
     depth = 3
     incons = inconsistent(self.linked, depth)
     return incons[-15:]
Example #32
def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate silhouette score based on hierarchical clustering.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        silhouetteScore: string, containing the result of silhouette score 
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, silhouette score
        inconsistentMax: float, upper bound of threshold to calculate silhouette score if using Inconsistent criterion 
        maxclustMax: integer, upper bound of threshold to calculate silhouette score if using Maxclust criterion
        distanceMax: float, upper bound of threshold to calculate silhouette score if using Distance criterion
        distanceMin: float, lower bound of threshold to calculate silhouette score if using Distance criterion
        monocritMax: float, upper bound of threshold to calculate silhouette score if using Monocrit criterion
        monocritMin: float, lower bound of threshold to calculate silhouette score if using Monocrit criterion
        threshold: float/integer/string, threshold (t) value that users entered; equals 'N/A' if users leave the field blank
    """
    activeFiles = len(labels) - 1
    if activeFiles > 2:  # the number of labels should be more than 2 and less than n_samples - 1
        Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
        Z = hierarchy.linkage(Y, method=linkage_method)

        monocrit = None

        # 'maxclust' range
        maxclustMax = len(labels) - 1

        # 'inconsistent' range
        R = hierarchy.inconsistent(Z, 2)
        inconsistentMax = R[-1][-1]
        slen = len('%.*f' % (2, inconsistentMax))
        inconsistentMax = float(str(inconsistentMax)[:slen])

        # 'distance' range
        d = hierarchy.cophenet(Z)
        distanceMax = d.max()
        slen = len('%.*f' % (2, distanceMax))
        distanceMax = float(str(distanceMax)[:slen])
        distanceMin = d.min() + 0.01
        slen = len('%.*f' % (2, distanceMin))
        distanceMin = float(str(distanceMin)[:slen])

        # 'monocrit' range
        MR = hierarchy.maxRstat(Z, R, 0)
        monocritMax = MR.max()
        slen = len('%.*f' % (2, monocritMax))
        monocritMax = float(str(monocritMax)[:slen])
        monocritMin = MR.min() + 0.01
        slen = len('%.*f' % (2, monocritMin))
        monocritMin = float(str(monocritMin)[:slen])

        threshold = request.form['threshold']
        if threshold == '':
            threshold = str(threshold)
        else:
            threshold = float(threshold)

        if request.form['criterion'] == 'maxclust':
            criterion = 'maxclust'
            if (threshold == '') or (threshold > maxclustMax):
                threshold = len(labels) - 1
            else:
                threshold = round(float(threshold))
        elif request.form['criterion'] == 'distance':
            criterion = 'distance'
            if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
                threshold = distanceMax
        elif request.form['criterion'] == 'inconsistent':
            criterion = 'inconsistent'
            if (threshold == '') or (threshold > inconsistentMax):
                threshold = inconsistentMax
        elif request.form['criterion'] == 'monocrit':
            criterion = 'monocrit'
            monocrit = MR
            if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
                threshold = monocritMax
        scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

        if len(set(scoreLabel)) <= 1:  # all the files were assigned to a single cluster
            silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
            silhouetteAnnotation = "because your files are too similar to each other, the program classified all of them into the same cluster"
            score = 'invalid for only 1 cluster'
            inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
        else:
            score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
            score = round(score, constants.ROUND_DIGIT)
            inequality = '≤'  # already a str in Python 3; no decode needed
            silhouetteScore = "Silhouette Score: " + str(
                score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
            silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    else:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'


    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
Example #33
 def check_inconsistent_q_single(self, depth):
     X = eo['Q-X']
     Z = linkage(X, 'single', 'euclidean')
     R = inconsistent(Z, depth)
     Rright = eo['inconsistent-Q-single-%d' % depth]
     assert_allclose(R, Rright, atol=1e-05)
Example #34
dendrogram_tune(z,
                truncate_mode='lastp',
                p=12,
                leaf_rotation=90.,
                leaf_font_size=12.,
                show_contracted=True,
                annotate_above=10,
                max_d=20)
plt.show()
'''Automatic dendrogram cut method
inconsistency: i = (h_i - avg(h_i)) / std(h_i)
where i is each cluster merge (a horizontal line in the dendrogram)
'''
depth = 5  # number of levels below each merge used to compute the inconsistency
incons = inconsistent(z, depth)
incons[-10:]
# this does not work well because the heights do not follow a normal distribution, so it depends heavily on the specified depth
# the merge under analysis will very likely be considered an outlier with respect to the previous ones
# each merge takes more distance to happen
'''The elbow method'''

last = z[-10:, 2]
last_rev = last[::-1]  # reversed: the last element becomes the first
acc = np.diff(last, 2)  # second differences: first with second, second with third, and so on
acc_rev = acc[::-1]
idx = np.arange(1, len(last) + 1)
plt.plot(idx, last_rev)
Example #35
 def check_inconsistent_tdist(self, method, depth, atol):
     Y = squareform(_tdist)
     Z = linkage(Y, method)
     R = inconsistent(Z, depth)
     Rright = eo['inconsistent-%s-tdist-depth-%d' % (method, depth)]
     assert_allclose(R, Rright, atol=atol)
Example #36
File: VCF.py Project: melakbet/MAVR
    def hierarchical_clustering(self, method='average', dendrogramm_max_y=2000,
                                sample_name=None, save=False, clustering_dir="clustering",
                                dendrogramm_color_threshold=1000,
                                draw_dendrogramm=True,
                                write_inconsistent=True,
                                write_correlation=True):
        # IMPORTANT! Use only for one-sample vcf
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
        region_dict = self._split_regions()
        positions_dict = OrderedDict({})
        correlation_dict = OrderedDict({})
        linkage_dict = OrderedDict({})
        inconsistent_dict = OrderedDict({})
        clusters_dict = OrderedDict({})
        if draw_dendrogramm or write_correlation or write_inconsistent:
            os.system("mkdir -p %s" % clustering_dir)
        for region in region_dict:
            positions_dict[region] = np.array([[record.pos] for record in region_dict[region]])

            # allowed methods(used to calculate distance between clusters):
            # 'complete'    -   Farthest Point Algorithm
            # 'single'      -   Nearest Point Algorithm
            # 'average'     -   UPGMA algorithm, distance between clusters is calculated as average from pairwise
            #                   distances between elements of clusters
            # 'weighted'    -   WPGMA algorithm
            # 'centroid'    -   UPGMC algorithm
            # 'median'      -   WPGMC algorithm
            # 'ward'        -   incremental algorithm

            distance_matrix = pdist(positions_dict[region])
            #print(distance_matrix)
            linkage_dict[region] = linkage(distance_matrix, method=method)
            if draw_dendrogramm:
                plt.figure(1, dpi=150, figsize=(50, 20))
                dendrogram(linkage_dict[region],
                           color_threshold=dendrogramm_color_threshold,
                           leaf_font_size=4,
                           distance_sort=True)
                plt.ylim(ymax=dendrogramm_max_y)
                plt.axhline(y=500, color="purple")
                plt.axhline(y=1000, color="black")
                plt.savefig("%s/clustering_%s.svg" % (clustering_dir, region))
                plt.close()

            # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cophenet.html#scipy.cluster.hierarchy.cophenet
            # calculates cophenetic correlation coefficient to estimate accuracy of clustering
            correlation_dict[region] = cophenet(linkage_dict[region], distance_matrix)[0]

            # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.inconsistent.html#scipy.cluster.hierarchy.inconsistent
            # calculates inconsistent coeff

            inconsistent_dict[region] = inconsistent(linkage_dict[region])
            if write_inconsistent:
                np.savetxt("%s/inconsistent_coefficient_%s.t" % (clustering_dir, region), inconsistent_dict[region])

            #clusters_dict[region] = fcluster(linkage_dict[region], 1)
            #np.savetxt("clustering/clusters_%s.t" % region, clusters_dict[region], fmt="%i")
        if write_correlation:
            sample = sample_name
            if not sample:
                sample = self.samples[0]
            with open("%s/correlation.t" % clustering_dir, "w") as cor_fd:
                cor_fd.write("sample\t%s\n" % ("\t".join(list(region_dict.keys()))))
                cor_fd.write("%s\t%s\n" % (sample, "\t".join([str(correlation_dict[region]) for region in region_dict])))

        if save:
            self.linkage_dict = linkage_dict

        return region_dict, linkage_dict
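A small self-contained check of the cophenetic-correlation idea used above, on toy variant positions (values are illustrative, not from any real VCF):

import numpy as np
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import pdist

positions = np.array([[10], [12], [15], [500], [505], [1000]])
dist = pdist(positions)
Z = linkage(dist, method='average')
c, _ = cophenet(Z, dist)
print("cophenetic correlation:", round(c, 3))  # near 1 means the dendrogram preserves the original distances well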
Example #37
0
basis of the inconsistency coefficient. This compares the height of a link
with the mean and standard deviation of the link heights up to a given depth
below it. A group with low inconsistency is one where the parent and children
link heights are similar, which suggests the group reflects real similarity
between its children. The depth of the inconsistency cutoff tells us how many
levels below each link the cutoff applies to.
"""

# Then compute the linkages (average represents the MRP dataset best)
Z = linkage(dist_vec, method='average')

# Compute cophenetic correlation distance between Z and flat dist_mat
[c, d] = cophenet(Z, Y=dist_vec)

# Compute the inconsistency matrix for non-singleton clusters (default depth d=2)
R = inconsistent(Z)
# Now compute the maximum inconsistency coefficient per Cluster
MI = maxinconsts(Z, R)

# Cluster the events based on an inconsistency threshold of 1.0
indices = fcluster(Z, t=1.0, criterion='inconsistent')

# Visualize the distribution of correlation values
samp_inds = numpy.random.randint(0, len(dist_vec), 10000)  # randint excludes the upper bound; random_integers is deprecated and would allow an out-of-range index
samp_corrs = []
for ind in samp_inds:
    samp_corrs.append(dist_vec[ind])

# Plot the dendrogram...if it's not way too huge
dendrogram(Z, color_threshold=1 - corr_thresh, distance_sort='ascending')
plt.show()
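The MI vector above is computed but never used in this fragment. A common follow-up (an illustrative sketch, not part of the original) is to pass it to fcluster as a monotonic criterion:

import numpy as np
from scipy.cluster.hierarchy import linkage, inconsistent, maxinconsts, fcluster
from scipy.spatial.distance import pdist

rng = np.random.default_rng(3)
Z = linkage(pdist(rng.random((12, 2))), method='average')
R = inconsistent(Z)
MI = maxinconsts(Z, R)  # max inconsistency over each link and its descendants

# criterion='monocrit' forms a flat cluster at link i when monocrit[i] <= t;
# maxinconsts makes the statistic monotonic down the tree, so cuts are consistent
labels = fcluster(Z, t=1.0, criterion='monocrit', monocrit=MI)
print(labels)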
Example #38
0
def check_inconsistent_tdist(self, depth):
    Z = hierarchy_test_data.linkage_ytdist_single
    assert_allclose(inconsistent(Z, depth),
                    hierarchy_test_data.inconsistent_ytdist[depth])
def _cluster(df):

    flow_df = df.copy()
    sites = df['Site'].to_list()
    sites_len = len(sites)

    df = df.fillna(0).drop(columns=["Site", "Flow"])
    df = df.to_numpy()

    try:
        distance = dtw.distance_matrix_fast(df, compact=True)
    except Exception as e:
        print("Distance calculation failed (%s); shouldn't continue" % e)
        exit(99)

    distance_ssd = ssd.squareform(distance)

    # Hierarchical clustering - linkage matrix Z
    Z = linkage(distance_ssd, "average")

    # Inconsistency matrix - mean distance, standard deviation, link count and coefficient for each linkage row
    IN = inconsistent(Z)

    # Creating a temporary data-frame to extract clusters from linkage and inconsistent matrices
    cols = ['pt1', 'pt2', 'dist', 'tot_pts', 'mean_dist', 'SD_dist', 'cls_level', 'co_eff']
    temp_df = pd.DataFrame(np.hstack([Z, IN]), columns=cols)

    # get the bins - using only the range of the first-level clustering distances;
    # further clustering levels will increase the linkages' mean distance, and
    # points that fall above the first-level mean distances are deemed outliers
    cls_level_1_distances = temp_df.loc[temp_df['cls_level'] == 1, 'mean_dist']
    q1, q3 = np.percentile(cls_level_1_distances, [25, 75])
    IQR = q3 - q1

    # Freedman-Diaconis-style bin width - keeps the bin count small while still representing the spread well
    bw = 2 * IQR / int(round(sites_len ** (1. / 3))) * BIN_FACTOR

    bins_ = (np.arange(min(cls_level_1_distances)- 0.1, max(cls_level_1_distances) + bw, bw))

    # hierarchical clustering keeps merging until a single cluster holds all data points;
    # we don't need the linkage-matrix rows that represent higher-level clustering,
    # so keep only the link rows that involve leaf nodes (i.e. single-site data points)
    temp_df = temp_df[(temp_df['pt1'] < sites_len) | (temp_df['pt2'] < sites_len) ]

    # apply the bins
    temp_df['bins'] = pd.cut(temp_df['mean_dist'], bins_ ).astype('str')

    # Map intervals to digits, for readability
    map_dict = {str(value):counter for counter, value in enumerate(temp_df['bins'].unique()) if value != 'nan'}
    temp_df['Cluster'] = temp_df['bins'].map(map_dict)

    # NaNs are the outliers; treat each as a singleton cluster by giving every NaN its own name
    total_nans = (temp_df['Cluster'].isna().sum())
    temp_df.loc[temp_df['Cluster'].isna(), 'Cluster'] = [ 'O' + str(i) for i in range(1,total_nans+1) ]

    # Combine linkage matrix columns - to create a single column view of, site vs cluster mapping
    df1 = temp_df.loc[temp_df['pt1'] < sites_len, ['pt1', 'Cluster']].rename(columns={'pt1':'Site'}).copy()
    df2 = temp_df.loc[temp_df['pt2'] < sites_len, ['pt2', 'Cluster']].rename(columns={'pt2':'Site'}).copy()
    temp_df = pd.concat([df1, df2]).sort_values(by='Site').reset_index(drop=True)
    flow_df['Cluster'] = temp_df['Cluster']

    # # visualizing
    # sites_n = [(str(site) + '-' + str(i)) for i, site in enumerate(sites)]
    # fig, ax = plt.subplots()
    # fig.set_size_inches(20,40)
    # dend = dendrogram(Z, leaf_rotation=90, leaf_font_size=8, labels=sites_n, ax=ax)
    # plt.show()
    return flow_df
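A minimal usage sketch for _cluster on toy flow data; the imports, column layout, and BIN_FACTOR value are assumptions about the surrounding script (and dtaidistance's compiled C library must be available for distance_matrix_fast):

import numpy as np
import pandas as pd
import scipy.spatial.distance as ssd
from dtaidistance import dtw
from scipy.cluster.hierarchy import linkage, inconsistent

BIN_FACTOR = 1.0  # assumed tuning constant; the original value is not shown

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.random((6, 12)))  # 6 sites, 12 observations each
toy.insert(0, 'Flow', rng.random(6))
toy.insert(0, 'Site', ['S%d' % i for i in range(6)])

print(_cluster(toy)[['Site', 'Cluster']])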