def flatcluster(
    dRow,
    runLogs,
    interClusterDistance="complete",
    plotDendrogram=True,
    cMethod="inconsistent",
    cValue=2.5,
):
    """Cut a hierarchical clustering into flat clusters and tag each run log.

    :param dRow: input handed straight to ``linkage`` (presumably a condensed
        distance vector -- confirm against the caller).
    :param runLogs: sequence whose items are indexable; ``item[0]`` must be a
        dict. ``item[0]["Cluster"]`` is set to the cluster label (as ``str``)
        of the observation at the same position.
    :param interClusterDistance: linkage method passed to ``linkage``.
    :param plotDendrogram: when true, ``plotdendrogram`` is called on the
        linkage matrix.
    :param cMethod: ``criterion`` argument of ``fcluster``.
    :param cValue: threshold ``t`` argument of ``fcluster``.
    :returns: ``(z, clusters, runLogs)`` -- the linkage matrix, the flat
        cluster labels and the (mutated in place) ``runLogs``.

    Side effects: prints a per-cluster size summary and stores the number of
    clusters in the module-level ``clusterCount``.
    """
    from collections import Counter

    global clusterCount

    z = linkage(dRow, interClusterDistance)
    if plotDendrogram:
        plotdendrogram(z)

    clusters = fcluster(z, cValue, cMethod)
    noClusters = max(clusters)
    print("Total number of clusters:", noClusters)

    # One pass over the labels instead of rescanning them once per cluster.
    sizes = Counter(int(label) for label in clusters)
    for label in range(1, noClusters + 1):
        print("Cluster", str(label), ":", str(sizes[label]))

    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]["Cluster"] = str(clusters[i])
    return z, clusters, runLogs
def _run_hier_clust_on_centroids(self, method='average'):
    '''Run hierarchical clustering on the per-label centroids of the template data.

    The centroid of every unique label in ``self.templateLabels`` is computed
    from ``self.templateMat``, the centroids are linked with scipy using
    *method*, and the dendrogram is cut at heights ranked by the standard
    deviation column of the depth-2 inconsistency matrix.

    Attributes set on ``self``: ``y`` (condensed centroid distances), ``z``
    (linkage matrix), ``bestModeLabels`` (mode labels for at most the 25
    top-ranked cuts), ``modeSilValues`` and ``modeSizes`` (arrays with one
    entry per retained cut) and ``clusterSilValues`` (mean silhouette value of
    the template labels themselves).
    '''
    uniqueLabels = np.sort(np.unique(self.templateLabels))
    centroids = np.array([
        self.templateMat[np.where(self.templateLabels == i)[0], :].mean(axis=0)
        for i in uniqueLabels
    ])
    self.y = pdist(centroids)
    self.z = hierarchy.linkage(self.y, method)
    r2 = hierarchy.inconsistent(self.z, 2)

    ## rank the mean link heights by their standard deviation (largest first)
    meanHeights = r2[:, 0]
    stdHeights = r2[:, 1]
    rankedInds = np.argsort(stdHeights)[::-1]
    bestCutPoints = meanHeights[rankedInds]

    ## save centroid labels for every distinct cut of the dendrogram;
    ## only the first (best ranked) cut producing a given k is kept
    allCentroidLabels = {}
    rankedK = []
    for cp in bestCutPoints:
        centroidLabels = hierarchy.fcluster(self.z, t=cp, criterion='distance')
        k = len(np.unique(centroidLabels))
        if str(k) in allCentroidLabels:
            continue
        allCentroidLabels[str(k)] = centroidLabels
        rankedK.append(k)

    ## save the top xx modes
    self.bestModeLabels = []
    print('doing ranking...')
    for rk in rankedK[:25]:
        centroidLabels = allCentroidLabels[str(rk)]
        modeLabels = self._get_mode_labels(self.templateLabels, centroidLabels, uniqueLabels)
        self.bestModeLabels.append(modeLabels)

    ## provide silvalue ranks in case we wish to reorder the top xx modes by sil value
    self.modeSilValues = []
    self.modeSizes = []
    allEvents = [self.templateData]
    for modeLabels in self.bestModeLabels:
        numClusters = np.unique(modeLabels).size
        silValues = get_silhouette_values(allEvents, [modeLabels], subsample=self.noiseSample,
                                          minNumEvents=5000, resultsType='raw')
        self.modeSilValues.append(silValues['0'].mean())
        self.modeSizes.append(numClusters)

    silValues = get_silhouette_values(allEvents, [self.templateLabels], subsample=self.noiseSample,
                                      minNumEvents=5000, resultsType='raw')
    self.clusterSilValues = silValues['0'].mean()
    self.modeSilValues = np.array(self.modeSilValues)
    self.modeSizes = np.array(self.modeSizes)
def check_maxRstat_Q_linkage(self, method, i):
    """Check ``maxRstat(Z, R, i)`` on the Q data set against the reference helper.

    :param method: linkage method used to build ``Z`` from ``Q_X``.
    :param i: column of the inconsistency matrix ``R`` to maximise over.
    """
    X = hierarchy_test_data.Q_X
    Z = linkage(X, method)
    R = inconsistent(Z)
    # Use the requested column; it was previously hard-coded to 1, so every
    # parametrisation silently exercised the same column.
    MD = maxRstat(Z, R, i)
    expectedMD = calculate_maximum_inconsistencies(Z, R, i)
    assert_allclose(MD, expectedMD, atol=1e-15)
def test_is_valid_im_4_and_up(self): # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15 # (step size 3). for i in xrange(4, 15, 3): y = np.random.rand(i*(i-1)//2) Z = linkage(y) R = inconsistent(Z) assert_(is_valid_im(R) == True)
def check_maxRstat_Q_linkage(self, method, i):
    """Check ``maxRstat(Z, R, i)`` on the Q data set against the reference helper.

    :param method: linkage method used to build ``Z`` from ``eo['Q-X']``.
    :param i: column of the inconsistency matrix ``R`` to maximise over.
    """
    X = eo['Q-X']
    Z = linkage(X, method)
    R = inconsistent(Z)
    # Use the requested column; it was previously hard-coded to 1, so every
    # parametrisation silently exercised the same column.
    MD = maxRstat(Z, R, i)
    expectedMD = calculate_maximum_inconsistencies(Z, R, i)
    assert_allclose(MD, expectedMD, atol=1e-15)
def test_is_valid_im_4_and_up_neg_dist(self): # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15 # (step size 3) with negative link counts. for i in range(4, 15, 3): y = np.random.rand(i*(i-1)//2) Z = linkage(y) R = inconsistent(Z) R[i//2,2] = -0.5 assert_(is_valid_im(R) == False) assert_raises(ValueError, is_valid_im, R, throw=True)
def test_is_valid_im_4_and_up_neg_dist(self): # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15 # (step size 3) with negative link counts. for i in xrange(4, 15, 3): y = np.random.rand(i*(i-1)//2) Z = linkage(y) R = inconsistent(Z) R[i//2,2] = -0.5 assert_(is_valid_im(R) == False) assert_raises(ValueError, is_valid_im, R, throw=True)
def test_is_valid_im_4_and_up_neg_index_right(self): # Tests is_valid_im(R) on im on observation sets between sizes 4 and 15 # (step size 3) with negative link height standard deviations. for i in xrange(4, 15, 3): y = np.random.rand(i * (i - 1) // 2) Z = linkage(y) R = inconsistent(Z) R[i // 2, 1] = -2.0 assert_(is_valid_im(R) == False) assert_raises(ValueError, is_valid_im, R, throw=True)
def inconsist(Z, depth):
    """Print the inconsistency coefficients of linkage *Z* and their extremes.

    :param Z: linkage matrix passed to ``scipy.cluster.hierarchy.inconsistent``.
    :param depth: number of levels passed as the second argument of
        ``inconsistent``.

    Prints, in order: column 3 of the inconsistency matrix, its maximum and
    its minimum. Returns ``None``.
    """
    from scipy.cluster.hierarchy import inconsistent

    incons = inconsistent(Z, depth)  # link statistics
    coefficients = incons[:, 3]
    # print() calls instead of Python-2 print statements.
    print(coefficients)
    print(max(coefficients))
    print(min(coefficients))
def Hierarchy(V, **kwargs):
    """Performs hierarchical clustering on *V*. The function essentially uses
    two scipy functions: ``linkage`` and ``fcluster``. See
    :func:`scipy.cluster.hierarchy.linkage` and
    :func:`scipy.cluster.hierarchy.fcluster` for the explanation of the
    arguments. Only the arguments that differ from those of scipy are listed
    here.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg inconsistent_percentile: if the clustering *criterion* for
        :func:`scipy.cluster.hierarchy.fcluster` is ``inconsistent`` and the
        threshold *t* is not given or is **None** (default), then the
        percentile specified by this argument, taken over the inconsistency
        coefficients, is used as the threshold. The coefficients come from
        *R* when it is supplied, otherwise they are computed with *depth*.
    :type inconsistent_percentile: double

    :arg n_clusters: specifies the maximal number of clusters. If this
        argument is given, then the function will automatically set
        *criterion* to ``maxclust`` and *t* equal to *n_clusters*.
    :type n_clusters: int

    :returns: flat cluster labels, one per row of *V*, as a 1-D array.
    :raises ImportError: if scipy is not installed.
    """

    try:
        from scipy.cluster.hierarchy import linkage, fcluster, inconsistent
    except ImportError:
        raise ImportError('Use of this function (Hierarchy) requires the '
                          'installation of scipy.')

    method = kwargs.pop('method', 'single')
    metric = kwargs.pop('metric', 'euclidean')
    Z = linkage(V, method=method, metric=metric)

    criterion = kwargs.pop('criterion', 'inconsistent')
    ip = kwargs.pop('inconsistent_percentile', 99.9)
    depth = kwargs.pop('depth', 2)
    R = kwargs.pop('R', None)
    monocrit = kwargs.pop('monocrit', None)

    # Pop ``t`` exactly once: re-popping it after the None check used to hand
    # an explicit ``t=None`` straight to fcluster.
    t = kwargs.pop('t', None)
    if t is None and criterion == 'inconsistent':
        # Derive the threshold from the same statistics fcluster will use.
        stats = np.asarray(R) if R is not None else inconsistent(Z, depth)
        t = np.percentile(stats[:, 3], ip)

    n_clusters = kwargs.pop('n_clusters', None)
    if n_clusters is not None:
        criterion = 'maxclust'
        t = n_clusters

    labels = fcluster(Z, t, criterion=criterion, depth=depth, R=R, monocrit=monocrit)
    return labels.flatten()
def flatcluster(dMatrix, clusterSetup):
    """Cluster the runs described by *dMatrix* and record each run's cluster.

    :param dMatrix: distance data handed to ``prepareDRow`` to obtain the
        input of ``linkage``.
    :param clusterSetup: dict of optional user settings. Recognised keys and
        their defaults: ``'inter-cluster distance'`` (``'complete'``),
        ``'plotDendrogram?'`` (``True``), ``'cutoff criteria'``
        (``'inconsistent'``) and ``'cutoff criteria value'`` (``2.5``).
    :returns: the flat cluster labels produced by ``fcluster``.

    Side effects: prints diagnostics, writes ``'Cluster'`` into
    ``runLogs[i][0]`` for every observation, and raises the module-level
    ``clusterCount`` to the largest label seen.
    """
    from collections import Counter

    # runLogs is only mutated in place; clusterCount is rebound below.
    global runLogs
    global clusterCount

    dRow = prepareDRow(dMatrix)
    print(dRow)

    # User-specified options, falling back to the defaults.
    method = clusterSetup.get('inter-cluster distance', 'complete')
    z = linkage(dRow, method)
    inc = inconsistent(z)
    print(inc)

    if clusterSetup.get('plotDendrogram?', True):
        plotdendrogram(z)

    cmethod = clusterSetup.get('cutoff criteria', 'inconsistent')
    cvalue = clusterSetup.get('cutoff criteria value', 2.5)
    clusters = fcluster(z, cvalue, cmethod)

    noClusters = max(clusters)
    print('Total number of clusters:', noClusters)
    # One pass over the labels instead of rescanning them once per cluster.
    sizes = Counter(int(label) for label in clusters)
    for label in range(1, noClusters + 1):
        print("Cluster", str(label), ":", str(sizes[label]))

    for runIndex, label in enumerate(clusters):
        runLogs[runIndex][0]['Cluster'] = str(label)
        if label > clusterCount:
            clusterCount = label
    return clusters
def do_hcluster(linkMe,
                metric=('euclidean', 'cityblock', 'correlation'),
                method=('single', 'complete', 'average', 'centroid', 'median', 'ward')):
    """Run hierarchical clustering for every metric/method combination.

    :param linkMe: m x n DataFrame; m: sample no. n: time point
    :param metric: metrics to try
    :param method: methods to try; ``centroid``, ``median`` and ``ward`` are
        only run with the ``euclidean`` metric
    :returns: ``(links, coph, inconsist, writeMe)`` where the first three are
        dicts keyed ``[metric][method]`` holding the linkage matrix, the
        ``cophenet`` result ``(correlation, condensed cophenetic distances)``
        and the inconsistency matrix, and ``writeMe`` is a list of text
        fragments forming a report.
    """
    euclidean_only = ('centroid', 'median', 'ward')

    writeMe = []
    coph = {}
    inconsist = {}
    links = {}
    for mtc in metric:
        dists = None  # condensed distances for this metric, computed lazily
        for mtd in method:
            if mtd in euclidean_only and mtc != 'euclidean':
                continue

            # write to file
            writeMe.append('\n\n---------------------------------------------------------------------------------------------------------------------\n' + mtc + ' ' + mtd + '\n---------------------------------------------------------------------------------------------------------------------\n')

            if dists is None:
                # The cophenetic correlation must be measured against the same
                # distances the linkage was built from; comparing against the
                # default Euclidean pdist was wrong for every other metric.
                dists = pdist(linkMe, metric=mtc)

            link = hier.linkage(linkMe, method=mtd, metric=mtc, optimal_ordering=True)
            links.setdefault(mtc, {})[mtd] = link
            coph.setdefault(mtc, {})[mtd] = hier.cophenet(link, dists)
            inconsist.setdefault(mtc, {})[mtd] = hier.inconsistent(link)

            # append diagnostics
            writeMe.append('Inconsistency Matrix\n')
            writeMe.append(np.array2string(inconsist[mtc][mtd]))
            writeMe.append('\n\nCophenetic distance\n')
            writeMe.append(np.array2string(squareform(coph[mtc][mtd][1])))
            writeMe.append('\n\nCophenetic Correlation Coefficient: ')
            writeMe.append(np.array2string(coph[mtc][mtd][0]))

    return links, coph, inconsist, writeMe
def Hierarchy(V, **kwargs):
    """Performs hierarchical clustering on *V*. The function essentially uses
    two scipy functions: ``linkage`` and ``fcluster``. See
    :func:`scipy.cluster.hierarchy.linkage` and
    :func:`scipy.cluster.hierarchy.fcluster` for the explanation of the
    arguments. Only the arguments that differ from those of scipy are listed
    here.

    :arg V: row-normalized eigenvectors for the purpose of clustering.
    :type V: :class:`numpy.ndarray`

    :arg inconsistent_percentile: if the clustering *criterion* for
        :func:`scipy.cluster.hierarchy.fcluster` is ``inconsistent`` and the
        threshold *t* is not given or is **None** (default), then the
        percentile specified by this argument, taken over the inconsistency
        coefficients, is used as the threshold. The coefficients come from
        *R* when it is supplied, otherwise they are computed with *depth*.
    :type inconsistent_percentile: double

    :arg n_clusters: specifies the maximal number of clusters. If this
        argument is given, then the function will automatically set
        *criterion* to ``maxclust`` and *t* equal to *n_clusters*.
    :type n_clusters: int

    :returns: flat cluster labels, one per row of *V*, as a 1-D array.
    """

    from scipy.cluster.hierarchy import linkage, fcluster, inconsistent

    method = kwargs.pop('method', 'single')
    metric = kwargs.pop('metric', 'euclidean')
    Z = linkage(V, method=method, metric=metric)

    criterion = kwargs.pop('criterion', 'inconsistent')
    ip = kwargs.pop('inconsistent_percentile', 99.9)
    depth = kwargs.pop('depth', 2)
    R = kwargs.pop('R', None)
    monocrit = kwargs.pop('monocrit', None)

    # Pop ``t`` exactly once: re-popping it after the None check used to hand
    # an explicit ``t=None`` straight to fcluster.
    t = kwargs.pop('t', None)
    if t is None and criterion == 'inconsistent':
        # Derive the threshold from the same statistics fcluster will use.
        stats = np.asarray(R) if R is not None else inconsistent(Z, depth)
        t = np.percentile(stats[:, 3], ip)

    n_clusters = kwargs.pop('n_clusters', None)
    if n_clusters is not None:
        criterion = 'maxclust'
        t = n_clusters

    labels = fcluster(Z, t, criterion=criterion, depth=depth, R=R, monocrit=monocrit)
    return labels.flatten()
def fclusterdata(X, t, criterion='distance', metric='euclidean', depth=2, method='single', R=None):
    """
    Cluster the observations in *X* and also return the linkage matrix.

    This is adapted from scipy fclusterdata.
    https://github.com/scipy/scipy/blob/v1.0.0/scipy/cluster/hierarchy.py#L1809-L1878

    Returns the tuple ``(T, Z)``: the flat cluster labels from ``fcluster``
    and the linkage matrix they were cut from. Raises ``TypeError`` when *X*
    is not two-dimensional.
    """
    observations = np.asarray(X, order='c', dtype=np.double)
    is_matrix = type(observations) == np.ndarray and observations.ndim == 2
    if not is_matrix:
        print(type(observations), observations.shape)
        raise TypeError('The observation matrix X must be an n by m numpy '
                        'array.')

    condensed = distance.pdist(observations, metric=metric)
    Z = linkage(condensed, method=method)

    # Use the caller's inconsistency matrix when given, else compute one.
    stats = inconsistent(Z, d=depth) if R is None else np.asarray(R, order='c')

    T = fcluster(Z, criterion=criterion, depth=depth, R=stats, t=t)
    return T, Z
def sax_hcluster(linkMe,method=['single','complete','average']):
    """Run hierarchical clustering on a condensed distance vector.

    linkMe is a compressed vector of minimum distances. For every linkage
    method in *method* the linkage matrix, the ``cophenet`` result and the
    inconsistency matrix are stored in dicts keyed by method name, and a
    textual report is accumulated.

    Returns ``(links, coph, inconsist, writeMe)``.
    """
    links, coph, inconsist = {}, {}, {}
    writeMe = []
    for name in method:
        banner = '\n\n---------------------------------------------------------------------------------------------------------------------\n' + name + '\n---------------------------------------------------------------------------------------------------------------------\n'
        writeMe.append(banner)

        tree = hier.linkage(linkMe, method=name, optimal_ordering=True)
        links[name] = tree
        coph[name] = hier.cophenet(tree, linkMe)
        inconsist[name] = hier.inconsistent(tree)

        # append diagnostics
        correlation, coph_dists = coph[name][0], coph[name][1]
        writeMe.extend([
            'Inconsistency Matrix\n',
            np.array2string(inconsist[name]),
            '\n\nCophenetic distance\n',
            np.array2string(squareform(coph_dists)),
            '\n\nCophenetic Correlation Coefficient: ',
            np.array2string(correlation),
        ])
    return links, coph, inconsist, writeMe
def flatcluster(dRow, runLogs, interClusterDistance='complete', plotDendrogram=True,
                cMethod='inconsistent', cValue=2.5):
    """Cut a hierarchical clustering into flat clusters and tag each run log.

    :param dRow: input handed straight to ``linkage`` (presumably a condensed
        distance vector -- confirm against the caller).
    :param runLogs: sequence whose items are indexable; ``item[0]`` must be a
        dict. ``item[0]['Cluster']`` is set to the cluster label (as ``str``)
        of the observation at the same position.
    :param interClusterDistance: linkage method passed to ``linkage``.
    :param plotDendrogram: when true, ``plotdendrogram`` is called on the
        linkage matrix.
    :param cMethod: ``criterion`` argument of ``fcluster``.
    :param cValue: threshold ``t`` argument of ``fcluster``.
    :returns: ``(z, clusters, runLogs)`` -- the linkage matrix, the flat
        cluster labels and the (mutated in place) ``runLogs``.

    Side effects: prints a per-cluster size summary and stores the number of
    clusters in the module-level ``clusterCount``.
    """
    from collections import Counter

    global clusterCount

    z = linkage(dRow, interClusterDistance)
    if plotDendrogram:
        plotdendrogram(z)

    clusters = fcluster(z, cValue, cMethod)
    noClusters = max(clusters)
    print('Total number of clusters:', noClusters)

    # One pass over the labels instead of rescanning them once per cluster.
    sizes = Counter(int(label) for label in clusters)
    for label in range(1, noClusters + 1):
        print("Cluster", str(label), ":", str(sizes[label]))

    clusterCount = noClusters
    print(len(clusters))
    print(len(runLogs))
    for i, log in enumerate(runLogs):
        log[0]['Cluster'] = str(clusters[i])
    return z, clusters, runLogs
def check_inconsistent_tdist(self, depth):
    """Compare ``inconsistent`` at *depth* with the stored reference values.

    Uses the single-linkage matrix of the ``ytdist`` test data set.
    """
    linkage_matrix = hierarchy_test_data.linkage_ytdist_single
    computed = inconsistent(linkage_matrix, depth)
    reference = hierarchy_test_data.inconsistent_ytdist[depth]
    assert_allclose(computed, reference)
# Worked example: build a Ward linkage for eight one-dimensional points and
# inspect its inconsistency matrix.
from scipy.cluster.hierarchy import inconsistent, linkage
from matplotlib import pyplot as plt

# Each observation is a single-feature row vector.
X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
Z = linkage(X, 'ward')
print(Z)
# [[ 5.          6.          0.          2.        ]
#  [ 2.          7.          0.          2.        ]
#  [ 0.          4.          1.          2.        ]
#  [ 1.          8.          1.15470054  3.        ]
#  [ 9.         10.          2.12132034  4.        ]
#  [ 3.         12.          4.11096096  5.        ]
#  [11.         13.         14.07183949  8.        ]]

# The result is not bound to a name; the listing below is the value shown
# when the expression is evaluated interactively.
inconsistent(Z)
# array([[ 0.        ,  0.        ,  1.        ,  0.        ],
#        [ 0.        ,  0.        ,  1.        ,  0.        ],
#        [ 1.        ,  0.        ,  1.        ,  0.        ],
#        [ 0.57735027,  0.81649658,  2.        ,  0.70710678],
#        [ 1.04044011,  1.06123822,  3.        ,  1.01850858],
#        [ 3.11614065,  1.40688837,  2.        ,  0.70710678],
#        [ 6.44583366,  6.76770586,  3.        ,  1.12682288]])
def _get_inconsistent(distances: numpy.ndarray) -> pandas.DataFrame: inconsistent = hierarchy.inconsistent(distances, 10) inc = pandas.DataFrame(inconsistent) inc.columns = ["mean", "std", "count", "coefficient"] return inc
# Exploratory script: link the per-label centroids of ``case1`` and print
# diagnostics for choosing where to cut the dendrogram.
# NOTE(review): ``case1``, ``case1Labels`` and ``uniqueLabels`` are defined
# outside this fragment.
centroids = np.array([case1[np.where(case1Labels == i)[0], :].mean(axis=0) for i in uniqueLabels])

fig = plt.figure()
ax = fig.add_subplot(111)
ncluster = 27
y = pdist(centroids)
# NOTE(review): ``method`` is not used below; the linkage call hard-codes
# 'average'.
method = 'centroid'  # 'average'
z = hierarchy.linkage(y, 'average')
#t = hierarchy.fcluster(27,criterion='maxclust')

## computes the max distance between any cluster and each non-singleton cluster
print('max dists', hierarchy.maxdists(z))

## inconsistency
r = hierarchy.inconsistent(z)
print('r', r)
#print 'max inconsts', hierarchy.maxinconsts(z,r,i)
print('z', z)

# Link heights, and the same heights shifted by one merge (zero padded).
print(z[:, 2])
print(np.hstack([z[1:, 2], [0]]))

# Absolute jump in height between consecutive merges.
levelDiffs = np.abs(z[:, 2] - np.hstack([z[1:, 2], [0]]))
levelDiffMeans = z[:, 2]
print('diffs', levelDiffs * 100)

# Merge indices ordered by decreasing height jump.
diffInds = np.argsort(levelDiffs)
diffInds = diffInds[::-1]
print('a', levelDiffMeans[diffInds[:6]])
print('b', diffInds[:5])

hierarchy.dendrogram(z)
# plt.title('Hierarchical Clustering Dendrogram (truncated)')
# plt.xlabel('sample index or (cluster size)')
# plt.ylabel('distance')
# dendrogram(
#     Z,
#     truncate_mode='lastp',  # show only the last p merged clusters
#     p=12,  # show only the last p merged clusters
#     leaf_rotation=90.,
#     leaf_font_size=12.,
#     show_contracted=True,  # to get a distribution impression in truncated branches
# )
# plt.show()
# c, coph_dists = cophenet(Z, BCD_matrix)
# print "Cophenetic Correlation Coefficient:",c
# return Z


def Elbow(Z, lastp=10):
    """Plot the last merge distances of *Z* and suggest a cluster count.

    The heights of the final *lastp* merges (column 2 of the linkage matrix)
    and their second difference are plotted; the suggested number of clusters
    is where the second difference peaks.

    :param Z: linkage matrix.
    :param lastp: number of final merges to inspect. The second difference
        needs at least three heights; with fewer, ``argmax`` is applied to an
        empty array.
    :returns: the suggested number of clusters, which is also printed.
    """
    last = Z[-lastp:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    plt.plot(idxs, last_rev)

    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]
    plt.plot(idxs[:-2] + 1, acceleration_rev)
    plt.show()

    k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
    print("clusters:", k)
    return k


Elbow(Z, lastp=10)
incons = inconsistent(Z, 5)
fcluster(Z, 6, criterion='maxclust')
# generate two clusters: a with 100 points, b with 50:
np.random.seed(4711)  # for repeatability of this tutorial
a = np.random.multivariate_normal([10, 0], [[3, 1], [1, 4]], size=[100,])
b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[50,])
X = np.concatenate((a, b),)
print(X.shape)  # 150 samples with 2 dimensions
#plt.scatter(X[:,0], X[:,1])
#plt.show()

# generate the linkage matrix
Z = linkage(X, 'ward')
print(Z.shape)

# inconsistency statistics of the last ten merges
depth = 5
incons = inconsistent(Z, depth)
print(incons[-10:])

# elbow method: plot the last ten merge heights and their second difference
last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters
print("clusters:", k)
def _truncate_to_two_decimals(value):
    """Cut ``str(value)`` to the width of its two-decimal rendering and parse it."""
    width = len('%.*f' % (2, value))
    return float(str(value)[:width])


def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate silhouette score based on hierarchical clustering.

    The threshold and criterion are read from ``request.form['threshold']``
    and ``request.form['criterion']``.

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        silhouetteScore: string, containing the result of silhouette score
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, silhouette score
        inconsistentMax: float, upper bound of threshold to calculate silhouette score if using Inconsistent criterion
        maxclustMax: integer, upper bound of threshold to calculate silhouette score if using Maxclust criterion
        distanceMax: float, upper bound of threshold to calculate silhouette score if using Distance criterion
        distanceMin: float, lower bound of threshold to calculate silhouette score if using Distance criterion
        monocritMax: float, upper bound of threshold to calculate silhouette score if using Monocrit criterion
        monocritMin: float, lower bound of threshold to calculate silhouette score if using Monocrit criterion
        threshold: float/integer/string, threshold (t) value that users entered, equals to 'N/A' if users leave the field blank

    When the score cannot be computed (too few files, or a single resulting
    cluster) ``score`` is an explanatory string and every bound, as well as
    ``threshold``, is ``'N/A'``.
    """
    activeFiles = len(labels) - 1
    if activeFiles > 2:
        # since "number of labels should be more than 2 and less than n_samples - 1"
        Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
        # NOTE(review): Y is a square matrix, and scipy's linkage treats 2-D
        # input as observation vectors rather than precomputed distances,
        # while silhouette_score below uses Y as 'precomputed' distances.
        # Confirm this is intended before changing it.
        Z = hierarchy.linkage(Y, method=linkage_method)

        monocrit = None

        # 'maxclust' range
        maxclustMax = len(labels) - 1

        # 'inconsistent' range
        R = hierarchy.inconsistent(Z, 2)
        inconsistentMax = _truncate_to_two_decimals(R[-1][-1])

        # 'distance' range
        d = hierarchy.cophenet(Z)
        distanceMax = _truncate_to_two_decimals(d.max())
        distanceMin = _truncate_to_two_decimals(d.min() + 0.01)

        # 'monocrit' range
        MR = hierarchy.maxRstat(Z, R, 0)
        monocritMax = _truncate_to_two_decimals(MR.max())
        monocritMin = _truncate_to_two_decimals(MR.min() + 0.01)

        # An empty field stays '' (meaning "use the default bound").
        threshold = request.form['threshold']
        if threshold != '':
            threshold = float(threshold)

        # Default to the submitted value so an unrecognised criterion reaches
        # fcluster (which rejects it) instead of failing on an unbound name.
        criterion = request.form['criterion']
        if criterion == 'maxclust':
            if (threshold == '') or (threshold > maxclustMax):
                threshold = len(labels) - 1
            else:
                threshold = round(float(threshold))
        elif criterion == 'distance':
            if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
                threshold = distanceMax
        elif criterion == 'inconsistent':
            if (threshold == '') or (threshold > inconsistentMax):
                threshold = inconsistentMax
        elif criterion == 'monocrit':
            monocrit = MR
            if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
                threshold = monocritMax

        scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

        if len(set(scoreLabel)) <= 1:
            # this means all the files are divided into only 1 or less cluster
            silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
            silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
            score = 'invalid for only 1 cluster'
            inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
        else:
            score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
            score = round(score, constants.ROUND_DIGIT)
            # Unicode escape for "less than or equal"; the former
            # '...'.decode('utf-8') only works on Python 2 byte strings.
            inequality = u'\u2264'
            silhouetteScore = "Silhouette Score: " + str(
                score
            ) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
            silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."
    else:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
def compute(self):
    """Cluster the hole, electron and mixed correlation matrices.

    For each of the three cases a result dict is stored on ``self``
    (``output_h``, ``output_e``, ``output_m``) with the keys ``'COR'``
    (correlation matrix), ``'DIS'`` (copy of the distance matrix),
    ``'dendro'`` (linkage matrix), ``'cophco'`` (cophenetic correlation
    coefficient), ``'incons'`` (inconsistency matrix), and ``'cluster'`` /
    ``'maxd'`` (the two values returned by ``self.find_clustering``).
    """

    def analyse(out, cor):
        """Fill *out* with the clustering results for correlation matrix *cor*."""
        out['COR'] = cor

        # compute distance matrix
        DIS = self.calc_dist_matrix(cor)
        out['DIS'] = numpy.copy(DIS)

        # perform clustering on the condensed form of the distance matrix
        condensed = squareform(DIS, checks=False)
        link = sch.linkage(condensed, method=self.opt_dict['link_scheme'])
        cc, _ = sch.cophenet(link, condensed)
        incons = sch.inconsistent(link)

        out['dendro'] = link
        out['cophco'] = cc
        out['incons'] = incons

        # analyze clusters
        out['cluster'], out['maxd'] = self.find_clustering(link)

    # Each result dict is bound to ``self`` before it is filled, exactly as
    # before, so partially filled results stay visible if a step raises.

    # ------------------- hole
    self.output_h = {}
    analyse(self.output_h,
            self.calc_corr_matrix(self.OmFrag, userows=False, usecols=True))

    # ------------------- electron
    self.output_e = {}
    analyse(self.output_e,
            self.calc_corr_matrix(self.OmFrag, userows=True, usecols=False))

    # ------------------- mixed
    # mixed correlation matrix: element-wise maximum of electron and hole
    self.output_m = {}
    analyse(self.output_m,
            numpy.where(self.output_e['COR'] >= self.output_h['COR'],
                        self.output_e['COR'], self.output_h['COR']))
]) b = np.random.multivariate_normal([0, 20], [[3, 1], [1, 4]], size=[ 50, ]) X = np.concatenate((a, b), ) print X.shape # 150 samples with 2 dimensions #plt.scatter(X[:,0], X[:,1]) #plt.show() # generate the linkage matrix Z = linkage(X, 'ward') print Z.shape depth = 5 incons = inconsistent(Z, depth) print incons[-10:] last = Z[-10:, 2] last_rev = last[::-1] idxs = np.arange(1, len(last) + 1) plt.plot(idxs, last_rev) acceleration = np.diff(last, 2) # 2nd derivative of the distances acceleration_rev = acceleration[::-1] plt.plot(idxs[:-2] + 1, acceleration_rev) plt.show() k = acceleration_rev.argmax( ) + 2 # if idx 0 is the max of this we want 2 clusters print "clusters:", k
def hierarchical_clusters(X, window_size, t, threshold=0.05, method='single',
                          depth=2, criterion='distance', n_jobs=1):
    """
    Cluster M time series into hierarchical clusters using agglomerative
    approach. This function is more or less a convenience wrapper around
    SciPy's scipy.cluster.hierarchy functions, but uses the MPDist algorithm
    to compute distances between each pair of time series.

    Note
    ----
    Memory usage could potentially be high depending on the length of your
    time series and how many distances are computed!

    Parameters
    ----------
    X : array_like
        An M x N matrix where M is the time series and N is the observations
        at a given time.
    window_size : int
        The window size used to compute the MPDist.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the
        threshold to apply when forming flat clusters. For 'maxclust'
        criteria, this would be max number of clusters requested.
    threshold : float, Default 0.05
        The percentile in which the MPDist is taken from. By default it is
        set to 0.05 based on empirical research results from the paper.
        Generally, you should not change this unless you know what you are
        doing! This value must be a float greater than 0 and less than 1.
    method : str, Default single
        The linkage algorithm to use.
        Options: {single, complete, average, weighted}
    depth : int, Default 2
        A non-negative value more than 0 to specify the number of levels
        below a non-singleton cluster to allow.
    criterion : str, Default distance
        Options: {inconsistent, distance, maxclust, monocrit}
        The criterion to use in forming flat clusters.

        ``inconsistent`` :
            If a cluster node and all its descendants have an inconsistent
            value less than or equal to `t`, then all its leaf descendants
            belong to the same flat cluster. When no non-singleton cluster
            meets this criterion, every node is assigned to its own cluster.
        ``distance`` :
            Forms flat clusters so that the original observations in each
            flat cluster have no greater a cophenetic distance than `t`.
            (Default)
        ``maxclust`` :
            Finds a minimum threshold ``r`` so that the cophenetic distance
            between any two original observations in the same flat cluster
            is no more than ``r`` and no more than `t` flat clusters are
            formed.
        ``monocrit`` :
            Forms a flat cluster from a cluster node c with index i when
            ``monocrit[j] <= t``. NOTE(review): this function does not pass
            a ``monocrit`` vector to ``fcluster``; confirm this criterion is
            usable from here.
            For example, to threshold on the maximum mean distance as
            computed in the inconsistency matrix R with a threshold of 0.8
            do::

                MR = maxRstat(Z, R, 3)
                fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
    n_jobs : int, Default 1
        The number of cpu cores used to compute the MPDist.

    Returns
    -------
    clusters : dict
        Clustering statistics, distances and labels.

        >>> {
        >>>     pairwise_distances: MPDist between pairs of time series as
        >>>                         np.ndarray,
        >>>     linkage_matrix: clustering linkage matrix as np.ndarray,
        >>>     inconsistency_statistics: inconsistency stats as np.ndarray,
        >>>     assignments: cluster label associated with input X location as
        >>>                  np.ndarray,
        >>>     cophenet: float the cophenet statistic,
        >>>     cophenet_distances: cophenet distances between pairs of time
        >>>                         series as np.ndarray
        >>>     class: hclusters
        >>> }

    Raises
    ------
    ValueError
        If any argument fails the validation described above.
    """
    # valid SciPy clustering options to work with custom distance metric
    valid_methods = {'single', 'complete', 'average', 'weighted'}
    valid_criterions = {'inconsistent', 'distance', 'monocrit', 'maxclust'}

    method = method.lower()
    criterion = criterion.lower()

    # error handling
    if not core.is_array_like(X):
        raise ValueError('X must be array like!')

    if not isinstance(t, (float, int)):
        raise ValueError('t must be a scalar (int or float)')

    if not isinstance(threshold, float) or threshold <= 0 or threshold >= 1:
        raise ValueError('threshold must be a float greater than 0 and less'
                         ' than 1')

    if not isinstance(depth, int) or depth < 1:
        raise ValueError('depth must be an integer greater than 0')

    # Options are sorted so the error text does not depend on set ordering.
    if method not in valid_methods:
        opts_str = ', '.join(sorted(valid_methods))
        raise ValueError('method may only be one of: ' + opts_str)

    if criterion not in valid_criterions:
        opts_str = ', '.join(sorted(valid_criterions))
        raise ValueError('criterion may only be one of: ' + opts_str)

    Y = pairwise_dist(X, window_size, threshold=threshold, n_jobs=n_jobs)
    Z = linkage(Y, method=method)
    R = inconsistent(Z, d=depth)
    c, coph_dists = cophenet(Z, Y)
    T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)

    return {
        'pairwise_distances': Y,
        'linkage_matrix': Z,
        'inconsistency_statistics': R,
        'assignments': T,
        'cophenet': c,
        'cophenet_distances': coph_dists,
        'class': 'hclusters'
    }
def cluster_students_by_responses(self, cluster_by='correct_incorrect', min_joint_questions=10,
                                  fcluster_criteria=0.8, one_section_questions_only=True,
                                  load_raw_data=False, load_processed_data=True,
                                  processed_data_file_name='events_table_math_processed_first_attempt.csv',
                                  load_similarity_matrix=False, is_plot=True):
    """
    Cluster students hierarchically by the pairwise distance between their responses.

    :param cluster_by: selects mask, distance metric, linkage and normalization:
        'sum_joint_questions' - cluster students by the questions they answered regardless of response
        'correct_incorrect' - cluster by the number of errors divided by the number of joint responses
        (questions answered by both students)
        'percent_joint_errors' - cluster by the percent of joint errors (questions both students answered
        incorrectly) which are joint for both students
        'num_joint_errors' - uses the 'joint_minus_count_drop0' metric (todo - define this and check)
    :param min_joint_questions: minimal number of questions both students answered in order to calculate distance
    :param fcluster_criteria: threshold handed to fcluster as ``t`` (fcluster's default criterion is used)
    :param one_section_questions_only: keep only events whose ``n_sections`` equals 1
    :param load_raw_data: forwarded to EA.load_data
    :param load_processed_data: forwarded to EA.load_data
    :param processed_data_file_name: forwarded to EA.load_data
    :param load_similarity_matrix: read the normalized distance matrix and the masked responses from the
        CSV files written by an earlier run instead of recomputing them
    :param is_plot: save a dendrogram + response heat map to 'heatmap_and_dendrogram_<cluster_by>.png'
    :return: (students_clusters, cluster_responses, students_similarity_matrix); the same three objects
        are also stored on ``self``
    :raises ValueError: if ``cluster_by`` is not one of the modes listed above
    """
    # (mask_type, metric_type, linkage_type, distance_normalization, fillna) for every mode.
    # 'sum_joint_questions' never defined a normalization, which made the later
    # normalize/load steps fail on an unbound name; None (no normalization) is what
    # 'percent_joint_errors' already uses.
    settings_by_mode = {
        'sum_joint_questions': ('bool', 'hamming', 'complete', None, None),
        # todo consider normalized measures\ give higher score to mistakes.
        'correct_incorrect': ('nan_to_zero', 'jaccard_intersection', 'average', 'min_max', 'max_and_std'),
        # todo - define this and check.
        'percent_joint_errors': ('nan_to_zero', 'joint_minus_percent_drop0', 'average', None, 'max'),
        # todo - define this and check.
        'num_joint_errors': ('nan_to_zero', 'joint_minus_count_drop0', 'average', 'min_max', 'max'),
    }
    if cluster_by not in settings_by_mode:
        raise ValueError('cluster_by must be one of %s, got %r'
                         % (sorted(settings_by_mode), cluster_by))
    mask_type, metric_type, linkage_type, distance_normalization, fillna = settings_by_mode[cluster_by]
    data_column = 'is_correct_response'  # which column from the event data to cluster on

    # ---- load events data and meta data
    EA.load_data(load_raw_data=load_raw_data, load_processed_data=load_processed_data,
                 processed_data_file_name=processed_data_file_name)
    data = EA.processed_event_data
    md = EA.meta_data.set_index('sElementID')

    # Metrics that the distance module does not provide are built by data_handle;
    # metric_name keeps the plain string for file names and log output.
    metric_name = metric_type
    if metric_type not in dir(distance):
        metric_type = data_handle.define_distance_metric(distance_name=metric_type,
                                                         min_intersection=min_joint_questions)

    if one_section_questions_only:
        data = data.loc[data.n_sections == 1]

    # One name for both writing and reading the normalized matrix. It used to be
    # written as '..._normalized_min_max.csv' regardless of the normalization, so
    # reloading only worked for the 'min_max' modes.
    normalized_file = 'temp_similarity_matrix_%s_%s_metric_normalized_%s.csv' % (
        mask_type, metric_name, str(distance_normalization))
    responses_file = '%s_students_responses.csv' % mask_type

    if load_similarity_matrix:
        # ---- load similarity matrix according to metric name
        students_similarity_matrix = df.from_csv(normalized_file)
        if students_similarity_matrix.shape[1] == 1:
            # a single column is taken as an already condensed distance vector
            students_similarity_matrix = students_similarity_matrix.T.values[0]
        elif students_similarity_matrix.shape[1] > 1:
            students_similarity_matrix = distance.squareform(students_similarity_matrix.values)
        processed_students_responses = df.from_csv(responses_file)
    else:
        students_responses = data_handle.pivot(data, index_col='student_id', columns_col='question_id',
                                               values_col=data_column, agg_function='first',
                                               convert_to_numeric=True)
        processed_students_responses = data_handle.mask_data(students_responses, type=mask_type)
        print(processed_students_responses)
        student_index = processed_students_responses.index

        students_similarity_matrix = distance.pdist(processed_students_responses, metric=metric_type)
        df(distance.squareform(students_similarity_matrix), index=student_index,
           columns=student_index).to_csv(
            'temp_similarity_matrix_%s_%s_metric_NOT_Normalized.csv' % (mask_type, metric_name))

        # NOTE(review): the original also evaluated `students_similarity_matrix[0]`
        # into a misspelled, never-read variable; the result of normalize_data is
        # used as returned - confirm it is not meant to be unpacked.
        students_similarity_matrix = data_handle.normalize_data(students_similarity_matrix, min_value=0,
                                                                by=distance_normalization, fillna=fillna)
        df(distance.squareform(students_similarity_matrix), index=student_index,
           columns=student_index).to_csv(normalized_file)
        df(processed_students_responses).to_csv(responses_file)

    # ---- hierarchical clustering on the condensed distance vector
    Z = linkage(students_similarity_matrix, method=linkage_type)
    c = cophenet(Z, students_similarity_matrix)
    print('cophenet=%f for linkage==%s and distance==%s' % (c[0], linkage_type, metric_name))

    students_clusters = pd.Series(fcluster(Z, fcluster_criteria))  # 5, criterion='maxclust')
    print(students_clusters.value_counts())

    cluster_responses = processed_students_responses.copy()
    # Assign by position: students_clusters carries a 0..n-1 index, so assigning the
    # Series itself would align on labels against the student index.
    cluster_responses['cluster'] = students_clusters.values
    cluster_responses.sort_values('cluster', inplace=True)

    # show the dendrogram next to the feature matrix to check if it makes sense
    if is_plot:
        cr = cluster_responses.T.apply(pd.to_numeric).drop('cluster')
        plt.subplots(2, 1, sharex=True)
        plt.subplot(211)
        plt.title('clustering by students response %s' % cluster_by)
        plt.ylabel('distance')
        dendrogram(Z, no_labels=True, color_threshold=0.7 * max(Z[:, 2]))
        plt.subplot(212)
        cr.index = md.loc[cr.index]['question_index'].drop_duplicates()
        cr.sort_index(inplace=True)
        plt.pcolor(cr)
        plt.ylabel('question')
        plt.xlabel('student')
        plt.savefig('heatmap_and_dendrogram_%s.png' % cluster_by)
        plt.close()

    self.students_clusters = students_clusters
    self.clustered_students_responses = cluster_responses
    self.students_similarity_matrix = students_similarity_matrix
    return students_clusters, cluster_responses, students_similarity_matrix
def get_clusters_Hierarchy_clustering(x, hier_dict):
    """Build a hierarchical clustering of ``x`` and cut it into flat clusters.

    Recognised ``hier_dict`` keys, with the value used when a key is absent:
      'L_metric'         - distance metric for ``linkage`` ('euclidean')
      'L_method'         - linkage method ('single'); 'centroid', 'median' and
                           'ward' force the metric to 'euclidean' and print a note
      'optimal_ordering' - forwarded to ``linkage`` (False)
      'criterionH'       - ``fcluster`` criterion ('inconsistent')
      'depth'            - depth for the inconsistency statistics (2)
      't'                - threshold; 20 for 'maxclust'/'maxclust_monocrit', else 0.9
      'R'                - precomputed inconsistency matrix (computed when needed)

    Returns the label array produced by ``fcluster``.
    """
    metric = hier_dict.get('L_metric', 'euclidean')
    method = hier_dict.get('L_method', 'single')
    if method in ('centroid', 'median', 'ward') and metric != 'euclidean':
        metric = 'euclidean'
        print('\n')
        print('*************Note:**************')
        print('Method ' + str(method) + ' requires the distance metric to be Euclidean')

    tree = linkage(x, method=method, metric=metric,
                   optimal_ordering=hier_dict.get('optimal_ordering', False))

    criterion = hier_dict.get('criterionH', 'inconsistent')
    depth = hier_dict.get('depth', 2)
    # For the two maxclust criteria the threshold is a maximum number of clusters.
    fallback_threshold = 20 if criterion in ('maxclust_monocrit', 'maxclust') else 0.9
    threshold = hier_dict.get('t', fallback_threshold)

    stats = hier_dict.get('R')
    if criterion in ('inconsistent', 'maxclust_monocrit'):
        if stats is None:
            stats = inconsistent(tree, d=depth)
        else:
            stats = np.asarray(stats, order='c')

    if criterion == 'monocrit':
        if stats is None:
            stats = inconsistent(tree, d=depth)
        # column 3 of the inconsistency matrix is the statistic being thresholded
        return fcluster(tree, criterion='monocrit', t=threshold,
                        monocrit=maxRstat(tree, stats, 3))
    if criterion == 'maxclust_monocrit':
        return fcluster(tree, criterion='maxclust_monocrit', t=threshold,
                        monocrit=maxinconsts(tree, stats))
    return fcluster(tree, criterion=criterion, depth=depth, R=stats, t=threshold)
# Truncated dendrogram of Z drawn by the project helper fancy_dendrogram
# (presumably a wrapper around scipy's dendrogram that annotates merge heights
# and marks a cut at max_d - TODO confirm against the helper).
fancy_dendrogram(
    Z,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=16,
)
plt.show()

from scipy.cluster.hierarchy import inconsistent

# Inconsistency statistics of the last 10 merges, printed for two depths so the
# effect of the depth parameter can be compared.
depth = 5
incons = inconsistent(Z, depth)
print(incons[-10:])

depth = 3
incons = inconsistent(Z, depth)
print(incons[-10:])

# Elbow plot: third column of the last 10 rows of Z, reversed so that the final
# merge comes first.
last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
# The second difference has two entries fewer than `last`, hence idxs[:-2];
# the + 1 shifts the curve so its first point is drawn at x = 2.
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
# Full dendrogram of Z.
# the vertical lines indicate the distance at which the labels were merged
plt.figure(figsize=(25, 10))
# Title spelling fixed ("Hierachical" -> "Hierarchical") to match the truncated plot below.
plt.title('Hierarchical Clustering Dendrogram (full)')
plt.xlabel('sample clusters')
plt.ylabel('distance')
dendrogram(Z, leaf_rotation=90., leaf_font_size=8.,)
plt.show()

# truncate the dendrogram for better visibility
# with only the last p=12 merged clusters,
# and labeled with cluster's inconsistency coefficient
R = inconsistent(Z)
# Question 4: parameters 1 and 2: ids of the merged clusters; parameter 3: distance;
# parameter 4: number of points

# truncate the dendrogram for better visibility
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    Z,
    truncate_mode='lastp',  # to explain
    p=12,  # to explain
    show_leaf_counts=False,  # otherwise numbers in brackets are counts
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,  # to get a distribution impression in truncated branches
)
plt.show()
def inconsistency(self):
    """Return the last 15 rows of the depth-3 inconsistency matrix of ``self.linked``."""
    return inconsistent(self.linked, 3)[-15:]
def _truncate_two_decimals(value):
    """Cut ``value`` off (no rounding) after its second decimal place."""
    # Fixed-point text avoids the scientific notation that str() produces for very
    # small or large floats, which the old str()-slicing approach mis-parsed.
    text = '%.10f' % value
    dot = text.find('.')
    if dot == -1:
        # 'nan' / 'inf': nothing to truncate
        return float(value)
    return float(text[:dot + 3])


def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate silhoutte score based on hierarchical clustering.

    The clustering criterion and the threshold are read from ``request.form``
    ('criterion' and 'threshold').

    Args:
        dendroMatrix: list, occurance of words in different files
        distance_metric: string, style of distance metric in the dendrogram
        linkage_method: string, style of linkage method in the dendrogram
        labels: list, file names

    Returns:
        silhouetteScore: string, containing the result of silhouette score
        silhouetteAnnotation: string, annotation of the silhouette score
        score: float, silhouette score (a message string when it cannot be computed)
        inconsistentMax: float, upper bound of threshold to calculate silhouette score if using Inconsistent criterion
        maxclustMax: integer, upper bound of threshold to calculate silhouette score if using Maxclust criterion
        distanceMax: float, upper bound of threshold to calculate silhouette score if using Distance criterion
        distanceMin: float, lower bound of threshold to calculate silhouette score if using Distance criterion
        monocritMax: float, upper bound of threshold to calculate silhouette score if using Monocrit criterion
        monocritMin: float, lower bound of threshold to calculate silhouette score if using Monocrit criterion
        threshold: float/integer/string, threshold (t) value that users entered, equals to 'N/A' if users leave the field blank

        All bounds and the threshold are 'N/A' when no score can be computed.

    Raises:
        ValueError: if the requested criterion is not one of 'maxclust',
            'distance', 'inconsistent' or 'monocrit'.
    """
    activeFiles = len(labels) - 1
    # since "number of lables should be more than 2 and less than n_samples - 1"
    if activeFiles <= 2:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'
        return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold

    Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
    Z = hierarchy.linkage(Y, method=linkage_method)

    monocrit = None

    # 'maxclust' range
    maxclustMax = len(labels) - 1

    # 'incosistent' range
    R = hierarchy.inconsistent(Z, 2)
    inconsistentMax = _truncate_two_decimals(R[-1][-1])

    # 'distance' range
    d = hierarchy.cophenet(Z)
    distanceMax = _truncate_two_decimals(d.max())
    distanceMin = _truncate_two_decimals(d.min() + 0.01)

    # 'monocrit' range
    MR = hierarchy.maxRstat(Z, R, 0)
    monocritMax = _truncate_two_decimals(MR.max())
    monocritMin = _truncate_two_decimals(MR.min() + 0.01)

    # A blank field stays '' and selects the default threshold of the criterion below.
    threshold = request.form['threshold']
    if threshold != '':
        threshold = float(threshold)

    requested = request.form['criterion']
    if requested == 'maxclust':
        criterion = 'maxclust'
        if (threshold == '') or (threshold > maxclustMax):
            threshold = len(labels) - 1
        else:
            threshold = round(float(threshold))
    elif requested == 'distance':
        criterion = 'distance'
        if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
            threshold = distanceMax
    elif requested == 'inconsistent':
        criterion = 'inconsistent'
        if (threshold == '') or (threshold > inconsistentMax):
            threshold = inconsistentMax
    elif requested == 'monocrit':
        criterion = 'monocrit'
        monocrit = MR
        if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
            threshold = monocritMax
    else:
        # previously fell through and failed later on an unbound `criterion`
        raise ValueError('unsupported clustering criterion: %r' % (requested,))

    scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

    if len(set(scoreLabel)) <= 1:  # this means all the files are divided into only 1 or less cluster
        silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
        silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
        score = 'invalid for only 1 cluster'
        inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
    else:
        score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
        score = round(score, constants.ROUND_DIGIT)
        # unicode escape instead of '≤'.decode('utf-8'), which only exists on Python 2
        inequality = u'\u2264'
        silhouetteScore = "Silhouette Score: " + str(
            score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
        silhouetteAnnotation = "The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar."

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
def check_inconsistent_q_single(self, depth):
    """Compare inconsistent() at ``depth`` on the single-linkage tree of the
    'Q-X' data with the stored 'inconsistent-Q-single-<depth>' reference."""
    expected = eo['inconsistent-Q-single-%d' % depth]
    tree = linkage(eo['Q-X'], 'single', 'euclidean')
    assert_allclose(inconsistent(tree, depth), expected, atol=1e-05)
# Truncated dendrogram of z drawn by the project helper dendrogram_tune
# (presumably a scipy dendrogram wrapper that marks a cut at max_d - TODO confirm).
dendrogram_tune(z,
                truncate_mode='lastp',
                p=12,
                leaf_rotation=90.,
                leaf_font_size=12.,
                show_contracted=True,
                annotate_above=10,
                max_d=20)
plt.show()
# Section note (bare string below, Spanish): automatic dendrogram cut-off method.
# inconsistency_i = (h_i - avg(h_i)) / std(h_i), where i is each cluster merge
# (a horizontal line of the dendrogram).
'''Metodo de corte automatico del dendrograma inconsistency:i = (h_i-avg(h_i))/std(h_i) donde i es cada union de clusters (linea horizontal) '''
depth = 5  # number of levels below each merge used to compute the inconsistency
incons = inconsistent(z, depth)
incons[-10:]  # bare expression: only displays something in an interactive session
# This does not work well: the heights are not normally distributed, so the result
# depends heavily on the chosen depth.
# Most likely the merge under examination is seen as an outlier relative to the
# earlier ones, because each successive merge happens at a larger distance.
# Section note (bare string below, Spanish): the elbow method.
'''El metodo del codo'''
last = z[-10:, 2]
last_rev = last[:: -1]  # reversed: the -1 step makes the last element come first
acc = np.diff(
    last, 2
)  # second-order difference, i.e. the difference of consecutive differences
acc_rev = acc[::-1]
idx = np.arange(1, len(last) + 1)
plt.plot(idx, last_rev)
def check_inconsistent_tdist(self, method, depth, atol):
    """Compare inconsistent() at ``depth`` on the ``method`` linkage of the
    tdist distances with the stored reference, within ``atol``."""
    expected = eo['inconsistent-%s-tdist-depth-%d' % (method, depth)]
    tree = linkage(squareform(_tdist), method)
    assert_allclose(inconsistent(tree, depth), expected, atol=atol)
def hierarchical_clustering(self, method='average', dendrogramm_max_y=2000,
                            sample_name=None, save=False, clustering_dir="clustering",
                            dendrogramm_color_threshold=1000, draw_dendrogramm=True,
                            write_inconsistent=True, write_correlation=True):
    """Cluster the record positions of every region hierarchically.

    IMPORTANT! Use only for one-sample vcf.

    For each region returned by ``self._split_regions()`` the positions
    (``record.pos``) are clustered with ``scipy.cluster.hierarchy.linkage``.

    :param method: linkage method used to calculate distance between clusters:
        'complete' - Farthest Point Algorithm
        'single' - Nearest Point Algorithm
        'average' - UPGMA algorithm, distance between clusters is calculated as
                    average from pairwise distances between elements of clusters
        'weighted' - WPGMA algorithm
        'centroid' - UPGMC algorithm
        'median' - WPGMC algorithm
        'ward' - incremental algorithm
    :param dendrogramm_max_y: upper y limit of the dendrogram plots
    :param sample_name: name written to correlation.t; falls back to ``self.samples[0]``
    :param save: store the linkage matrices on ``self.linkage_dict``
    :param clustering_dir: output directory, created when anything is written
    :param dendrogramm_color_threshold: ``color_threshold`` passed to ``dendrogram``
    :param draw_dendrogramm: save ``clustering_<region>.svg`` for every region
    :param write_inconsistent: save ``inconsistent_coefficient_<region>.t`` for every region
    :param write_correlation: write the cophenetic correlation of every region to ``correlation.t``
    :return: (region_dict, linkage_dict)

    NOTE(review): a region with fewer than two records has no pairwise distance;
    presumably ``linkage`` rejects it - confirm such regions cannot occur.
    """
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    region_dict = self._split_regions()
    positions_dict = OrderedDict()
    correlation_dict = OrderedDict()
    linkage_dict = OrderedDict()
    inconsistent_dict = OrderedDict()

    if draw_dendrogramm or write_correlation or write_inconsistent:
        # Create the directory in-process; the former `mkdir -p` shell call broke on
        # paths with spaces or shell metacharacters and on systems without mkdir.
        if not os.path.isdir(clustering_dir):
            os.makedirs(clustering_dir)

    for region in region_dict:
        # one observation per record, a single feature: its position
        positions_dict[region] = np.array([[record.pos] for record in region_dict[region]])
        distance_matrix = pdist(positions_dict[region])
        linkage_dict[region] = linkage(distance_matrix, method=method)

        if draw_dendrogramm:
            plt.figure(1, dpi=150, figsize=(50, 20))
            dendrogram(linkage_dict[region],
                       color_threshold=dendrogramm_color_threshold,
                       leaf_font_size=4,
                       distance_sort=True)
            plt.ylim(ymax=dendrogramm_max_y)
            # fixed reference lines at distances 500 and 1000
            plt.axhline(y=500, color="purple")
            plt.axhline(y=1000, color="black")
            plt.savefig("%s/clustering_%s.svg" % (clustering_dir, region))
            plt.close()

        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cophenet.html#scipy.cluster.hierarchy.cophenet
        # calculates cophenetic correlation coefficient to estimate accuracy of clustering
        correlation_dict[region] = cophenet(linkage_dict[region], distance_matrix)[0]

        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.inconsistent.html#scipy.cluster.hierarchy.inconsistent
        # calculates inconsistent coeff
        inconsistent_dict[region] = inconsistent(linkage_dict[region])
        if write_inconsistent:
            np.savetxt("%s/inconsistent_coefficient_%s.t" % (clustering_dir, region),
                       inconsistent_dict[region])

    if write_correlation:
        sample = sample_name
        if not sample:
            sample = self.samples[0]
        with open("%s/correlation.t" % clustering_dir, "w") as cor_fd:
            cor_fd.write("sample\t%s\n" % ("\t".join(list(region_dict.keys()))))
            cor_fd.write("%s\t%s\n" % (sample,
                                       "\t".join([str(correlation_dict[region]) for region in region_dict])))

    if save:
        self.linkage_dict = linkage_dict

    return region_dict, linkage_dict
basis of inconsistency coefficient. This measures the ratio between the height of a link and the average height of its direct children links. A group with a low inconsistency is a group where the parent and children link heights are similar. It can also be said that this group reflects real similarity between children. The depth of the inconsistency cuttoff tells us how many levels for which the cutoff applies for each group. """ # Then compute the linkages (average represents the MRP dataset best) Z = linkage(dist_vec, method='average') # Compute cophenetic correlation distance between Z and flat dist_mat [c, d] = cophenet(Z, Y=dist_vec) # Compute the inconsistency matrix for non-singleton cluster (d=2) R = inconsistent(Z) # Now compute the maximum inconsistency coefficient per Cluster MI = maxinconsts(Z, R) # Cluster the events based on inconsistency threshold of 1.2 indices = fcluster(Z, t=1.0, criterion='inconsistent') # Visualize the distribution of correlation values samp_inds = numpy.random.random_integers(0, len(dist_vec), 10000) samp_corrs = [] for ind in samp_inds: samp_corrs.append(dist_vec[ind]) # Plot the dendrogram...if it's not way too huge dendrogram(Z, color_threshold=1 - corr_thresh, distance_sort='ascending') plt.show()
def _cluster(df):
    """Assign every site (row of ``df``) to a cluster based on DTW distances.

    ``df`` needs the columns 'Site' and 'Flow'; the remaining columns are the
    series that are compared (missing values are filled with 0). The sites are
    clustered with average linkage and each site receives the bin of the mean
    link distance of the merge in which it first appears. Merges whose mean
    distance lies outside the bins derived from the first-level merges are
    treated as outliers and labelled 'O1', 'O2', ...

    Returns a copy of ``df`` with an added 'Cluster' column.
    Exits the process with status 99 when the distance calculation fails.
    """
    flow_df = df.copy()
    sites_len = len(df['Site'])
    series = df.fillna(0).drop(columns=["Site", "Flow"]).to_numpy()

    try:
        distance = dtw.distance_matrix_fast(series, compact=True)
    except Exception as e:
        print('Distance calculation failed, shoudnt continue')
        # same exit status as before, without relying on the site-provided exit()
        raise SystemExit(99) from e

    # Hierarchical clustering - linkage matrix Z.
    # compact=True is requested above, so `distance` is passed on as the condensed
    # distance vector that linkage expects. It used to be expanded with squareform
    # first; linkage reads a 2-D array as observation vectors and would then
    # cluster the rows of the distance matrix instead of the DTW distances.
    Z = linkage(np.asarray(distance, dtype=float), "average")

    # Inconsistent matrix - has mean-distance, standard dev's for each linkage
    IN = inconsistent(Z)

    # Creating a temporary data-frame to extract clusters from linkage and inconsistent matrices
    cols = ['pt1', 'pt2', 'dist', 'tot_pts', 'mean_dist', 'SD_dist', 'cls_level', 'co_eff']
    temp_df = pd.DataFrame(np.hstack([Z, IN]), columns=cols)

    # get the bin's - only using the range from the first level clustering distances
    # Further clustering level will increase linkages' mean distance
    # points that fall above first level mean-distances are deemed as outliers
    cls_level_1_distances = temp_df.loc[temp_df['cls_level'] == 1, 'mean_dist']
    q1, q3 = np.percentile(cls_level_1_distances, [25, 75])
    IQR = q3 - q1

    # Handy formula to calculate bin width - to make sure bin counts are minimal but represents the spread well
    # NOTE(review): bw is 0 when IQR is 0, which presumably makes np.arange fail -
    # decide on a fallback if identical first-level distances can occur.
    bw = 2 * IQR / int(round(sites_len ** (1. / 3))) * BIN_FACTOR
    bins_ = np.arange(min(cls_level_1_distances) - 0.1, max(cls_level_1_distances) + bw, bw)

    # hierarchical clustering groups data till it reaches a single cluster that has all data points
    # we don't need rows from linkage matrix, which represents higher level clustering,
    # keeping link rows only the leaf_nodes (i.e single site data point)
    # .copy() so the column assignments below act on an independent frame
    is_leaf_link = (temp_df['pt1'] < sites_len) | (temp_df['pt2'] < sites_len)
    temp_df = temp_df[is_leaf_link].copy()

    # apply the bins
    temp_df['bins'] = pd.cut(temp_df['mean_dist'], bins_).astype('str')

    # Map digits to intervals , for readability
    map_dict = {str(value): counter
                for counter, value in enumerate(temp_df['bins'].unique())
                if value != 'nan'}
    temp_df['Cluster'] = temp_df['bins'].map(map_dict)

    # NaNs are the outliers , treat them as singleton cluster, giving name to each NAN
    outliers = temp_df['Cluster'].isna()
    if outliers.any():
        # string labels need an object column; writing them into a float column is deprecated
        temp_df['Cluster'] = temp_df['Cluster'].astype(object)
        temp_df.loc[outliers, 'Cluster'] = ['O' + str(i) for i in range(1, outliers.sum() + 1)]

    # Combine linkage matrix columns - to create a single column view of, site vs cluster mapping
    df1 = temp_df.loc[temp_df['pt1'] < sites_len, ['pt1', 'Cluster']].rename(columns={'pt1': 'Site'}).copy()
    df2 = temp_df.loc[temp_df['pt2'] < sites_len, ['pt2', 'Cluster']].rename(columns={'pt2': 'Site'}).copy()
    temp_df = pd.concat([df1, df2]).sort_values(by='Site').reset_index(drop=True)

    # Row i of temp_df now belongs to observation i, i.e. to the i-th row of the
    # input. Assign by position so an input frame whose index is not 0..n-1 still
    # gets the right labels (label-based alignment would yield NaN or mismatches).
    flow_df['Cluster'] = temp_df['Cluster'].to_numpy()

    return flow_df