def time_subcluster(self, locs):
    """Estimate a pairwise travel-cost matrix for ``locs`` from a clustered sample.

    The points are grouped into 50 Ward clusters, a cluster-to-cluster matrix
    is obtained from ``self.mapzen_matrix`` for the cluster means, and every
    point-to-point haversine distance is scaled by the (mean-normalized) ratio
    between that matrix and the haversine distance of the corresponding
    cluster means.

    Parameters:
        locs: 2-D array of points, one row per point. The cluster means are
            sent to Mapzen with column 1 as 'lat' and column 0 as 'lon'.

    Returns:
        Square array with one row/column per point in ``locs``.
    """
    # Getting subclusters at Mapzen's limit
    n_clusters = 50
    cluster_linkage = linkage(locs, method='ward')
    clusters = fcluster(cluster_linkage, n_clusters, criterion='maxclust')
    # NOTE(review): labels that fcluster does not produce (e.g. fewer points
    # than clusters) select no rows, so their mean is NaN -- confirm callers
    # always pass enough points.
    cluster_means = np.array([
        np.mean(locs[np.where(clusters == i)], axis=0)
        for i in range(1, n_clusters + 1)
    ])
    mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
    mapzen_matrix = self.mapzen_matrix(mapzen_locs)

    # Cluster labels used for mapping back together
    # Subtracting one to use 0 index
    cl = clusters - 1

    # Get a matching distance matrix of lat/lon distance, get ratios
    # NOTE(review): rows are passed to haversine as stored, i.e. (lon, lat)
    # given the mapping above -- confirm that matches haversine's argument
    # order.
    cluster_km_dist = squareform(pdist(cluster_means, haversine))
    # 0/0 on the diagonal becomes 0 through nan_to_num.
    dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix, cluster_km_dist))
    # Divide items by mean to normalize a bit
    dist_ratio_matrix = np.nan_to_num(
        np.divide(dist_ratio_matrix, dist_ratio_matrix.mean()))

    locs_km_dist = squareform(pdist(locs, haversine))

    # Scale entry (i, j) by the ratio of the clusters that i and j belong to.
    return locs_km_dist * dist_ratio_matrix[np.ix_(cl, cl)]
def _train(self, trainset):
    """Train a tree classifier derived from the flat classifier's confusions.

    The wrapped classifier is cross-validated on ``trainset``; the resulting
    confusion matrix is turned into a distance matrix, clustered, and the
    linkage tree is used to build and train ``self._tree_clf``.
    """
    self._dataset = trainset
    self.ulabels = trainset.uniquelabels

    # Do cross-validation for normal classifier
    transfer_error = TransferError(self._clf)
    self.cvterr = CrossValidatedTransferError(
        transfer_error, self._splitter, enable_states=["confusion"])
    self.cvterr(self._dataset)

    # From the confusion matrix, calculate linkage and tree-structure.
    confusion = self.cvterr.confusion.matrix
    # Kind of inversion: high confusion -> similar -> small distance.
    inverted = confusion.max() - confusion
    # Distance must be symmetric (property of a norm).
    distance = (inverted + inverted.T) / 2
    # Distance to self must be zero -> clear the diagonal.
    distance -= np.diag(np.diag(distance))

    # Calculate linkage matrix
    condensed = hcluster.squareform(distance)
    self.linkage = hcluster.linkage(condensed)

    # Build tree and according TreeClassifier
    self.tree = hcluster.to_tree(self.linkage)
    built = self.build_tree_classifier_from_linkage_tree(self.tree)
    self._tree_clf = built[0]
    self._tree_clf.train(trainset)
def DBSCAN(Dataset, Epsilon, MinumumPoints, DistanceMethod='euclidean'):
    """Label the rows of ``Dataset`` with DBSCAN cluster numbers.

    Parameters:
        Dataset: m x n matrix, m is number of items and n is the dimension
            of the data.
        Epsilon: neighborhood radius; neighbors are points whose distance is
            strictly below it.
        MinumumPoints: minimum neighborhood size for a point to seed a cluster.
        DistanceMethod: metric name passed to ``hcluster.pdist``.

    Returns:
        Array of length m holding the cluster number written for each point;
        entries never written stay 0. Cluster numbers start at 1.
    """
    m, n = Dataset.shape
    Visited = numpy.zeros(m, 'int')
    # Point type codes: -1 noise/outlier, 0 border, 1 core.
    Type = numpy.zeros(m)
    ClustersList = []
    Cluster = []
    PointClusterNumber = numpy.zeros(m)
    PointClusterNumberIndex = 1
    PointNeighbors = []
    DistanceMatrix = hcluster.squareform(
        hcluster.pdist(Dataset, DistanceMethod))

    for point in xrange(m):
        if Visited[point] != 0:
            continue
        Visited[point] = 1
        PointNeighbors = numpy.where(DistanceMatrix[point] < Epsilon)[0]
        if len(PointNeighbors) < MinumumPoints:
            Type[point] = -1
            continue

        # Reuse the same list object for every cluster: empty it in place.
        del Cluster[:]
        Cluster.append(point)
        PointClusterNumber[point] = PointClusterNumberIndex
        PointNeighbors = set2List(PointNeighbors)
        # presumably grows the cluster and updates Visited /
        # PointClusterNumber in place -- verify against ExpandClsuter.
        ExpandClsuter(Dataset[point], PointNeighbors, Cluster, MinumumPoints,
                      Epsilon, Visited, DistanceMatrix, PointClusterNumber,
                      PointClusterNumberIndex)
        Cluster.append(PointNeighbors[:])
        ClustersList.append(Cluster[:])
        PointClusterNumberIndex = PointClusterNumberIndex + 1

    return PointClusterNumber
def optics(x, k, distMethod='euclidean'):
    """Compute an OPTICS-style ordering with reachability and core distances.

    Parameters:
        x: array with one point per row; a 1-D array is treated as a single
            column of one-dimensional points.
        k: position in each point's ascending list of distances (position 0
            is the smallest entry of its distance row) that is taken as the
            point's core distance. Must be smaller than the number of points.
        distMethod: metric name passed to ``H.pdist``.

    Returns:
        (RD, CD, order): the reachability distance of every point, the core
        distance of every point, and the list of point indices in the order
        they were visited.

    Raises:
        Whatever ``H.pdist`` / ``H.squareform`` raise; the historical message
        is printed first.
        IndexError: if ``k`` is not smaller than the number of points, or
            ``x`` holds no points.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    m = x.shape[0]

    try:
        D = H.squareform(H.pdist(x, distMethod))
    except Exception:
        # Without a distance matrix nothing below can work, so report and
        # propagate instead of failing later on an undefined name.
        print("squareform or pdist error")
        raise

    # Core distance: k-th entry of every row once sorted ascending.
    CD = np.sort(D, axis=1)[:, k].copy()
    # 1E10 stands in for "not reached yet".
    RD = np.ones(m) * 1E10

    order = []
    seeds = np.arange(m, dtype=int)
    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seeds = seeds[seeds != ob]
        order.append(ob)

        # Reachability from ob: the larger of ob's core distance and the
        # actual distance to each remaining seed.
        mm = np.maximum(CD[ob], D[ob][seeds])
        ii = np.where(RD[seeds] > mm)[0]
        RD[seeds[ii]] = mm[ii]
        # Continue with the seed that is currently easiest to reach.
        ind = np.argmin(RD[seeds])

    order.append(seeds[0])
    # we set this point to 0 as it does not get overwritten
    RD[0] = 0
    return RD, CD, order
def pdist(self, X):
    """Yield the square pairwise-distance matrix of ``X``.

    Distances use ``self.metric``. When ``self.plot`` is true the matrix is
    shown with pylab before it is yielded. This is a generator that yields
    exactly one item.
    """
    import hcluster
    import pylab

    condensed = hcluster.pdist(array(X), metric=self.metric)
    Y = hcluster.squareform(condensed)
    if self.plot:
        pylab.imshow(Y)
        pylab.show()
    yield Y
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them in to a
    list records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). The hierarchical
                 clustering is cut at distance ``1 - threshold``, so raising
                 the number merges fewer records (favoring precision) and
                 lowering it merges more (favoring recall).
    max_components -- passed through to ``connected_components``.

    Returns the values of a dict: one ``(record ids, scores)`` pair of
    tuples per cluster. Sub-clusters holding a single record of a larger
    component are not reported.
    '''
    distance_cutoff = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            (i_to_id, condensed_distances) = condensedDistance(sub_graph)

            # Expand to square form *before* clustering: with
            # preserve_input=False fastcluster uses the condensed array as
            # scratch space, so its contents are unusable afterwards.
            distances = hcluster.squareform(condensed_distances)

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=False)
            partition = hcluster.fcluster(linkage,
                                          distance_cutoff,
                                          criterion='distance')

            clusters = {}
            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            # A separate loop name keeps the running cluster_id offset intact.
            for sub_id, items in clusters.items():
                if len(items) > 1:
                    scores = confidences(items, distances)
                    clustering[sub_id] = (
                        tuple(i_to_id[item] for item in items),
                        tuple(scores))

            # Skip past every id this component may have used.
            cluster_id += max(partition) + 1
        else:
            # A component with a single pair becomes one cluster; its score
            # is repeated twice in the scores tuple.
            ids, score = sub_graph[0]
            clustering[cluster_id] = (tuple(ids), tuple([score] * 2))
            cluster_id += 1

    return clustering.values()
def MVU_slack(datafile, dim = 3):
    """Compute an MVU embedding (slack formulation) of pickled points.

    Parameters:
        datafile: path to a pickle file whose first object is the matrix of
            points, one row per point.
        dim: number of embedding coordinates to return.

    Returns:
        (results, pts, res): the list of ``dim`` embedding coordinate
        vectors, the mean-centered points, and the neighborhood graph
        returned by ``cluster_graph`` (for plotting).

    Side effects:
        Writes ``../ds/sdp.dat``, runs the external ``csdp`` solver and reads
        ``../ds/sdp.sol``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(datafile, 'rb') as fp:
        pts = pickle.load(fp)

    size = len(pts)

    # mean center coordinates
    pts = pts - np.mean(pts, axis=0)

    # TODO: move graph cluster algorithm to own file - write in C?
    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts, 'euclidean'))
    res = cluster_graph(Y, fnc = 'k', size = 8)

    # indices of nearest neighbors
    x, y = np.nonzero(res & (Y != 0))

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # Keep each undirected neighbor pair once (i <= j).
    indx = [(i, j) for (i, j) in zip(x, y) if i <= j]

    m = len(indx) + 1
    nblocks = 2
    # Leading 0.0, then one squared neighbor distance per kept pair.
    c = [0.0] + [Y[i, j] ** 2 for (i, j) in indx]

    write_spda_file_slack("../ds/sdp.dat", m, nblocks, size, c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    y, Z, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u, s, v = la.svd(X)
    results = [np.sqrt(s[i]) * u[:, i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def optics(x, k, distMethod = 'euclidean'):
    """Prepare the pairwise distance matrix for an OPTICS run.

    Only the set-up stage is present in this definition: it reads the shape
    of ``x`` and tries to build the square distance matrix with
    ``H.pdist`` / ``H.squareform``. Nothing is returned.

    Parameters:
        x: array of points (1-D or 2-D).
        k: not used in the visible part of the function.
        distMethod: metric name passed to ``H.pdist``.
    """
    if len(x.shape) > 1:
        m, n = x.shape
    else:
        m = x.shape[0]
        n = 1

    try:
        D = H.squareform(H.pdist(x, distMethod))
        distOK = True
    except Exception as ex:
        # Best effort: report the failure and record it in distOK.
        print(ex)
        print("squareform or pdist error")
        distOK = False
def MVU_slack(datafile, dim=3):
    """Compute an MVU embedding (slack formulation) of pickled points.

    Parameters:
        datafile: path to a pickle file whose first object is the matrix of
            points, one row per point.
        dim: number of embedding coordinates to return.

    Returns:
        (results, pts, res): the list of ``dim`` embedding coordinate
        vectors, the mean-centered points, and the neighborhood graph
        returned by ``cluster_graph`` (for plotting).

    Side effects:
        Writes ``../ds/sdp.dat``, runs the external ``csdp`` solver and reads
        ``../ds/sdp.sol``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(datafile, 'rb') as fp:
        pts = pickle.load(fp)

    size = len(pts)

    # mean center coordinates
    pts = pts - np.mean(pts, axis=0)

    # TODO: move graph cluster algorithm to own file - write in C?
    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts, 'euclidean'))
    res = cluster_graph(Y, fnc='k', size=8)

    # indices of nearest neighbors
    x, y = np.nonzero(res & (Y != 0))

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    # Keep each undirected neighbor pair once (i <= j).
    indx = [(i, j) for (i, j) in zip(x, y) if i <= j]

    m = len(indx) + 1
    nblocks = 2
    # Leading 0.0, then one squared neighbor distance per kept pair.
    c = [0.0] + [Y[i, j] ** 2 for (i, j) in indx]

    write_spda_file_slack("../ds/sdp.dat", m, nblocks, size, c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")

    y, Z, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u, s, v = la.svd(X)
    results = [np.sqrt(s[i]) * u[:, i] for i in range(dim)]

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def _train(self, dataset):
    """Train a tree classifier derived from the flat classifier's confusions.

    The wrapped classifier is cross-validated on ``dataset``; the confusion
    matrix is symmetrized, inverted into a distance matrix and clustered,
    and the linkage tree is used to build and train ``self._tree_clf``.
    """
    self._dataset = dataset
    self.ulabels = self._dataset.uniquelabels

    # Do cross-validation for normal classifier
    transfer_error = TransferError(self._clf)
    self.cvterr = CrossValidatedTransferError(
        transfer_error, self._splitter, enable_states=["confusion"])
    self.cvterr(self._dataset)

    # From the confusion matrix, calculate linkage and tree-structure.
    confusion = self.cvterr.confusion.matrix
    # Distance must be symmetric (property of a norm).
    symmetric = (confusion + confusion.T) / 2
    # Kind of inversion: high confusion -> similar -> small distance.
    distance = symmetric.max() - symmetric
    # Distance to self must be zero -> clear the diagonal.
    distance -= np.diag(np.diag(distance))

    # Calculate linkage matrix
    condensed = hcluster.squareform(distance)
    self.linkage = hcluster.linkage(condensed)

    # Build tree and according TreeClassifier
    self.tree = hcluster.to_tree(self.linkage)
    built = self.build_tree_classifier_from_linkage_tree(self.tree)
    self._tree_clf = built[0]
    self._tree_clf.train(self._dataset)
def cluster(self):
    """Cluster strokes.

    Loads the previously calculated distance matrix from ``self.DTW_DATA``,
    clusters it hierarchically with ``self.CLUSTERING_METHOD``, and pickles
    the resulting cluster dictionary to ``self.CLUSTER_DATA``, creating
    ``self.CLUSTER_ROOT`` first if it does not exist.
    """
    matrix = numpy.load(self.DTW_DATA)

    Y = hcluster.squareform(matrix)
    Z = hcluster.linkage(Y, method=self.CLUSTERING_METHOD)
    # Flat clusters are cut at 1.15 using fcluster's default criterion.
    T = hcluster.fcluster(Z, 1.15)

    clusters = self.get_cluster_dict_from_array(T)

    if self.verbose:
        self.print_clusters(clusters)

    if not os.path.exists(self.CLUSTER_ROOT):
        os.makedirs(self.CLUSTER_ROOT)

    # Pickle data is binary; the context manager flushes and closes the file.
    with open(self.CLUSTER_DATA, "wb") as out:
        pickle.dump(clusters, out)
# Demo of hcluster.squareform / hcluster.pdist on small hand-written inputs.
print("\n ___________________________________________\n")

import hcluster as H

# A condensed vector of 10 distances expands to a 5x5 square matrix.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(H.squareform(x))

print("\n ___________________________________________\n")

# Condensed pairwise distances between four 1-D points.
print(H.pdist([[1], [3], [5], [2]]))

print("\n ___________________________________________\n")

# The same distances expanded to square form.
print(H.squareform(H.pdist([[1], [3], [5], [2]])))

print("\n ___________________________________________\n")

# Four 1-D points versus two 2-D points built from the same numbers.
print([[1], [3], [5], [2]])
print([[1, 3], [5, 2]])
print(H.pdist([[1, 3], [5, 2]]))

print("\n___________________________________________\n")

# Three 4-D points: square distance matrix.
print(H.squareform(H.pdist([[1, 2, 3, 4], [3, 4, 5, 6], [5, 6, 7, 8]])))
y_min, y_max = X[:,1].min()-1, X[:,1].max()+1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) axes.set_cmap(pylab.cm.Paired) pylab.axes(axes) pylab.contourf(xx, yy, Z) pylab.axis('off') # Plot also the training points pylab.scatter(X[:,0], X[:,1], c=Y) plotWindow = plot(show_toolbar=True) fig, axes = plotWindow.get_figure_and_axes(0) plot_decision_surface(axes, svm, data, labels) plotWindow.show() cdm = np.array([d_matrix, d_matrix.T]) cdm = np.max(cdm, axis=0) cdm = cdm - 0.5 cdm[np.identity(cdm.shape[0], dtype=bool)] = 0.0 plotWindow = plot(show_toolbar=True) fig, axes = plotWindow.get_figure_and_axes(0) import hcluster cdm = hcluster.squareform(cdm) Z = hcluster.linkage(cdm, 'average') print_engine.draw_dendrogram(axes, Z, labels=names) plotWindow.show()
def optics_alg(x, k, distMethod='cosine'):  # was euclidean
    """Compute an OPTICS-style ordering and print the time it took.

    Parameters:
        x: array with one point per row; a 1-D array is treated as a single
            column of one-dimensional points.
        k: position in each point's ascending list of distances (position 0
            is the smallest entry of its distance row) that is taken as the
            point's core distance. Must be smaller than the number of points.
        distMethod: metric name passed to scipy's ``pdist``.

    Returns:
        (RD, CD, order): the reachability distance of every point, the core
        distance of every point, and the list of point indices in the order
        they were visited.

    Raises:
        Whatever ``pdist`` / ``squareform`` raise; the historical message is
        printed first.
        IndexError: if ``k`` is not smaller than the number of points, or
            ``x`` holds no points.
    """
    # default_timer exists on every Python version; time.clock does not.
    from timeit import default_timer
    tic = default_timer()

    import numpy as N
    from scipy.spatial.distance import pdist, squareform

    x = N.asarray(x)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    m = x.shape[0]

    try:
        D = squareform(pdist(x, distMethod))
    except Exception:
        # Without a distance matrix nothing below can work, so report and
        # propagate instead of failing later on an undefined name.
        print("squareform or pdist error")
        raise

    # Core distance: k-th entry of every row once sorted ascending.
    CD = N.sort(D, axis=1)[:, k].copy()
    # 1E10 stands in for "not reached yet".
    RD = N.ones(m) * 1E10

    order = []
    seeds = N.arange(m, dtype=int)
    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seeds = seeds[seeds != ob]
        order.append(ob)

        # Reachability from ob: the larger of ob's core distance and the
        # actual distance to each remaining seed.
        mm = N.maximum(CD[ob], D[ob][seeds])
        ii = N.where(RD[seeds] > mm)[0]
        RD[seeds[ii]] = mm[ii]
        # Continue with the seed that is currently easiest to reach.
        ind = N.argmin(RD[seeds])

    res = default_timer() - tic
    print("Compute time is:  %s" % (res,))

    order.append(seeds[0])
    # we set this point to 0 as it does not get overwritten
    RD[0] = 0
    return RD, CD, order
def dendrogram(self):
    """Store the complete-linkage clustering of ``self.matrix`` in ``self.linkage``."""
    # hcluster.linkage is given the output of squareform, not the matrix itself.
    condensed = hcluster.squareform(self.matrix)
    self.linkage = hcluster.linkage(condensed, method="complete")
def cluster(M, method='complete'):
    """Return the hierarchical linkage of distance matrix ``M``.

    ``M`` is converted with ``hcluster.squareform`` and clustered with the
    given linkage ``method``.
    """
    condensed = hcluster.squareform(M)
    return hcluster.linkage(condensed, method=method)
[0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]]) y = pdist(data, metric=metric) Z = linkage(y, method=method, metric=metric) dendrogram(Z) Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z] # cleaning leaves = list(leaves_list(Z)) count = len(leaves) root = len(Z)+count-1 X = squareform(y) assert len(X) == count from utils import memoise # bar-joseph optimal ordering ################################################ from barjoseph import optimal leaves = optimal(root, **{ "S": lambda i, j: X[i][j], "left": lambda i: None if i < count else Z[i-count][0], "right": lambda i: None if i < count else Z[i-count][1], "is_leaf": lambda i: i < count,
def maxclust_dists(dists, k, method = 'complete'):
    """Cut a hierarchical clustering of ``dists`` into at most ``k`` clusters.

    Parameters:
        dists: distance matrix, converted with ``hcluster.squareform``.
        k: threshold passed to ``fcluster`` with the 'maxclust' criterion.
        method: linkage method name.

    Returns:
        The flat cluster assignment produced by ``hcluster.fcluster``.
    """
    condensed = hcluster.squareform(dists)
    tree = hcluster.linkage(condensed, method = method)
    return hcluster.fcluster(tree, t = k, criterion = 'maxclust')
def dbscan(x, k, Eps=None, distMethod='euclidean'):
    '''
    Calculate the density based clustering of an array

    Parameters:
        x: array with one point per row.
        k: a point with at least ``k + 1`` points (itself included) within
            ``Eps`` is treated as a core point.
        Eps: neighborhood radius; computed with ``epsilon(x, k)`` when None.
        distMethod: metric name passed to ``H.pdist``.

    Returns:
        (cClass, tType, Eps, True) on success, where ``cClass`` holds the
        cluster number of every point (-1 for unassigned points) and
        ``tType`` the point type (1 core, 0 border, -1 outlier).
        (None, None, None, False) if anything fails; the error is printed.
    '''
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x, k)

        # squareform makes a large m x m matrix; if the input is too large
        # not enough memory exists. No slower fallback is implemented.
        try:
            dist = H.squareform(H.pdist(x, distMethod))
        except Exception:
            print("The Input Array is too big and a squareform cannot be computed")
            raise MemoryError("squareform distance matrix could not be computed")

        touched = N.zeros(m)
        no = 1
        tType = N.zeros(m)
        cClass = N.zeros(m)

        for i in range(m):
            if touched[i] != 0:
                continue

            # Row i of dist holds the distances from point i.
            ind = set2List(N.where(dist[i] <= Eps))[0]

            if len(ind) > 1 and len(ind) < (k + 1):
                tType[i] = 0
                cClass[i] = 0

            if len(ind) == 1:
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1

            if len(ind) >= k + 1:
                tType[i] = 1
                cClass[ind] = N.ones(len(ind)) * no

                # ind grows while it is being iterated: newly reached points
                # are appended below and processed by this same loop.
                for l in ind:
                    touched[l] = 1
                    i1 = set2List(N.where(dist[l] <= Eps))[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1) >= k + 1:
                            tType[l] = 1
                        else:
                            tType[l] = 0

                        for j in range(len(i1)):
                            if touched[i1[j]] == 0:
                                touched[i1[j]] = 1
                                ind.append(i1[j])
                                cClass[i1[j]] = no

                no += 1

        # Points still carrying class 0 were never assigned: mark as outliers.
        i1 = set2List(N.where(cClass == 0))[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except Exception as err:
        errorMsg = "An error occured with the DBSCAN Algorithm\n"
        errorMsg += "Sorry: %s\n\n%s\n" % (type(err), err)
        print(errorMsg)
        return None, None, None, False
def dbscan(x, k, Eps=None, distMethod='euclidean'):
    """Calculate the density based clustering of an array.

    Parameters:
        x: array with one point per row.
        k: a point with at least ``k + 1`` points (itself included) within
            ``Eps`` is treated as a core point.
        Eps: neighborhood radius; computed with ``epsilon(x, k)`` when None.
        distMethod: metric name passed to ``H.pdist``.

    Returns:
        (cClass, tType, Eps, True) on success, where ``cClass`` holds the
        cluster number of every point (-1 for unassigned points) and
        ``tType`` the point type (1 core, 0 border, -1 outlier).
        (None, None, None, False) if anything fails; the error is printed.
    """
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x, k)

        dist = H.squareform(H.pdist(x, distMethod))

        touched = N.zeros(m)
        no = 1
        tType = N.zeros(m)
        cClass = N.zeros(m)

        for i in range(0, m):
            if touched[i] != 0:
                continue

            # Row i of dist holds the distances from point i.
            ind = set2List(N.where(dist[i] <= Eps))[0]

            if len(ind) > 1 and len(ind) < (k + 1):
                tType[i] = 0
                cClass[i] = 0

            if len(ind) == 1:
                tType[i] = -1
                cClass[i] = -1
                touched[i] = 1

            if len(ind) >= k + 1:
                tType[i] = 1
                cClass[ind] = N.ones(len(ind)) * no

                # ind grows while it is being iterated: newly reached points
                # are appended below and processed by this same loop.
                for l in ind:
                    touched[l] = 1
                    i1 = set2List(N.where(dist[l] <= Eps))[0]
                    if len(i1) > 1:
                        cClass[i1] = no
                        if len(i1) >= k + 1:
                            tType[l] = 1
                        else:
                            tType[l] = 0

                        for j in range(len(i1)):
                            if touched[i1[j]] == 0:
                                touched[i1[j]] = 1
                                ind.append(i1[j])
                                cClass[i1[j]] = no

                no += 1

        # Points still carrying class 0 were never assigned: mark as outliers.
        i1 = set2List(N.where(cClass == 0))[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except Exception as err:
        errorMsg = "An error occured with the DBSCAN Algorithm"
        errorMsg += "Sorry: %s\n\n%s\n" % (type(err), err)
        print(errorMsg)
        return None, None, None, False
sym_matrix[i][j]=sym_matrix[j][i]=dendropy.treecalc.symmetric_difference(trees[i],trees[j]) euc_matrix[i][j]=euc_matrix[j][i]=dendropy.treecalc.euclidean_distance(trees[i],trees[j]) #Normalise if specified (normalise here means subtract minimum value and divide by maximum to place each measurement in the range [0,1]) if normalise: rf_matrix = rf_matrix / np.max(rf_matrix) sym_matrix = sym_matrix / np.max(sym_matrix) euc_matrix = euc_matrix / np.max(euc_matrix) linkages = ['single','complete','average','weighted','ward'] matrices = [rf_matrix, sym_matrix, euc_matrix] matrix_names = ['rf','sym', 'euc'] for x in range(len(linkages)): for y in range(len(matrices)): filename = "{0}{1}_{2}_{3}.pdf".format(INPUT_DIR,save_prefix,linkages[x],matrix_names[y]) try: Y = squareform(matrices[y]) link = linkage(Y, linkages[x]) except: Y = matrices[y] link = linkage(Y, linkages[x]) cut = (link[-1][2])*cut_proportion T = fcluster(link,cut,criterion="distance") dendrogram( link, color_threshold=cut, leaf_font_size=font_size, leaf_rotation=90,leaf_label_func=lambda leaf: tree_files[leaf][1+tree_files[leaf].rindex('/'):tree_files[leaf].rindex('.')]+"_"+str(T[leaf]),count_sort=True) title("{0} linkage of {1} matrix".format(linkages[x],matrix_names[y])) axhline(cut,color='grey',ls='dashed') xlabel('Gene') ylabel('Distance') savefig(filename,format='pdf',dpi=1600) clf()
# for f in range(0, len(features)): # if features[f]['property'] in t: # if features[f]['type'] == 'numeric': # pass # elif features[f]['type'] == 'discrete': # if t[features[f]['property']]['value'] == features[f]['value']: # resource_features[snum][f] = float(1.0) print "Found %d distinct resources" % len(resources) rows = None time.sleep(10) print "Computing distances" distances = squareform(pdist(resource_features)) sorted_distance_args = numpy.argsort(distances) print "Writing arff with id %s" % source_id fout = open("%s.arff" % source_id, "w") fout.write("%% Similar resources generated by %s\n" % __file__) fout.write("%% Date: %s\n" % datetime.datetime.today().isoformat()) fout.write("%% Source dataset: %s\n" % opts.dataset) for query_orig in queries: fout.write("%% Query: %s\n" % query_orig) fout.write("%% Found %d distinct resources\n" % len(resources)) if opts.weights: fout.write("%% Weights: %s\n" % ", ".join(opts.weights)) fout.write("@DATA") for r in resources:
method = 'single' data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]]) y = pdist(data, metric=metric) Z = linkage(y, method=method, metric=metric) dendrogram(Z) Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z] # cleaning leaves = list(leaves_list(Z)) count = len(leaves) root = len(Z) + count - 1 X = squareform(y) assert len(X) == count from utils import memoise # bar-joseph optimal ordering ################################################ from barjoseph import optimal leaves = optimal( root, **{ "S": lambda i, j: X[i][j], "left": lambda i: None if i < count else Z[i - count][0], "right": lambda i: None if i < count else Z[i - count][1], "is_leaf": lambda i: i < count, "is_empty": lambda v: v is None,
import pickle
import urllib

import hcluster
import matplotlib.pyplot as plt

# Download a pickled dissimilarity matrix.
# NOTE: unpickling remote data can execute arbitrary code; only use this
# with a source you trust.
url = "http://examples.obspy.org/dissimilarities.pkl"
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: the matrix shown as similarity (1 - dissimilarity).
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# Single-linkage clustering, cut at the dissimilarity threshold.
dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: dendrogram colored at the same threshold.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
# First dendrogram: drawn sideways in the left-hand axes.
fig = plt.figure(figsize=(8, 8))
# add_axes rectangle: x, y, width, height
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names)
# Hide the x tick marks of the left-hand axes.
ax1.set_xticks([])

# Second dendrogram: same linkage, drawn in the top axes without ticks.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# Heatmap: the square distance matrix with rows and columns reordered by
# the leaf order of the two dendrograms.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1, idx2 = Z1['leaves'], Z2['leaves']
D = squareform(data_dist)[idx1, :][:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower',
                      cmap=plt.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Colorbar in its own narrow axes, then write the figure to disk.
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
plt.colorbar(im, cax=axcolor)
fig.savefig('../../results/heatmap.png')
# First dendrogram: drawn sideways in the left-hand axes.
fig = plt.figure(figsize=(8, 8))
# add_axes rectangle: x, y, width, height
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names)
# Hide the x tick marks of the left-hand axes.
ax1.set_xticks([])

# Second dendrogram: same linkage, drawn in the top axes without ticks.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Z2 = dendrogram(Y)
ax2.set_xticks([])
ax2.set_yticks([])

# Heatmap: the square distance matrix with rows and columns reordered by
# the leaf order of the two dendrograms.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1, idx2 = Z1['leaves'], Z2['leaves']
D = squareform(data_dist)[idx1, :][:, idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower',
                      cmap=plt.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# Colorbar in its own narrow axes, then write the figure to disk.
axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
plt.colorbar(im, cax=axcolor)
fig.savefig('../../results/heatmap.png')