def time_subcluster(self, locs):
    # Getting subclusters at Mapzen's limit
    cluster_linkage = linkage(locs, method='ward')
    clusters = fcluster(cluster_linkage, 50, criterion='maxclust')
    cluster_means = np.array([np.mean(locs[np.where(clusters == i)], axis=0)
                              for i in range(1, 51)])
    mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
    mapzen_matrix = self.mapzen_matrix(mapzen_locs)
    # Cluster labels used for mapping back together;
    # subtracting one to use 0 index
    cl = clusters - 1
    # Get a matching distance matrix of lat/lon distance, get ratios
    cluster_km_dist = squareform(pdist(cluster_means,
                                       (lambda u, v: haversine(u, v))))
    dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix, cluster_km_dist))
    # Divide items by mean to normalize a bit
    dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                dist_ratio_matrix.mean()))
    locs_km_dist = squareform(pdist(locs, (lambda u, v: haversine(u, v))))
    # Iterate through each, updating by ratio in dist_ratio_matrix
    it = np.nditer(locs_km_dist, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        it[0] = it[0] * dist_ratio_matrix[cl[it.multi_index[0]]][cl[it.multi_index[1]]]
        it.iternext()
    return locs_km_dist
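# A minimal follow-up sketch (not part of the original source): the matrix
# returned by time_subcluster is a full square matrix, so it can be condensed
# with squareform and fed back into scipy-style linkage. Random points stand
# in here for real lon/lat data, and the haversine/Mapzen scaling is skipped.
import numpy as np
from hcluster import linkage, fcluster, squareform, pdist

locs_km_dist = squareform(pdist(np.random.rand(30, 2)))  # stand-in for the adjusted matrix
Z = linkage(squareform(locs_km_dist), method='ward')
labels = fcluster(Z, 5, criterion='maxclust')  # cap at 5 clusters for the demo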
def gethclinks(exparray, method):
    hcdists = hcluster.pdist(exparray, method)
    hclinks = hcluster.linkage(hcdists)
    links = []
    for hclink in hclinks:
        links.append([int(hclink[0]), int(hclink[1])])
    return links
def c_dists(Y, use_swt=True, level_weights=False):
    w = pywt.Wavelet('sym2')
    if use_swt:
        L = pywt.swt_max_level(Y.shape[0])
        C = [pywt.swt(Y[:, i], w, level=L) for i in range(Y.shape[1])]
        C = [[list(reshape(l[0], -1)) + list(reshape(l[1], -1)) for l in c]
             for c in C]
    else:
        L = pywt.dwt_max_level(Y.shape[0], w)
        C = [pywt.wavedec(Y[:, i], w, level=L) for i in range(Y.shape[1])]
    if level_weights:
        if use_swt:
            raise NameError('No level weights with SWT')
        Wc = [1. for x in range(1, L + 1)]
        D = zeros((len(C), len(C)))
        for i in range(len(C)):
            for j in range(i + 1, len(C)):
                d = sum([distance.cosine(C[i][x], C[j][x]) * Wc[x]
                         for x in range(L)]) / sum(Wc)
                D[i, j] = d
                D[j, i] = d
        return D
    else:
        Cn = []
        for c in C:
            cn = []
            for l in c:
                cn += list(l)
            Cn.append(cn)
        return abs(pdist(Cn, 'cosine'))
def dendro(X, metric='cosine', combine='average', showdendro=True,
           leaf_label_func=identity, **kw):
    Y = pdist(X, metric)
    Z = linkage(Y, combine)
    if showdendro:
        dendrogram(Z, leaf_label_func=leaf_label_func, **kw)
        show()
    return Z
def main():
    print "hello"
    X = rand(10, 100)
    X[0:5, :] *= 2
    Y = pdist(X)
    Z = linkage(Y)
    dendrogram(Z)
def test_pdist(repeat, runs, data):
    np.random.seed(int(time.time()))
    clocks = np.empty((repeat, runs))
    times = np.empty((repeat, runs))
    for i in xrange(repeat):
        for j in xrange(runs):
            t1 = time.time()
            c1 = time.clock()
            dist_m = hcluster.pdist(data)
            c2 = time.clock()
            t2 = time.time()
            clocks[i, j] = c2 - c1
            times[i, j] = t2 - t1
            del dist_m
    mean_clock = np.mean(clocks)
    std_clock = np.std(clocks)
    mean_time = np.mean(times)
    std_time = np.std(times)
    print '%d objects, %d features: clocks=%f +- %f, times=%f +- %f' % (
        data.shape[0], data.shape[1], mean_clock, std_clock,
        mean_time, std_time)
    return mean_time, std_time, mean_clock, std_clock
def test():
    word_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                 'M', 'N', 'O']
    cons_words = ['C', 'B']
    X = rand(15, 2)
    # X = [[0.35, 0.37], [0.40, 0.40], [0.53, 0.53], [0.34, 0.51]]
    print X
    Y = pdist(X)
    print Y
    Z = linkage(Y)
    R = dendrogram(Z)
    index1 = word_list.index(cons_words[0])
    assert index1 >= 0
    path1 = findPath(Z, index1, len(word_list))
    index2 = word_list.index(cons_words[1])
    assert index2 >= 0
    path2 = findPath(Z, index2, len(word_list))
    print Z
    print path1
    print path2
    common = set(path1).intersection(set(path2))
    first = min(common)
    assert first >= len(word_list)
    first -= len(word_list)
    cluster_root = Z[first][0]
    merge1 = findCluster(Z, cluster_root, word_list)
    cluster_root = Z[first][1]
    merge2 = findCluster(Z, cluster_root, word_list)
    print merge1
    print merge2
def ClusteringWithC_Index(Data, NumberOfClusters, NumberofIterationsForCindex,
                          DistanceBetweenAllPairNodesSorted,
                          DistanceMethod='euclidean'):
    x = Data
    D = DistanceBetweenAllPairNodesSorted
    OptimalCenter = []
    Old_C = sys.maxint
    Scl = 0
    N = 0
    for NumberofIterations in xrange(NumberofIterationsForCindex):
        centroid, labels = kmeans2(Data, NumberOfClusters, iter=500,
                                   thresh=1e-05, minit='random',
                                   missing='warn')
        # accumulate the within-cluster distance sum and pair count
        for i in xrange(NumberOfClusters):
            NumberOfNodesInTheClusters = len(x[numpy.where(labels == i)])
            Scl = Scl + numpy.sum(hcluster.pdist(
                x[numpy.where(labels == i)], DistanceMethod))
            N = N + Combination(NumberOfNodesInTheClusters, 2)
        Smin = numpy.sum(D[0:N:1])
        Smax = numpy.sum(D[len(D) - N::1])
        C = (Scl - Smin) / (Smax - Smin)
        Scl = 0
        N = 0
        if C < Old_C:
            Old_C = C
            OptimalCenter = centroid[:]
    return OptimalCenter, Old_C
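# Hedged usage sketch (not part of the original source): the
# DistanceBetweenAllPairNodesSorted argument is presumably the sorted
# condensed distance vector over all points, which the C-index needs for its
# Smin/Smax bounds. Assumes kmeans2 (scipy.cluster.vq) and the Combination
# helper used above are in scope.
import numpy
import hcluster

Data = numpy.random.rand(100, 4)
D_sorted = numpy.sort(hcluster.pdist(Data, 'euclidean'))
centers, c_index = ClusteringWithC_Index(
    Data, NumberOfClusters=5, NumberofIterationsForCindex=20,
    DistanceBetweenAllPairNodesSorted=D_sorted)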
def run(self):
    print 'hello world'
    features = self.getRandomFeatures()
    dist = hcluster.pdist(features)
    print len(dist)
    self.drawDendrogram(dist)
def DBSCAN(Dataset, Epsilon, MinumumPoints, DistanceMethod='euclidean'):
    # Dataset is an m x n matrix: m is the number of items and n is the
    # dimension of the data
    m, n = Dataset.shape
    Visited = numpy.zeros(m, 'int')
    Type = numpy.zeros(m)
    # -1 noise, outlier
    #  0 border
    #  1 core
    ClustersList = []
    Cluster = []
    PointClusterNumber = numpy.zeros(m)
    PointClusterNumberIndex = 1
    PointNeighbors = []
    DistanceMatrix = hcluster.squareform(hcluster.pdist(Dataset, DistanceMethod))
    for i in xrange(m):
        if Visited[i] == 0:
            Visited[i] = 1
            PointNeighbors = numpy.where(DistanceMatrix[i] < Epsilon)[0]
            if len(PointNeighbors) < MinumumPoints:
                Type[i] = -1
            else:
                for k in xrange(len(Cluster)):
                    Cluster.pop()
                Cluster.append(i)
                PointClusterNumber[i] = PointClusterNumberIndex
                PointNeighbors = set2List(PointNeighbors)
                ExpandClsuter(Dataset[i], PointNeighbors, Cluster,
                              MinumumPoints, Epsilon, Visited, DistanceMatrix,
                              PointClusterNumber, PointClusterNumberIndex)
                Cluster.append(PointNeighbors[:])
                ClustersList.append(Cluster[:])
                PointClusterNumberIndex = PointClusterNumberIndex + 1
    return PointClusterNumber
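# Hedged usage sketch (not part of the original source): DBSCAN above also
# needs the set2List and ExpandClsuter helpers defined elsewhere in its
# codebase, plus numpy and hcluster in scope. Labels come back as a vector of
# per-point cluster numbers, with 0 for points never assigned to a cluster.
import numpy

pts = numpy.vstack([numpy.random.randn(50, 2),
                    numpy.random.randn(50, 2) + 5.0])
labels = DBSCAN(pts, Epsilon=0.8, MinumumPoints=5)
print numpy.unique(labels)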
def plotSampleDistanceDendrogram(ds):
    """Plot a sample distance cluster dendrogram using all samples and
    features of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset.
    """
    # generate map from num labels to literal labels
    # to put them on the dendrogram leaves
    lmap = dict([(v, k) for k, v in ds.labels_map.iteritems()])

    # compute distance matrix, default is squared euclidean distance
    dist = clust.pdist(ds.samples)

    # determine clusters
    link = clust.linkage(dist, 'complete')

    # plot dendrogram with literal labels on leaves
    # this does not work with etch's version of matplotlib (verified for
    # matplotlib 0.98)
    clust.dendrogram(
        link, color_threshold=0,
        labels=[lmap[l] for l in ds.labels],
        # all black
        link_color_func=lambda x: 'black',
        distance_sort=False)
    labels = P.gca().get_xticklabels()
    # rotate labels
    P.setp(labels, rotation=90, fontsize=9)
def cluster():
    data = json.load(open("./data/clustering-data.json"))
    vectors = [ufo['vector'] for ufo in data]
    distances = pdist(vectors)
    print distances
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    D = pdist(transpose(Y), 'correlation')
    D = abs(D)
    if return_clusters:
        L = linkage(D, method='single', metric='cosine')
        C = fcluster(L, ct, criterion='distance')
        return cluster_sets(C)
    plot_clusters(D, ct)
def cluster_elut(mat):
    import hcluster
    ymat = hcluster.pdist(mat)
    zmat = hcluster.linkage(ymat)
    figure()
    order = hcluster.dendrogram(zmat)['leaves']
    clf()
    imshow(mat[order, :])
def optics(x, k, distMethod='euclidean'):
    if len(x.shape) > 1:
        m, n = x.shape
    else:
        m = x.shape[0]
        n = 1
    try:
        D = H.squareform(H.pdist(x, distMethod))
        distOK = True
    except:
        print "squareform or pdist error"
        distOK = False

    CD = np.zeros(m)
    RD = np.ones(m) * 1E10

    for i in xrange(m):
        # again, you can use the euclid function if you don't want hcluster
        # d = euclid(x[i], x)
        # d.sort()
        # CD[i] = d[k]
        tempInd = D[i].argsort()
        tempD = D[i][tempInd]
        # tempD.sort()  # we don't use this function as it changes the reference
        CD[i] = tempD[k]  # **2

    order = []
    seeds = np.arange(m, dtype=np.int)
    ind = 0
    while len(seeds) != 1:
        ob = seeds[ind]
        seedInd = np.where(seeds != ob)
        seeds = seeds[seedInd]
        order.append(ob)
        tempX = np.ones(len(seeds)) * CD[ob]
        tempD = D[ob][seeds]
        # you can use this function if you don't want to use hcluster
        # tempD = euclid(x[ob], x[seeds])
        temp = np.column_stack((tempX, tempD))
        mm = np.max(temp, axis=1)
        ii = np.where(RD[seeds] > mm)[0]
        RD[seeds[ii]] = mm[ii]
        ind = np.argmin(RD[seeds])
    order.append(seeds[0])
    RD[0] = 0  # we set this point to 0 as it does not get overwritten
    return RD, CD, order
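# Hedged usage sketch (not part of the original source): RD is the
# reachability distance, CD the core distance, and `order` the processing
# order; plotting RD in that order gives the usual OPTICS reachability plot.
# Assumes the same imports as above (H = hcluster, np = numpy).
import numpy as np

pts = np.vstack([np.random.randn(40, 2), np.random.randn(40, 2) + 4.0])
RD, CD, order = optics(pts, k=4)
reachability = RD[order]  # valleys in this curve correspond to clusters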
def pdist(self, X):
    import hcluster
    import pylab
    Y = hcluster.squareform(hcluster.pdist(array(X), metric=self.metric))
    if self.plot:
        pylab.imshow(Y)
        pylab.show()
    yield Y
def t_dendrogram(X, nclusters):
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand
    # X = X[:10, :]
    Y = pdist(X)
    Z = linkage(Y)
    res = dendrogram(Z)
    show()
def get_clustering_as_tree(vectors, ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete', progress=progress):
    if ward:
        progress.update('Clustering data with Ward linkage and euclidean distances')
        clustering_result = hcluster.ward(vectors)
    else:
        progress.update('Computing distance matrix using "%s" distance'
                        % clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' % clustering_method)
        clustering_result = hcluster.linkage(distance_matrix,
                                             method=clustering_method)
    progress.update('Returning results')
    return hcluster.to_tree(clustering_result)
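# Hedged usage sketch (not part of the original source): hcluster.to_tree
# returns the root ClusterNode of the hierarchy, and pre_order() lists the
# leaf ids in dendrogram order. Assumes the same `progress` object the
# function defaults to.
from numpy.random import rand

tree = get_clustering_as_tree(rand(20, 5), ward=True)
print tree.pre_order()  # leaf indices, left to right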
def generate_dendrogram(root):
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand
    import matplotlib
    X = rand(10, 100)
    X[0:5, :] *= 2
    Y = pdist(X)
    Z = linkage(Y)
    print Y
    print Z
    dendrogram(Z)
def do_clusters(cluster_coords, Labels=None, link_method='single', d=0.2):
    D = pdist(cluster_coords, 'cosine')
    # there may sometimes be very small negative distances, i.e. -2*10**-16
    D = abs(D)
    L = linkage(D, method=link_method, metric='cosine')
    F = fcluster(L, d, criterion='distance')
    C = defaultdict(list)
    for i in range(len(F)):
        if Labels:
            C[F[i]].append(Labels[i])
        else:
            C[F[i]].append(i)
    return C
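# Hedged usage sketch (not part of the original source): cluster a few random
# row vectors by cosine distance and print the label groups, keyed by the
# flat cluster id that fcluster assigns.
from numpy.random import rand

groups = do_clusters(rand(6, 8), Labels=['a', 'b', 'c', 'd', 'e', 'f'], d=0.2)
for cluster_id, members in groups.items():
    print cluster_id, members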
def MVU_slack(datafile, dim=3):
    # takes in a pickled matrix of points - outputs an MVU embedding
    fp = open(datafile)
    pts = pickle.load(fp)
    ans = pickle.load(fp)  # latent space coordinates
    size = len(pts)
    k = len(ans[0])  # the number of latent dimensions

    # mean center coordinates
    m = np.mean(pts, axis=0)
    pts = pts - m

    # TODO: move graph cluster algorithm to own file - write in C?
    # compute the distance matrix and cluster
    Y = hc.squareform(hc.pdist(pts, 'euclidean'))
    res = cluster_graph(Y, fnc='k', size=8)
    x, y = np.nonzero(res & (Y != 0))  # indices of nearest neighbors

    # generate data to write problem in SPDA format
    # TODO: add slack variable block
    indx = []
    for (i, j) in zip(x, y):
        if i <= j:
            indx.append((i, j))
    m = len(indx) + 1
    nblocks = 2
    c = [0.0]
    for (i, j) in indx:
        c.append(Y[i, j] ** 2)
    write_spda_file_slack("../ds/sdp.dat", m, nblocks, size, c, indx, .01)

    # TODO: add some error checking
    os.system("csdp ../ds/sdp.dat ../ds/sdp.sol")
    y, Z, X = read_sol_file_slack("../ds/sdp.sol", size)

    # spectral decomposition of the dual solution (X)
    u, s, v = la.svd(X)
    results = []
    for i in range(dim):
        results.append(np.sqrt(s[i]) * u[:, i])

    # returns the neighborhood graph for proper plotting
    return results, pts, res
def printSummary(updatedtfidfMatrix, queriedSentences):
    print "\n"
    a = pdist(updatedtfidfMatrix, 'cosine')
    print a
    b = linkage(a)
    dendrogram(b)
    show()
    print b
    sumOrder = []
    count = 0
    f = open("foo.txt", "w")
    for i in range(len(b)):
        x = int(b[i][0])
        y = int(b[i][1])
        if x <= (len(queriedSentences) - 1):
            sumOrder.append(x)
        if y <= (len(queriedSentences) - 1):
            sumOrder.append(y)
        if x <= (len(queriedSentences) - 1) and y > (len(queriedSentences) - 1):
            sumOrder.append(y)
        if x > (len(queriedSentences) - 1) and y > (len(queriedSentences) - 1):
            sumOrder.append(x)
    previous = 0
    queriedSentences = [sentence.capitalize() for sentence in queriedSentences]
    for num in sumOrder:
        if num > (len(queriedSentences) - 1):
            f.write('<br></br>')
        else:
            f.write(queriedSentences[num])
            f.write('.')
            f.write(' ')
    f.close()
    with open("foo.txt", "r") as myfile:
        # print myfile
        data = myfile.read()
    print data
    return data
def plot_cluster_tree(cluster_coords, Labels=None, link_method='single',
                      color_thresh=.25, fontsize=8):
    D = pdist(cluster_coords, 'cosine')
    # there may sometimes be very small negative distances, i.e. -2*10**-16
    D = abs(D)
    L = linkage(D, method=link_method, metric='cosine')
    if Labels:
        dendrogram(L, labels=Labels, orientation='left',
                   color_threshold=color_thresh)
    else:
        dendrogram(L, orientation='left', color_threshold=color_thresh)
    pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
    pylab.xlabel('Cosine Distance')
    pylab.ylabel('Strain with the Most Alignments to Each Cluster')
    pylab.rcParams.update({'font.size': fontsize})
    pylab.show()
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)]
    column_names = numpy.array(column_names)
    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = X_L['column_partition']['assignments']
        for i in range(num_cols):
            for j in range(num_cols):
                if assignments[i] == assignments[j]:
                    z_matrix[i, j] += 1
    z_matrix /= float(num_latent_states)
    # hierarchically cluster z_matrix
    Y = hcluster.pdist(z_matrix)
    Z = hcluster.linkage(Y)
    pylab.figure()
    hcluster.dendrogram(Z)
    intify = lambda x: int(x.get_text())
    reorder_indices = map(intify, pylab.gca().get_xticklabels())
    pylab.close()
    # REORDER!
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]
    # actually create figure
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    if num_cols < 14:
        pylab.gca().set_yticks(range(num_cols))
        pylab.gca().set_yticklabels(column_names_reordered, size='x-small')
        pylab.gca().set_xticks(range(num_cols))
        pylab.gca().set_xticklabels(column_names_reordered, rotation=90,
                                    size='x-small')
    else:
        pylab.gca().set_yticks(range(num_cols)[::2])
        pylab.gca().set_yticklabels(column_names_reordered[::2],
                                    size='x-small')
        pylab.gca().set_xticks(range(num_cols)[1::2])
        pylab.gca().set_xticklabels(column_names_reordered[1::2],
                                    rotation=90, size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
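# A simpler route to the same leaf order (sketch, not in the original
# source): hcluster.dendrogram returns the permutation directly under its
# 'leaves' key, so the tick-label round-trip above can be avoided, as the
# cluster_ids snippet later in this collection also does with no_plot.
import numpy
import hcluster

z_matrix = numpy.random.rand(6, 6)
Z = hcluster.linkage(hcluster.pdist(z_matrix))
reorder_indices = hcluster.dendrogram(Z, no_plot=True)['leaves']
z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]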
def OnLeftDClick(self, event):
    """Left double click has been invoked.

    This plugin calls the pdist function from the hcluster package and
    plots the dendrogram using the matplotlib.pyplot package.
    """
    # canvas = event.GetEventObject()
    # model = canvas.getCurrentShape(event)
    devs = self.getDEVSModel()
    if devs:
        Y = pdist(devs.vectors)
        Z = linkage(Y)
        dendrogram(Z)
        show()
    else:
        wx.MessageBox(_("No DEVS model is instantiated.\nGo back to the simulation!"),
                      _("Info"), wx.OK | wx.ICON_INFORMATION)
def cluster_path_times(self, path_times, display):
    recordings = path_times.recordings
    X = []
    for recording in recordings:
        X.append([recording.time.seconds + recording.time.microseconds / 10**6.,
                  recording.date.hour * 60 + recording.date.minute])
    print X
    Y = pdist(X)
    Z = linkage(Y)
    dendrogram(Z)
    for i in range(len(X)):
        print('{0}, {1}'.format(i, X[i]))
    print Z
    print self.calculate_variances(X, Z)
    if display:
        show()
def hierarchical(self, lst, fulldataset):
    # Samples are colored according to their sample type
    label_color = {}
    for i in self.numbering(self.classLabel(lst)):
        r = 'r'
        b = 'b'
        if i[0:6] == 'cancer':
            label_color[i] = r
        elif i[0:6] == 'normal':
            label_color[i] = b
        else:
            continue
    tg = zip(*fulldataset)
    Y = pdist(tg)
    # average linkage is applied
    Z = linkage(Y, method='average')
    sch.set_link_color_palette(['black'])
    # the dendrogram is plotted
    a = sch.dendrogram(Z, leaf_font_size=6, labels=self.newlist)
    ax = plt.gca()
    xlbls = ax.get_xmajorticklabels()
    for lbl in xlbls:
        lbl.set_color(label_color[lbl.get_text()])
    plt.title("Average Hierarchical Clustering Algorithm")
    plt.savefig('Average Hierarchical Clustering.pdf', dpi=500)
    # plt.show()
    plt.close()
    self.labels = array([])
    c = array([1])
    n = array([0])
    # Silhouette test: samples are converted to '0' or '1' for validation
    for i in self.classLabel(lst):
        if i == 'cancer':
            self.labels = np.concatenate([self.labels, c])
        else:
            self.labels = np.concatenate([self.labels, n])
    self.labels = np.delete(self.labels, self.labels[-1])
    self.score = metrics.silhouette_score(Z, self.labels, metric='euclidean')
def oldClusteringWithC_Index(Data, NumberOfClusters, NumberofIterationsForCindex,
                             DistanceBetweenAllPairNodesSorted,
                             DistanceMethod='euclidean'):
    x = Data
    D = DistanceBetweenAllPairNodesSorted
    OptimalCenter = []
    Old_C = sys.maxint
    Scl = 0
    N = 0
    for NumberofIterations in xrange(NumberofIterationsForCindex):
        # init: {'k-means++', 'random', 'points', 'matrix'}
        # 'k-means++' selects initial cluster centers for k-means clustering
        # in a smart way to speed up convergence
        # http://scikit-learn.sourceforge.net/modules/generated/scikits.learn.cluster.KMeans.html#scikits.learn.cluster.KMeans
        classifier = cluster.KMeans(k=NumberOfClusters, init='random',
                                    n_init=10, max_iter=300, tol=0.0001,
                                    verbose=0, random_state=None, copy_x=True)
        y = classifier.fit(x)
        for i in xrange(NumberOfClusters):
            NumberOfNodesInTheClusters = len(x[numpy.where(classifier.labels_ == i)])
            Scl = Scl + numpy.sum(hcluster.pdist(
                x[numpy.where(classifier.labels_ == i)], DistanceMethod))
            N = N + Combination(NumberOfNodesInTheClusters, 2)
        Smin = numpy.sum(D[0:N:1])
        Smax = numpy.sum(D[len(D) - N::1])
        C = (Scl - Smin) / (Smax - Smin)
        Scl = 0
        N = 0
        if C < Old_C:
            Old_C = C
            OptimalCenter = classifier.cluster_centers_[:]
    return OptimalCenter, Old_C
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True,
                norm_rows=True, bigarr=None, **kwargs):
    import plotting as pl
    import hcluster
    arr = (bigarr if bigarr is not None
           else single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows))
    ymat = hcluster.pdist(arr, metric=dist)
    zmat = hcluster.linkage(ymat)
    zmat = np.clip(zmat, 0, 10**8)
    if do_plot:
        pl.figure()
    order = hcluster.dendrogram(zmat, no_plot=bool(1 - do_plot),
                                **kwargs)['leaves']
    if do_plot:
        ax = pl.gca()
        ax.axes.set_xticklabels([gt.id2name[gids[ind]] for ind in order])
        pl.figure()
        pl.imshow(arr[order, :])
    return list(np.array(list(gids))[order])
def DrawDendrogram(feature_vector, obj_names, motion_name):
    distances = pdist(feature_vector)
    linkage_list = ['single', 'average', 'complete']
    Z = linkage(distances, linkage_list[1])
    render = hierarchy.dendrogram(Z,
                                  # p=51,
                                  # truncate_mode='level',
                                  # show_contracted=True,
                                  color_threshold=1.5,
                                  labels=obj_names,
                                  orientation='left',
                                  show_leaf_counts=True,
                                  leaf_font_size=10,
                                  )
    plt.title(motion_name + '_' + linkage_list[1])
    plt.show()
    # plt.savefig(motion_name + '_dendro_complete.png')
    return render
def dendrogramBuild(tfidfMatrix, queriedSentences, degree):
    a = pdist(tfidfMatrix, 'cosine')
    print a
    b = linkage(a)
    print b
    if b[0][2] < degree:
        mag1 = tfidf.magnitude(tfidfMatrix[int(b[0][0])])
        mag2 = tfidf.magnitude(tfidfMatrix[int(b[0][1])])
        if mag1 > mag2:
            print int(b[0][1])
            tfidfMatrix.pop(int(b[0][1]))
            queriedSentences.pop(int(b[0][1]))
        else:
            print int(b[0][0])
            tfidfMatrix.pop(int(b[0][0]))
            queriedSentences.pop(int(b[0][0]))
        dendrogramBuild(tfidfMatrix, queriedSentences, degree)
    return (tfidfMatrix, queriedSentences)
def hierarchical_clusters(log, show_plot=None):
    """Translates traces to Parikh vectors and computes a hierarchical
    clustering in the vector space."""
    def get_parikh(case, alphabet):
        v = zeros(len(alphabet), dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] + 1
        # canonical representation
        m = min(v)
        return v - m
    actsind = {}
    i = 0
    for act in log.get_alphabet():
        actsind[act] = i
        i = i + 1
    uniq_cases = log.get_uniq_cases()
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N, M), dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case, actsind)
        str_i = ','.join(map(str, data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq, metric='euclidean')
    Z = linkage(Y, method='average')
    dendrogram(Z)
    show()
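# Hedged worked example (not part of the original source): the canonical
# Parikh vector of a trace counts each activity, then subtracts the minimum
# count, exactly as get_parikh above does.
# For alphabet {a: 0, b: 1, c: 2} and trace ['a', 'b', 'a', 'c', 'a']:
#   raw counts -> [3, 1, 1], minimum is 1, canonical form -> [2, 0, 0]
from numpy import zeros

alphabet = {'a': 0, 'b': 1, 'c': 2}
trace = ['a', 'b', 'a', 'c', 'a']
v = zeros(len(alphabet), dtype=int)
for act in trace:
    v[alphabet[act]] += 1
print v - v.min()  # -> [2 0 0]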
data = pickle.load(open(path.join(result_path, p['data_label'], 'data.pickle')))

for key, val in data.iteritems():
    # for bla in [1]:
    #     key, val = 'eagle', data['eagle']
    fig = plt.figure()
    fig.canvas.mpl_connect('pick_event', onpick)

    plt.subplot(3, 1, 1)
    plt.title(key)
    proj = np.dot(val['U'][:, 0:2].T, val['vecs'])
    Y = pdist(proj.T)
    Z = linkage(Y)
    dendrogram(Z)

    ax = plt.subplot(3, 1, 2)
    for i in range(proj.shape[1]):
        col = (1 - (val['ratings'][i] / 100.0)) * 0.7
        pt, = ax.plot(proj[0, i], proj[1, i], '.', color=('%f' % col), picker=3)
        ax.text(proj[0, i], proj[1, i], i)
        pt.name = val['keys'][i]

    plt.subplot(3, 1, 3)
from matplotlib.pyplot import show
from hcluster import pdist, linkage, dendrogram
import numpy
from numpy.random import rand

X = rand(10, 100)
X[0:5, :] *= 2
Y = pdist(X)
Z = linkage(Y)
dendrogram(Z)
show()
def feature_extraction_torso_camera(input_torso, input_camera):
    numero_juntas = 15  # number of joints
    frame_rate = 1 / 30.0  # frame rate
    window = 10  # temporal window
    x = input_torso[:, 0::6]
    y = input_torso[:, 1::6]
    z = input_torso[:, 2::6]

    # Guarantees that the number of frames is the same for torso and camera
    # features
    if input_torso.size < input_camera.size:
        [m, n] = input_torso.shape
    else:
        [m, n] = input_camera.shape

    ## Log-Cov of distances between every joint relative to the torso
    distancias = np.zeros((numero_juntas, numero_juntas))
    distancias_total = np.array([[]])
    for frame in range(0, m):
        for i in range(0, 15):
            for j in range(0, 15):
                distancias[i, j] = mat.pdist(
                    [[x[frame, i], y[frame, i], z[frame, i]],
                     [x[frame, j], y[frame, j], z[frame, j]]])
        distlower = np.tril(distancias)
        distupper = np.triu(distancias)
        # elimination of the null diagonal
        distancias_final = distlower[1:, :] + distupper[0:-1, :]
        cov_distancias = np.cov(distancias_final.T)
        # cov_distancias_final = np.triu(cov_distancias)
        # aux = np.reshape(cov_distancias_final.T, (1, 15 * 15)).copy()
        # aux = np.array([aux[aux != 0]])
        aux = apply_log_vect(cov_distancias)
        # aux2 = np.reshape(distlower.T, (1, np.size(distlower))).copy()
        # aux2 = np.array([aux2[aux2 != 0]])
        distancias_total = (np.concatenate([distancias_total, aux])
                            if distancias_total.size else aux)

    ## Distances between every joint and the torso
    distancias = np.zeros((m, numero_juntas))
    for frame in range(0, m):
        for i in range(0, 15):
            distancias[frame, i] = mat.pdist(
                [[x[frame, i], y[frame, i], z[frame, i]],
                 [x[frame, 3], y[frame, 3], z[frame, 3]]])
    distancias_ao_torso = distancias

    ## Absolute velocities
    velocidades = np.zeros((m, numero_juntas))
    for frame in range(0, m):
        if frame == 0:
            anterior = frame
        else:
            anterior = frame - 1
        actual = frame
        for i in range(0, 15):
            velocidades[frame, i] = (mat.pdist(
                [[x[actual, i], y[actual, i], z[actual, i]],
                 [x[anterior, i], y[anterior, i], z[anterior, i]]])) / frame_rate
    velocidades_total = velocidades

    ## Velocities and directions for each dimension {x, y, z}
    vx = np.zeros((m, numero_juntas))
    vy = np.zeros((m, numero_juntas))
    vz = np.zeros((m, numero_juntas))
    dx = np.zeros((m, numero_juntas))
    dy = np.zeros((m, numero_juntas))
    dz = np.zeros((m, numero_juntas))
    for frame in range(0, m):
        if frame == 0:
            anterior = frame
        else:
            anterior = frame - 1
        actual = frame
        for i in range(0, 15):
            dx[frame, i] = x[actual, i] - x[anterior, i]
            dy[frame, i] = y[actual, i] - y[anterior, i]
            dz[frame, i] = z[actual, i] - z[anterior, i]
            vx[frame, i] = dx[frame, i] / frame_rate
            vy[frame, i] = dy[frame, i] / frame_rate
            vz[frame, i] = dz[frame, i] / frame_rate
    velocidade_xyz = np.c_[vx, vy, vz]
    direcao_xyz = np.c_[dx, dy, dz]

    ## Angles of the triangles formed by {shoulders, elbows, hands},
    ## {shoulders, hips, knees} and {hips, knees, feet}
    angulos = np.array([])
    for frame in range(0, m):
        # distance between left shoulder and left elbow
        ombro_esq_cotovelo_esq = mat.pdist(
            [[x[frame, 4 - 1], y[frame, 4 - 1], z[frame, 4 - 1]],
             [x[frame, 5 - 1], y[frame, 5 - 1], z[frame, 5 - 1]]])
        # distance between left shoulder and left hand
        ombro_esq_mao_esq = mat.pdist(
            [[x[frame, 4 - 1], y[frame, 4 - 1], z[frame, 4 - 1]],
             [x[frame, 12 - 1], y[frame, 12 - 1], z[frame, 12 - 1]]])
        # distance between left hand and left elbow
        mao_esq_cotovelo_esq = mat.pdist(
            [[x[frame, 12 - 1], y[frame, 12 - 1], z[frame, 12 - 1]],
             [x[frame, 5 - 1], y[frame, 5 - 1], z[frame, 5 - 1]]])
        angulo_esq1 = np.arccos(
            (ombro_esq_cotovelo_esq**2 + mao_esq_cotovelo_esq**2 -
             ombro_esq_mao_esq**2) /
            (2 * ombro_esq_cotovelo_esq * mao_esq_cotovelo_esq))  # angle

        # distance between right shoulder and right elbow
        ombro_dir_cotovelo_dir = mat.pdist(
            [[x[frame, 6 - 1], y[frame, 6 - 1], z[frame, 6 - 1]],
             [x[frame, 7 - 1], y[frame, 7 - 1], z[frame, 7 - 1]]])
        # distance between right shoulder and right hand
        ombro_dir_mao_dir = mat.pdist(
            [[x[frame, 6 - 1], y[frame, 6 - 1], z[frame, 6 - 1]],
             [x[frame, 13 - 1], y[frame, 13 - 1], z[frame, 13 - 1]]])
        # distance between right hand and right elbow
        mao_dir_cotovelo_dir = mat.pdist(
            [[x[frame, 13 - 1], y[frame, 13 - 1], z[frame, 13 - 1]],
             [x[frame, 7 - 1], y[frame, 7 - 1], z[frame, 7 - 1]]])
        angulo_dir1 = np.arccos(
            (ombro_dir_cotovelo_dir**2 + mao_dir_cotovelo_dir**2 -
             ombro_dir_mao_dir**2) /
            (2 * ombro_dir_cotovelo_dir * mao_dir_cotovelo_dir))  # angle

        # distance between left shoulder and left hip
        ombro_esq_anca_esq = mat.pdist(
            [[x[frame, 4 - 1], y[frame, 4 - 1], z[frame, 4 - 1]],
             [x[frame, 8 - 1], y[frame, 8 - 1], z[frame, 8 - 1]]])
        # distance between left shoulder and left knee
        ombro_esq_joelho_esq = mat.pdist(
            [[x[frame, 4 - 1], y[frame, 4 - 1], z[frame, 4 - 1]],
             [x[frame, 9 - 1], y[frame, 9 - 1], z[frame, 9 - 1]]])
        # distance between left hip and left knee
        anca_esq_joelho_esq = mat.pdist(
            [[x[frame, 8 - 1], y[frame, 8 - 1], z[frame, 8 - 1]],
             [x[frame, 9 - 1], y[frame, 9 - 1], z[frame, 9 - 1]]])
        angulo_esq2 = np.arccos(
            (ombro_esq_anca_esq**2 + anca_esq_joelho_esq**2 -
             ombro_esq_joelho_esq**2) /
            (2 * ombro_esq_anca_esq * anca_esq_joelho_esq))  # angle

        # distance between right shoulder and right hip
        ombro_dir_anca_dir = mat.pdist(
            [[x[frame, 6 - 1], y[frame, 6 - 1], z[frame, 6 - 1]],
             [x[frame, 10 - 1], y[frame, 10 - 1], z[frame, 10 - 1]]])
        # distance between right shoulder and right knee
        ombro_dir_joelho_dir = mat.pdist(
            [[x[frame, 6 - 1], y[frame, 6 - 1], z[frame, 6 - 1]],
             [x[frame, 11 - 1], y[frame, 11 - 1], z[frame, 11 - 1]]])
        # distance between right hip and right knee
        anca_dir_joelho_dir = mat.pdist(
            [[x[frame, 10 - 1], y[frame, 10 - 1], z[frame, 10 - 1]],
             [x[frame, 11 - 1], y[frame, 11 - 1], z[frame, 11 - 1]]])
        angulo_dir2 = np.arccos(
            (ombro_dir_anca_dir**2 + anca_dir_joelho_dir**2 -
             ombro_dir_joelho_dir**2) /
            (2 * ombro_dir_anca_dir * anca_dir_joelho_dir))  # angle

        # distance between left foot and left hip
        pe_esq_anca_esq = mat.pdist(
            [[x[frame, 14 - 1], y[frame, 14 - 1], z[frame, 14 - 1]],
             [x[frame, 8 - 1], y[frame, 8 - 1], z[frame, 8 - 1]]])
        # distance between left foot and left knee
        pe_esq_joelho_esq = mat.pdist(
            [[x[frame, 14 - 1], y[frame, 14 - 1], z[frame, 14 - 1]],
             [x[frame, 9 - 1], y[frame, 9 - 1], z[frame, 9 - 1]]])
        # distance between left hip and left knee
        anca_esq_joelho_esq = mat.pdist(
            [[x[frame, 8 - 1], y[frame, 8 - 1], z[frame, 8 - 1]],
             [x[frame, 9 - 1], y[frame, 9 - 1], z[frame, 9 - 1]]])
        angulo_esq3 = np.arccos(
            (pe_esq_joelho_esq**2 + anca_esq_joelho_esq**2 -
             pe_esq_anca_esq**2) /
            (2 * pe_esq_joelho_esq * anca_esq_joelho_esq))  # angle

        # distance between right foot and right hip
        pe_dir_anca_dir = mat.pdist(
            [[x[frame, 15 - 1], y[frame, 15 - 1], z[frame, 15 - 1]],
             [x[frame, 10 - 1], y[frame, 10 - 1], z[frame, 10 - 1]]])
        # distance between right foot and right knee
        pe_dir_joelho_dir = mat.pdist(
            [[x[frame, 15 - 1], y[frame, 15 - 1], z[frame, 15 - 1]],
             [x[frame, 11 - 1], y[frame, 11 - 1], z[frame, 11 - 1]]])
        # distance between right hip and right knee
        anca_dir_joelho_dir = mat.pdist(
            [[x[frame, 10 - 1], y[frame, 10 - 1], z[frame, 10 - 1]],
             [x[frame, 11 - 1], y[frame, 11 - 1], z[frame, 11 - 1]]])
        angulo_dir3 = np.arccos(
            (pe_dir_joelho_dir**2 + anca_dir_joelho_dir**2 -
             pe_dir_anca_dir**2) /
            (2 * pe_dir_joelho_dir * anca_dir_joelho_dir))  # angle

        an = np.c_[angulo_esq1, angulo_dir1, angulo_esq2, angulo_dir2,
                   angulo_esq3, angulo_dir3]
        angulos = np.r_[angulos, an] if angulos.size else an

    ## Angular difference
    variacao_angulos = np.array([[]])
    for frame in range(0, m):
        if frame == 0:
            anterior = frame
        else:
            anterior = frame - 1
        actual = frame
        dif = np.array([angulos[actual, :] - angulos[anterior, :]])
        variacao_angulos = (np.r_[variacao_angulos, dif]
                            if variacao_angulos.size else dif)

    ## Variation of all joints relative to the camera in {x, y, z}
    x_camera = input_camera[:, 0::6]
    y_camera = input_camera[:, 1::6]
    z_camera = input_camera[:, 2::6]
    dx_camera = np.zeros((m, numero_juntas))
    dy_camera = np.zeros((m, numero_juntas))
    dz_camera = np.zeros((m, numero_juntas))
    vx_camera = np.zeros((m, numero_juntas))
    vy_camera = np.zeros((m, numero_juntas))
    vz_camera = np.zeros((m, numero_juntas))
    for frame in range(0, m):
        if frame == 0:
            anterior = frame
        else:
            anterior = frame - 1
        actual = frame
        for i in range(0, 15):
            dx_camera[frame, i] = x_camera[actual, i] - x_camera[anterior, i]
            dy_camera[frame, i] = y_camera[actual, i] - y_camera[anterior, i]
            dz_camera[frame, i] = z_camera[actual, i] - z_camera[anterior, i]
            vx_camera[frame, i] = dx_camera[frame, i] / frame_rate
            vy_camera[frame, i] = dy_camera[frame, i] / frame_rate
            vz_camera[frame, i] = dz_camera[frame, i] / frame_rate
    variacao_xyz_camera = np.c_[dx_camera, dy_camera, dz_camera]
    velocidade_xyz_camera = np.c_[vx_camera, vy_camera, vz_camera]

    # Absolute velocities relative to the camera
    velocidades = np.zeros((m, numero_juntas))
    for frame in range(0, m):
        if frame == 0:
            anterior = frame
        else:
            anterior = frame - 1
        actual = frame
        for i in range(0, 15):
            velocidades[frame, i] = (mat.pdist(
                [[x_camera[actual, i], y_camera[actual, i], z_camera[actual, i]],
                 [x_camera[anterior, i], y_camera[anterior, i],
                  z_camera[anterior, i]]])) / frame_rate
    velocidades_total_camera = velocidades

    return [distancias_total, distancias_ao_torso, velocidades_total,
            velocidade_xyz, direcao_xyz, angulos, variacao_angulos,
            variacao_xyz_camera, velocidade_xyz_camera,
            velocidades_total_camera]
import numpy as np
import matplotlib.pyplot as plt
from hcluster import pdist, linkage, dendrogram, squareform  # same as importing them from scipy

data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")
data_array = data.view((np.float, len(data.dtype.names)))
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)   # computing the distance
data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!
# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# x, y, width, height
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names)
# adding/removing the axes
ax1.set_xticks([])
from hcluster import pdist, linkage, leaves_list, squareform, dendrogram
import numpy as np
import matplotlib as mp

metric = 'euclidean'
method = 'single'

data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 0, 1, 1, 1],
                  [0, 0, 0, 0, 0, 0, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1, 0, 0, 0],
                  [0, 0, 1, 1, 1, 1, 0, 0, 0]])

y = pdist(data, metric=metric)
Z = linkage(y, method=method, metric=metric)
dendrogram(Z)
Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z]  # cleaning

leaves = list(leaves_list(Z))
count = len(leaves)
root = len(Z) + count - 1

X = squareform(y)
assert len(X) == count

from utils import memoise

# bar-joseph optimal ordering ################################################

from barjoseph import optimal

leaves = optimal(root, **{
def dbscan(x, k, Eps=None, distMethod='euclidean'):
    '''Calculate the density based clustering of an array'''
    try:
        m = x.shape[0]
        if Eps is None:
            Eps = epsilon(x, k)
        # need to test if the squareform will fail:
        # squareform makes a large matrix, and if the input arrays are too
        # large, not enough memory exists
        try:
            dist = H.squareform(H.pdist(x, distMethod))
            distOK = True
        except:
            distOK = False

        x = N.column_stack((N.arange(0, m), x))
        if len(x.shape) > 1:
            m, n = x.shape
        else:
            m = x.shape[0]
            n = 1
        type = N.zeros(m)
        touched = N.zeros(m)
        no = 1
        tType = N.zeros(m)
        cClass = N.zeros(m)

        if distOK:
            for i in xrange(m):
                if touched[i] == 0:
                    ob = x[i]
                    D = dist[ob[0]]
                    # D = euclid(ob[1:], x[:, 1:3])
                    ind = N.where(D <= Eps)
                    ind = set2List(ind)[0]

                    if len(ind) > 1 and len(ind) < (k + 1):
                        tType[i] = 0
                        cClass[i] = 0

                    if len(ind) == 1:
                        tType[i] = -1
                        cClass[i] = -1
                        touched[i] = 1

                    if len(ind) >= k + 1:
                        tType[i] = 1
                        cClass[ind] = N.ones(len(ind)) * no

                        for l in ind:
                            ob2 = x[l]
                            touched[l] = 1
                            D2 = dist[ob2[0]]
                            i1 = N.where(D2 <= Eps)
                            i1 = set2List(i1)[0]
                            if len(i1) > 1:
                                cClass[i1] = no
                                if len(i1) >= k + 1:
                                    tType[ob2[0]] = 1
                                else:
                                    tType[ob2[0]] = 0

                                for j in xrange(len(i1)):
                                    if touched[i1[j]] == 0:
                                        touched[i1[j]] = 1
                                        ind.append(i1[j])
                                        cClass[i1[j]] = no
                        no += 1
        else:
            # This is the very slow way, but it gets around the memory
            # problem: the same loop as above, with D computed per point via
            # euclid(ob[1:], x[:, 1:3]) instead of the precomputed squareform.
            print "The input array is too big and a squareform cannot be computed"
            raise MemoryError("squareform too large")

        i1 = N.where(cClass == 0)
        i1 = set2List(i1)[0]
        cClass[i1] = -1
        tType[i1] = -1
        return cClass, tType, Eps, True
    except:
        errorMsg = "An error occurred with the DBSCAN Algorithm\n"
        errorMsg += "Sorry: %s\n\n%s\n" % (sys.exc_type, sys.exc_value)
        print errorMsg
        return None, None, None, False
def Xi_activity_similarity(X, Y):
    num_different = sum(x != y for x, y in zip(X, Y))
    possibly_different = sum(X) + sum(Y)
    # corresponds to the binary distance in the R function dist
    return num_different / float(possibly_different)

stateD = {'XI': 1, 'bi': 1, 'nd': 0, 'xa': 0}

if '__main__' == __name__:
    # load table
    linefeed = dr_tools.splitlines('chrX_clones_allelic_calls.txt')
    sample_labels = next(linefeed)[1:]
    character_matrixT = []
    for cells in linefeed:
        # values in cells are nd, XI, xa, bi, except the first column, which
        # is the gene symbol
        if any(c != 'nd' for c in cells):
            character_matrixT.append([stateD[c] for c in cells[1:]])

    # make clusters
    character_matrix = numpy.array(character_matrixT).transpose()
    # hcdists = hcluster.pdist(character_matrix, metric='cityblock')
    hcdists = hcluster.pdist(character_matrix, metric=Xi_activity_similarity)
    hclinks = hcluster.linkage(hcdists, method='complete')
    draw_order = hcluster.leaves_list(hclinks)

    # draw tree
    scipyhcluster.dendrogram(hclinks, labels=sample_labels, leaf_rotation=90)
    pylab.subplots_adjust(bottom=0.3)
    pylab.savefig('tree_Xiexpr.pdf')
def feature_extraction(entrada):
    frame_rate = 1.0 / 30.0  # time elapsed between frames, in seconds
    window = 10.0  # frame window for temporal features
    x = entrada[:, 0::6]
    y = entrada[:, 1::6]
    z = entrada[:, 2::6]
    [m, n] = np.shape(entrada)

    ## distances
    distancias = np.zeros((14, 14))
    distancias_total = np.array([])
    for frame in range(0, m):
        for i in range(0, 14):
            for j in range(0, 14):
                distancias[i, j] = mat.pdist(
                    [[x[frame, i], y[frame, i], z[frame, i]],
                     [x[frame, j], y[frame, j], z[frame, j]]])
        distlower = np.tril(distancias)
        distupper = np.triu(distancias)
        # elimination of the zero diagonal
        distancias_final = distlower[1:, :] + distupper[0:-1, :]
        # cov_distancias = np.cov(distancias_final)
        # print np.shape(cov_distancias)
        # cov_distancias_final = np.triu(cov_distancias)
        # aux = np.reshape(cov_distancias_final.T, 1, 15 * 15)
        # aux[aux == 0] = []
        # distancias_total = concatenate((distancias_total, aux))
        aux = np.array([get_triu_cov(distancias_final.T)])
        distancias_total = (np.concatenate([distancias_total, aux])
                            if distancias_total.size else aux)

    ## absolute velocities
    velocidades = np.zeros((np.floor(m / window), 14))
    velocidades_total = np.array([])
    for frame in range(0, int(np.floor(m / window))):
        actual = frame * window
        anterior = frame * (window - 9)
        for i in range(0, 14):
            velocidades[frame, i] = mat.pdist(
                [[x[actual, i], y[actual, i], z[actual, i]],
                 [x[anterior, i], y[anterior, i], z[anterior, i]]]) / (frame_rate * window)
        if frame == int(np.floor(m / window)):
            velocidades_total = np.concatenate(
                (velocidades_total,
                 np.tile(velocidades[frame, :], (m - (window * frame) + window, 1))))
        else:
            velocidades_total = (np.concatenate(
                (velocidades_total, np.tile(velocidades[frame, :], (window, 1))))
                if velocidades_total.size
                else np.tile(velocidades[frame, :], (window, 1)))

    ## velocities and directions along each axis
    vx = np.zeros((np.floor(m / window), 14))
    vy = np.zeros((np.floor(m / window), 14))
    vz = np.zeros((np.floor(m / window), 14))
    dx = np.zeros((np.floor(m / window), 14))
    dy = np.zeros((np.floor(m / window), 14))
    dz = np.zeros((np.floor(m / window), 14))
    direcao_xyz = np.array([])
    velocidade_xyz = np.array([])
    for frame in range(0, int(np.floor(m / window))):
        actual = frame * window
        anterior = frame * window - 9
        for i in range(0, 14):
            dx[frame, i] = x[actual, i] - x[anterior, i]
            dy[frame, i] = y[actual, i] - y[anterior, i]
            dz[frame, i] = z[actual, i] - z[anterior, i]
            vx[frame, i] = dx[frame, i] / (frame_rate * window)
            vy[frame, i] = dy[frame, i] / (frame_rate * window)
            vz[frame, i] = dz[frame, i] / (frame_rate * window)
        if frame == np.floor(m / window):
            aux_v = np.c_[np.tile(vx[frame, :], (m - (window * frame) + window, 1)),
                          np.tile(vy[frame, :], (m - (window * frame) + window, 1)),
                          np.tile(vz[frame, :], (m - (window * frame) + window, 1))]
            velocidade_xyz = (np.concatenate([velocidade_xyz, aux_v])
                              if velocidade_xyz.size else aux_v)
            aux_d = np.array([np.c_[np.tile(dx[frame, :], (m - (window * frame) + window, 1)),
                                    np.tile(dy[frame, :], (m - (window * frame) + window, 1)),
                                    np.tile(dz[frame, :], (m - (window * frame) + window, 1))]])
            direcao_xyz = (np.concatenate([direcao_xyz, aux_d])
                           if direcao_xyz.size else aux_d)
        else:
            aux_v = np.c_[np.tile(vx[frame, :], (window, 1)),
                          np.tile(vy[frame, :], (window, 1)),
                          np.tile(vz[frame, :], (window, 1))]
            velocidade_xyz = (np.concatenate([velocidade_xyz, aux_v])
                              if velocidade_xyz.size else aux_v)
            aux_d = np.c_[np.tile(dx[frame, :], (window, 1)),
                          np.tile(dy[frame, :], (window, 1)),
                          np.tile(dz[frame, :], (window, 1))]
            direcao_xyz = (np.concatenate([direcao_xyz, aux_d])
                           if direcao_xyz.size else aux_d)

    return [distancias_total, velocidades_total, velocidade_xyz, direcao_xyz]
def _do_gen_matrix(self, col_function_name, X_L_list, X_D_list, M_c, T,
                   tablename='', filename=None, col=None, confidence=None,
                   limit=None, submatrix=False):
    if col_function_name == 'mutual information':
        col_function = getattr(self, '_mutual_information')
    elif col_function_name == 'dependence probability':
        col_function = getattr(self, '_dependence_probability')
    elif col_function_name == 'correlation':
        col_function = getattr(self, '_correlation')
    elif col_function_name == 'view_similarity':
        col_function = getattr(self, '_view_similarity')
    else:
        raise Exception('Invalid column function')

    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)]
    column_names = numpy.array(column_names)

    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for i in range(num_cols):
        for j in range(num_cols):
            z_matrix[i][j] = col_function(i, j, X_L_list, X_D_list, M_c, T)

    if col:
        z_column = list(z_matrix[M_c['name_to_idx'][col]])
        data_tuples = zip(z_column, range(num_cols))
        data_tuples.sort(reverse=True)
        if confidence:
            data_tuples = filter(lambda tup: tup[0] >= float(confidence),
                                 data_tuples)
        if limit and limit != float("inf"):
            data_tuples = data_tuples[:int(limit)]
        data = [tuple([d[0] for d in data_tuples])]
        columns = [d[1] for d in data_tuples]
        column_names = [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)]
        column_names = numpy.array(column_names)
        column_names_reordered = column_names[columns]
        if submatrix:
            z_matrix = z_matrix[columns, :][:, columns]
            z_matrix_reordered = z_matrix
        else:
            return {'data': data, 'columns': column_names_reordered}
    else:
        # hierarchically cluster z_matrix
        import hcluster
        Y = hcluster.pdist(z_matrix)
        Z = hcluster.linkage(Y)
        pylab.figure()
        hcluster.dendrogram(Z)
        intify = lambda x: int(x.get_text())
        reorder_indices = map(intify, pylab.gca().get_xticklabels())
        pylab.close()
        # REORDER!
        z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
        column_names_reordered = column_names[reorder_indices]

    title = 'Pairwise column %s for %s' % (col_function_name, tablename)
    if filename:
        utils.plot_matrix(z_matrix_reordered, column_names_reordered, title,
                          filename)

    return dict(matrix=z_matrix_reordered,
                column_names=column_names_reordered,
                title=title,
                filename=filename,
                message="Created " + title)
cosine_similarity(tfidf_matrix, my_ref_len)
my_sentences = sentence.extract_sentence("test3.txt")

# for sentence in my_sentences:
#     print "Sentence ------>", sentence

for sentence in my_sentences:
    tfisf_sum = 0
    for word in sentence:
        for word_score in tfidf_scores:
            if word == word_score:
                tfisf_sum = tfisf_sum + tfidf_scores[word_score]
    tfidf_sentence_scores[sentence] = tfisf_sum

# for sentence in tfidf_sentence_scores:
#     tfisf_scores[sentence] = float(tfidf_sentence_scores[sentence] /
#                                    tfidf_sum(tfidf_scores))

Y = pdist(tfidf_matrix)
Z = linkage(Y)
dendrogram(Z)
show()
def similarity_clusters(log, show_plot=None):
    """Translates traces to Parikh vectors and computes a K-means clustering
    in the vector space."""
    def get_parikh(case, alphabet):
        v = zeros(len(alphabet), dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] + 1
        return v
    actsind = {}
elif o.method == 'numsameCAST':
    m = castoverlap_numgenes
elif o.method == 'numsamemono_norm':
    m = monoallelic_numgenes_norm
elif o.method == 'numsamemono100_norm':
    m = monoallelic_numgenes_norm_100
elif o.method == 'numsameC57_norm':
    m = c57overlap_numgenes_norm
elif o.method == 'numsameCAST_norm':
    m = castoverlap_numgenes_norm
else:
    m = o.method

# make clusters
exparray = character_matrix
hcdists = hcluster.pdist(exparray, metric=m)
hclinks = hcluster.linkage(hcdists, method=o.linkage)
draw_order = hcluster.leaves_list(hclinks)

# draw tree
scipyhcluster.dendrogram(hclinks, labels=samplenames, leaf_rotation=90)
pylab.subplots_adjust(bottom=0.3)
pylab.ylabel('%s (linkage=%s)' % (o.method, o.linkage))
if o.method in ('numsamemono', 'numsameC57', 'numsameCAST',
                'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm'):
    pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 100, 200, 300, 400, 500])
elif o.method in ('numsamemono100', 'numsamemono100_norm'):
    pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 20, 40, 60, 80, 100])
pylab.savefig(o.fig)