def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average', sort_x=True, sort_y=True):
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        # .ix has been removed from pandas; .loc is the label-based equivalent
        a = a.loc[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none', cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()
def rmsd(ref_cds, est_cds):
    """ Root-mean-squared-difference """
    ref_dists = pdist(ref_cds)
    est_dists = pdist(est_cds)
    return np.sqrt(((ref_dists - est_dists) ** 2).mean())
def compute_distance():
    '''
    Computes distances between congress members for a particular category and
    writes out the results in a text file. The web app reads these text files
    to show graphs.
    '''
    category_map = {1: 'Health Care', 2: 'National Security', 3: 'Economy',
                    4: 'Environment', 5: 'Domestic Issues'}
    vm = Voting_Matrix('114')

    for j in range(1, 6):
        votes, member_to_row = vm.generate_matrix(category=[j])
        y = pdist(votes, 'cosine')
        y_dist = squareform(y)
        normed_distances = np.zeros((len(y_dist), len(y_dist)))
        for i in range(len(y_dist)):
            min_value = min(y_dist[i, :])
            max_value = max(y_dist[i, :])
            normed_distances[i, :] = (y_dist[i, :] - min_value) / (max_value - min_value)
        np.savetxt("data/%s114Distance.csv" % category_map[j], normed_distances,
                   delimiter=",", fmt='%5.5f')

    votes, member_to_row = vm.generate_matrix(category=[1, 2, 3, 4, 5])
    y = pdist(votes, 'cosine')
    y_dist = squareform(y)
    normed_distances = np.zeros((len(y_dist), len(y_dist)))
    for i in range(len(y_dist)):
        min_value = min(y_dist[i, :])
        max_value = max(y_dist[i, :])
        normed_distances[i, :] = (y_dist[i, :] - min_value) / (max_value - min_value)
    np.savetxt("data/All Categories114Distance.csv", normed_distances,
               delimiter=",", fmt='%5.5f')

    df = pd.read_csv('../DataCollectionInsertion/Members/114Members.csv')
    row_nums = np.array([member_to_row[str(df.iloc[i]['person__id'])] for i in range(len(df))])
    df['row_nums'] = row_nums
    df.to_csv('../DataCollectionInsertion/Members/114Members.csv', sep=',')
def writePlotMDS(num, nest, seqs, dbfile, mappos, maparr, map2d, outfile, refdb=None, refseqs=None, rg=None): #initialize variables clusters = range(1,len(num)+1) frequency = list(num) #loop through clusters structure = [0 for i in range(len(num))] diversity = [[] for i in range(len(num))] for i in range(len(num)): indices = [j for j, x in enumerate(seqs) if x == nest[i]] db = [dbfile[j] for j in indices] #get cluster structure medoids structs = [j.replace('.','0') for j in db] structs = [j.replace('(','1') for j in structs] structs = [j.replace(')','1') for j in structs] structs = [[int(x) for x in list(j)] for j in structs] dst = pdist(1-np.matrix(structs),'jaccard') dst = np.sum(dst, axis=0) ind = np.argmin(dst) structure[i] = db[ind] #get diversity if refdb is not None: indices = [j for j, x in enumerate(refseqs) if x == nest[i]] db = [refdb[j] for j in indices] db = [x[rg[0]:rg[-1]] for x in db] structs = [j.replace('.','0') for j in db] structs = [j.replace('(','1') for j in structs] structs = [j.replace(')','1') for j in structs] structs = [[int(x) for x in list(j)] for j in structs] if not indices: diversity[i] = [0, 0] else: d = Counter(db) d = sorted(d.items()) n = [x[0] for x in d] #unique structures m = [x[1] for x in d] #frequency divsz = 1 if len(m) > 1: if len(structs) < 2: divsz = 1 if len(structs) < 3: divsz = pdist(1-np.matrix(structs),'jaccard').tolist()[0] else: divsz = min(np.diag(np.matrix(pdist(1-np.matrix(structs),'jaccard')),k=1)) divfreq = max(m)/len(db) diversity[i] = [divsz, divfreq] #write to file with open(outfile+'.csv', 'w') as csvfile: fieldnames = ['cluster', 'xy-coords', 'frequency', 'mediod-structure', 'vectorization'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for i in range(len(num)): basic_dict = {'cluster': clusters[i], 'xy-coords': np.array_str(mappos[i]), 'frequency': frequency[i], 'mediod-structure': structure[i], 'vectorization': maparr[i],} writer.writerow(basic_dict) return{'structs':structure, 'diversity':diversity}
def stress(ref_cds, est_cds):
    """ Kruskal's stress """
    ref_dists = pdist(ref_cds)
    est_dists = pdist(est_cds)
    return np.sqrt(((ref_dists - est_dists) ** 2).sum() / (ref_dists ** 2).sum())
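# A minimal usage sketch for rmsd() and stress() above (the data are invented,
# not from the original source): both functions expect two (n_points, n_dims)
# coordinate arrays with the same number of rows, and both assume numpy is
# imported as np and pdist comes from scipy.spatial.distance.
rng = np.random.default_rng(0)
ref = rng.normal(size=(20, 3))                        # reference configuration
est = ref + rng.normal(scale=0.05, size=ref.shape)    # slightly perturbed copy
print(rmsd(ref, est))    # near 0: pairwise distances almost preserved
print(stress(ref, est))  # Kruskal's stress, likewise near 0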
def initRTI(nodeLocs, delta_p, sigmax2, delta, excessPathLen): # Set up pixel locations as a grid. personLL = nodeLocs.min(axis=0) personUR = nodeLocs.max(axis=0) pixelCoords, xVals, yVals = calcGridPixelCoords(personLL, personUR, delta_p) pixels = pixelCoords.shape[0] #plt.figure(3) #plotLocs(pixelCoords) # Find distances between pixels and transceivers DistPixels = dist.squareform(dist.pdist(pixelCoords)) DistPixelAndNode = dist.cdist(pixelCoords, nodeLocs) DistNodes = dist.squareform(dist.pdist(nodeLocs)) # Find the (inverse of) the Covariance matrix between pixels CovPixelsInv = linalg.inv(sigmax2*np.exp(-DistPixels/delta)) # Calculate weight matrix for each link. nodes = len(nodeLocs) links = nodes*(nodes-1) W = np.zeros((links, pixels)) for ln in range(links): txNum, rxNum = txRxForLinkNum(ln, nodes) ePL = DistPixelAndNode[:,txNum] + DistPixelAndNode[:,rxNum] - DistNodes[txNum,rxNum] inEllipseInd = np.argwhere(ePL < excessPathLen) pixelsIn = len(inEllipseInd) if pixelsIn > 0: W[ln, inEllipseInd] = 1.0 / float(pixelsIn) # Compute the projection matrix inversion = np.dot(linalg.inv(np.dot(W.T, W) + CovPixelsInv), W.T) return (inversion, xVals, yVals)
def test_pdist(self):
    for metric, argdict in self.scipy_metrics.items():
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = pdist(self.X1, metric, **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(metric, **kwargs)
            for X in self.X1, self.spX1:
                yield self.check_pdist, metric, X, dm, Dsq_true, True
            for X in self.X1, self.spX1:
                yield self.check_pdist, metric, X, dm, D_true, False

    for rmetric, (metric, func) in self.reduced_metrics.items():
        argdict = self.scipy_metrics[metric]
        keys = argdict.keys()
        for vals in itertools.product(*argdict.values()):
            kwargs = dict(zip(keys, vals))
            D_true = func(pdist(self.X1, metric, **kwargs), **kwargs)
            Dsq_true = squareform(D_true)
            dm = DistanceMetric(rmetric, **kwargs)
            for X in self.X1, self.spX1:
                yield self.check_pdist, rmetric, X, dm, Dsq_true, True
            for X in self.X1, self.spX1:
                yield self.check_pdist, rmetric, X, dm, D_true, False
def optimal_clustering(df, patch, method='kmeans', statistic='gap', max_K=5):
    if len(patch) == 1:
        return [patch]

    if statistic == 'db':
        if method == 'kmeans':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                kmeans = cluster.KMeans(n_clusters=k).fit(X)
                clustering[k] = pd.DataFrame(kmeans.predict(X), index=patch)
                dist_mu = squareform(pdist(kmeans.cluster_centers_))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

        elif method == 'agglomerative':
            if len(patch) <= 5:
                K_max = 2
            else:
                K_max = min(len(patch) // 2, max_K)
            clustering = {}
            db_index = []
            X = df.loc[patch, :]
            for k in range(2, K_max + 1):
                agglomerative = cluster.AgglomerativeClustering(n_clusters=k, linkage='average').fit(X)
                clustering[k] = pd.DataFrame(agglomerative.fit_predict(X), index=patch)
                tmp = [list(clustering[k][clustering[k][0] == i].index) for i in range(k)]
                centers = np.array([np.mean(X.loc[c, :], axis=0) for c in tmp])
                dist_mu = squareform(pdist(centers))
                sigma = []
                for i in range(k):
                    points_in_cluster = clustering[k][clustering[k][0] == i].index
                    sigma.append(sqrt(X.loc[points_in_cluster, :].var(axis=0).sum()))
                db_index.append(davies_bouldin(dist_mu, np.array(sigma)))
            db_index = np.array(db_index)
            k_optimal = np.argmin(db_index) + 2
            return [list(clustering[k_optimal][clustering[k_optimal][0] == i].index) for i in range(k_optimal)]

    elif statistic == 'gap':
        X = np.array(df.loc[patch, :])
        if method == 'kmeans':
            f = cluster.KMeans
            gaps = gap(X, ks=range(1, min(max_K, len(patch))), method=f)
            k_optimal = list(gaps).index(max(gaps)) + 1
            clustering = pd.DataFrame(f(n_clusters=k_optimal).fit_predict(X), index=patch)
            return [list(clustering[clustering[0] == i].index) for i in range(k_optimal)]

    else:
        raise ValueError('error: only the db and gap statistics are supported')
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False): if gradient: data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0) scale = None metric = 'seuclidean' row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete') else: data_to_plot = data_array.T scale = 0 metric = 'correlation' row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete') assignments = fcluster(row_linkage, n_clusters, criterion='maxclust') cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale, yticklabels=gene_names, row_linkage=row_linkage, row_colors=[settings.STATE_COLORS[i] for i in assignments]) r = np.arange(10, data_array.shape[0], data_array.shape[0]/10) plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5) cm.ax_heatmap.set_xticks(r) cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]]) cm.ax_heatmap.set_xlabel('Pseudotime') cm.ax_heatmap.set_ylabel('Gene') gene_clusters = defaultdict(list) for i, cl in enumerate(assignments): gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i]) return gene_clusters
def kcca(self, X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
    n, p = X.shape
    n, q = Y.shape

    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky

    sqx = SLA.sqrtm(SLA.inv(L))
    sqy = SLA.sqrtm(SLA.inv(N))

    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = SLA.svd(a, full_matrices=False)
    B = Bh.T

    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T

    print(s.shape)
    print(A.shape)
    print(B.shape)

    return s, A, B
def getClosestGID(self, rounded):
    """Calculate the kmer score
    returns (score, (closestGID, dist), (furthestGID, dist))
    """
    LGTs = self.lgtGenomes.keys()
    print(LGTs)
    dgs = []
    for lgt_id in LGTs:
        # print(lgt_id)
        dg1 = pdist([self.genomeTmers[self.lgtGenomes[lgt_id][0]], self.lgtTmer[lgt_id]])
        dg2 = pdist([self.genomeTmers[self.lgtGenomes[lgt_id][1]], self.lgtTmer[lgt_id]])
        dg1_str = ''.join(map(str, dg1))
        dg2_str = ''.join(map(str, dg2))
        rounded_score = float(np.round(dg1 / (dg1 + dg2), decimals=2))
        score = float(dg1 / (dg1 + dg2))
        # print(rounded_score)
        self.lgtScores[lgt_id] = [score, float(np.mean([dg1, dg2])), dg1_str, dg2_str]
        print(self.lgtScores)
        if rounded:
            try:
                self.Dist_dict[rounded_score] += 1
            except KeyError:
                self.Dist_dict[rounded_score] = 1
        else:
            self.Dist_dict[score] = [float(np.mean([dg1, dg2])), dg1_str, dg2_str]
def cengci(data):
    X = data
    distMatrix = pdist(X)  # condensed pairwise distances (not used below)
    Z = linkage(X, 'ward')
    c, coph_dists = cophenet(Z, pdist(X))
    print(c)
    dendrogram(Z)
def distcorr(X, Y, flip=True):
    """ Compute the distance correlation function

    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417

    Taken from: https://gist.github.com/satra/aa3d19a12b74e9ab7941
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != X.shape[0]:
        raise ValueError('Number of samples must match')
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()
    dcov2_xy = (A * B).sum() / float(n * n)
    dcov2_xx = (A * A).sum() / float(n * n)
    dcov2_yy = (B * B).sum() / float(n * n)
    dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if flip:
        dcor = 1 - dcor
    return dcor
def K_SE(xs, ys=None, l=1, deriv=False, wrt='l'):
    l = asarray(l)
    sig = 1  # l[0]
    # l = l[1:]
    xs = ascolumn(xs)
    if ys is None:
        d = squareform(pdist(xs / l, 'sqeuclidean'))
    else:
        ys = ascolumn(ys)
        d = cdist(xs / l, ys / l, 'sqeuclidean')
    cov = exp(-d / 2)
    if not deriv:
        return sig * cov

    grads = []
    if wrt == 'l':
        # grads.append(cov)  # grad of sig
        for i in range(shape(xs)[1]):
            if ys is None:
                grad = sig * cov * squareform(pdist(ascolumn(xs[:, i]), 'sqeuclidean'))
            else:
                grad = sig * cov * cdist(ascolumn(xs[:, i]), ascolumn(ys[:, i]), 'sqeuclidean')
            grad /= l[i] ** 3
            grads.append(grad)
        return sig * cov, grads
    elif wrt == 'y':
        if shape(xs)[0] != 1:
            print('*** x not a row vector ***')
        jac = sig * cov * ((ys - xs) / l ** 2).T
        return sig * cov, jac
def vi_pairwise_matrix(segs, split=False): """Compute the pairwise VI distances within a set of segmentations. If 'split' is set to True, two matrices are returned, one for each direction of the conditional entropy. 0-labeled pixels are ignored. Parameters ---------- segs : iterable of np.ndarray of int A list or iterable of segmentations. All arrays must have the same shape. split : bool, optional Should the split VI be returned, or just the VI itself (default)? Returns ------- vi_sq : np.ndarray of float, shape (len(segs), len(segs)) The distances between segmentations. If `split==False`, this is a symmetric square matrix of distances. Otherwise, the lower triangle of the output matrix is the false split distance, while the upper triangle is the false merge distance. """ d = np.array([s.ravel() for s in segs]) if split: def dmerge(x, y): return split_vi(x, y)[0] def dsplit(x, y): return split_vi(x, y)[1] merges, splits = [squareform(pdist(d, df)) for df in [dmerge, dsplit]] out = merges tri = np.tril(np.ones(splits.shape), -1).astype(bool) out[tri] = splits[tri] else: out = squareform(pdist(d, vi)) return out
def test_PDist():
    targets = np.tile(range(3), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data, 0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c, 'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True, square=True)
    assert_array_almost_equal(dsm1(ds).samples, pear)
    assert_array_almost_equal(dsm2(ds).samples, euc)

    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples, city)
    # length corresponds to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generated label pairs actually reflect the vector form generated by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])

    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples, center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
def scene_based_double_corr(ds): num_subj = ds.shape[0] num_voxels = ds.shape[1] num_scenes = len(ds.a.event_bounds) ds_list = np.zeros((num_subj, num_voxels, num_scenes-1)) prev_cutoff = 0 # average correlations for each scene for i, scene_cutoff in enumerate(ds.a.event_bounds): ds_list[:,:,i] = np.mean(ds.samples[:,:,prev_cutoff:scene_cutoff], axis=2) prev_cutoff = scene_cutoff self_correlations = [] # convert each subject to a vector of its pairwise correlations between scenes for subj in ds_list: corrs = 1 - pdist(subj.T, metric='correlation') self_correlations.append(corrs) # get all pairwise correlations between subjects correlation = 1 - pdist(self_correlations, metric="correlation") # return the average isc scene based correlation return np.mean(correlation)
def _call(self, dataset):
    # Get neural similarity between pairs of targets; both metric cases reduce
    # to the same pdist call on the two target patterns.
    pairsim = dict((pair[0] + '-' + pair[1],
                    pdist([dataset[dataset.sa.targets == pair[0]].samples[0],
                           dataset[dataset.sa.targets == pair[1]].samples[0]],
                          metric=self.pairwise_metric))
                   for pair in self.pairs)
    return Dataset(np.array([pairsim, ]))
def expand_triangular_mesh(c, offset=2, com_bias=(0,0,0)): #find center of mass of current points #adding the bias doesn't really make a big effect unless the bias is very #large which is not what we want u, v, w = np.mean(c, axis=0) + com_bias new_c = [] for pt in c: #coordinates of point a, b, c = pt #distance from point to center, effective coordinates x, y, z = a-u, b-v, c-w #find (rho, theta, phi) theta = np.arctan2(y, x) h = pdist( ((a,b), (u,v)) ) phi = np.arctan2(z, h) rho = pdist( ((a,b,c), (u,v,w)) ) # change rho and call it nu nu = rho + offset # find new effective coordinates of (nu, theta, phi) f = nu*np.sin(phi) g = nu*np.cos(phi) e = g*np.sin(theta) d = g*np.cos(theta) #align new effective coordinates to center of mass new_c.append( (u+d, v+e, w+f) ) return np.squeeze(new_c)
def covMatrix(X, Y, theta, symmetric=True,
              kernel=lambda u, theta: theta[0] * theta[0] * np.exp(-0.5 * u * u / (theta[1] * theta[1])),
              dist_f=None):
    if len(np.array(X).shape) == 1:
        _X = np.array([X]).T
    else:
        _X = np.array(X)
    if len(np.array(Y).shape) == 1:
        _Y = np.array([Y]).T
    else:
        _Y = np.array(Y)

    if dist_f is None:
        if symmetric:
            cM = pdist(_X)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y)
            M = kernel(cM, theta)
            return M
    else:
        if symmetric:
            cM = pdist(_X, dist_f)
            M = squareform(cM)
            M = kernel(M, theta)
            return M
        else:
            cM = cdist(_X, _Y, dist_f)
            M = kernel(cM, theta)
            return M
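# A hypothetical usage sketch for covMatrix() above (the inputs and parameter
# values are my own, not from the source): with the default squared-exponential
# kernel, theta = (amplitude, length scale). Assumes numpy as np and
# scipy.spatial.distance's pdist, cdist, and squareform are imported.
x_train = np.linspace(0.0, 1.0, 5)      # 1-D inputs; reshaped to a column internally
x_test = np.array([0.25, 0.75])
theta = (1.0, 0.2)
K_xx = covMatrix(x_train, x_train, theta)                  # symmetric 5x5 Gram matrix
K_sx = covMatrix(x_test, x_train, theta, symmetric=False)  # 2x5 cross-covariance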
def compare_clusters(args):

    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).to_numpy()
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    outfile = open(args['output'], "w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tMantel_P-value\tCophenetic_Pearson\tCophenetic_P-value\n")

    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).to_numpy()
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"

        outfile.write(i + "\t" + str(mantel_coeff) + "\t" + str(p_value_mantel) + "\t" +
                      str(cophenetic_pearson) + "\t" + str(p_value_cophenetic) + "\n")

    outfile.close()
def mds_author_term(fname1='corr_2d_mds_authors_by_terms.png', fname2='corr_2d_mds_terms_by_authors.png'): bib_data = get_bib_data() mat, authors, term_list, authors_cnt = get_author_by_term_mat(bib_data, tfreq=5, afreq=10) adist = dist.squareform(dist.pdist(mat, 'correlation')) coords,_ = mds(adist, dim=2) fig = plt.figure() fig.clf() plt.xlim(-15, 20) plt.ylim(-15, 20) for label, x, y in zip(authors, coords[:,0], coords[:,1]): plt.annotate(label, xy=(x*20,y*20)) plt.axis('off') plt.savefig(fname1) mat = mat.T tdist = dist.squareform(dist.pdist(mat, 'correlation')) coords, _ = mds(tdist, dim=2) #fig = plt.figure() fig.clf(); plt.xlim(-80,100) plt.ylim(-100,100) for label, x, y in zip(term_list, coords[:,0], coords[:,1]): plt.annotate(label, xy=(x*500,y*500)) plt.axis('off') plt.savefig(fname2)
def collaspe_fclusters(data=None, t=None, row_labels=None, col_labels=None,
                       linkage='average', pdist='euclidean', standardize=3, log=False):
    """A function to collapse flat clusters by averaging the vectors within
    each flat cluster obtained from hierarchical clustering."""
    ## preprocess data
    if log:
        data = np.log2(data + 1.0)
    if standardize == 1:  # Standardize along the columns of data
        data = zscore(data, axis=0)
    elif standardize == 2:  # Standardize along the rows of data
        data = zscore(data, axis=1)

    if row_labels is not None and col_labels is None:  ## only get fclusters for rows
        d = dist.pdist(data, metric=pdist)
        axis = 1  ##!!! haven't checked whether this is correct yet
    elif row_labels is None and col_labels is not None:  ## only get fclusters for cols
        d = dist.pdist(data.T, metric=pdist)
        axis = 0
    # linkage expects the condensed distance vector returned by pdist,
    # not the redundant square form
    Y = sch.linkage(d, method=linkage)
    fclusters = sch.fcluster(Y, t, 'distance')
    fcluster_set = set(fclusters)
    data_cf = []
    for fc in fcluster_set:
        mask = np.where(fclusters == fc)
        data_t = data.T
        vector_avg = np.average(data_t[mask], axis=axis)
        data_cf.append(vector_avg)
    data_cf = np.array(data_cf).T
    return data_cf
def similarities(obj): """ Optional: similarities of entities. """ phi = coo_matrix(np.load(str(obj.directory / 'phi.npy'))) theta = coo_matrix(np.load(str(obj.directory / 'theta.npy'))) with CsvWriter(obj.directory, DocumentSimilarity) as out: distances = squareform(pdist(theta.T, 'cosine')) out << (dict(a_id=i, b_id=sim_i, similarity=1 - row[sim_i]) for i, row in enumerate(distances) for sim_i in row.argsort()[:31] # first 30 similar docs if sim_i != i) with CsvWriter(obj.directory, TopicSimilarity) as out: distances = squareform(pdist(phi.T, 'cosine')) out << (dict(a_id=topic_id(1, i), b_id=topic_id(1, sim_i), similarity=1 - row[sim_i]) for i, row in enumerate(distances) for sim_i in row.argsort()[:] if sim_i != i) with CsvWriter(obj.directory, TermSimilarity) as out: distances = squareform(pdist(phi, 'cosine')) out << (dict(a_modality_id=1, a_id=i, b_modality_id=1, b_id=sim_i, similarity=1 - row[sim_i]) for i, row in enumerate(distances) for sim_i in row.argsort()[:21] # first 20 similar terms if sim_i != i)
def kcca(X, Y, kernel_x=gaussian_kernel, kernel_y=gaussian_kernel, eta=1.0):
    '''
    Kernel canonical correlation analysis
    http://staff.aist.go.jp/s.akaho/papers/ibis00.pdf
    '''
    n, p = X.shape
    n, q = Y.shape

    Kx = DIST.squareform(DIST.pdist(X, kernel_x))
    Ky = DIST.squareform(DIST.pdist(Y, kernel_y))
    J = np.eye(n) - np.ones((n, n)) / n
    M = np.dot(np.dot(Kx.T, J), Ky) / n
    L = np.dot(np.dot(Kx.T, J), Kx) / n + eta * Kx
    N = np.dot(np.dot(Ky.T, J), Ky) / n + eta * Ky

    sqx = LA.sqrtm(LA.inv(L))
    sqy = LA.sqrtm(LA.inv(N))

    a = np.dot(np.dot(sqx, M), sqy.T)
    A, s, Bh = LA.svd(a, full_matrices=False)
    B = Bh.T

    # U = np.dot(np.dot(A.T, sqx), X).T
    # V = np.dot(np.dot(B.T, sqy), Y).T

    return s, A, B
def edge_matrix(abs_times, camera_types):
    """Returns the edge matrix E in non-squareform, calculated using pdist.

    We consider an edge between two metadata entries to exist if:
    a) those entries were taken within 120 seconds of each other, AND
    b) those entries came from different cameras.

    Note: it is recommended that camera_types contains an index for metadata
    entries that did not have a camera type listed.

    Args:
        abs_times (numpy.array): N-dimensional array of floats of absolute
            times, in seconds, that images were taken.
        camera_types (numpy.array): N-dimensional array of ints corresponding
            to the camera types that were used.

    Returns:
        The edge matrix, E.
    """
    assert len(abs_times) == len(camera_types)
    T = pdist(abs_times)
    T = np.asarray(T < 120, dtype=bool)
    C = pdist(camera_types)
    C = np.asarray(C, dtype=bool)
    E = T & C
    return E, T, C
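# A hypothetical usage sketch for edge_matrix() above (the data are invented).
# scipy's pdist expects a 2-D (n_observations, n_features) array, so the scalar
# timestamps and camera ids are passed as single-column arrays here; this is an
# assumption on my part about how the original callers shaped their inputs.
abs_times = np.array([0.0, 30.0, 400.0, 410.0]).reshape(-1, 1)
camera_types = np.array([0, 1, 1, 1]).reshape(-1, 1)
E, T, C = edge_matrix(abs_times, camera_types)
print(E)   # condensed boolean vector, one entry per pair of entries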
def get_monk_human_pspace():
    DATPATH = '/mindhive/dicarlolab/u/rishir/monkey_objectome/monkey_behaviour/tmpmonk.mat'
    dat = scipy.io.loadmat(DATPATH)
    monkdata = dat['monkdata']
    humdata = dat['humdata']
    models_oi = dat['models_oi'][0]

    mcent_symm = get_pspace_centers(monkdata, dim=20, symmetrize=True)
    hcent_asymm, obj_ind = get_precomputed_pspace_centers(models_oi)
    hcent_asymm_all, tmp = get_precomputed_pspace_centers()

    mcent_symm_d = d.pdist(mcent_symm)
    hcent_asymm_d = d.pdist(hcent_asymm)
    rho = utils.nnan_consistency(mcent_symm_d, hcent_asymm_d)
    print(rho)

    mat_data = {}
    mat_data['mcent_symm'] = mcent_symm
    mat_data['hcent_asymm'] = hcent_asymm
    mat_data['mcent_symm_d'] = mcent_symm_d
    mat_data['hcent_asymm_d'] = hcent_asymm_d
    mat_data['hcent_asymm_all'] = hcent_asymm_all
    mat_data['models_oi'] = models_oi
    mat_data['obj_ind'] = obj_ind
    scipy.io.savemat('pspace_res.mat', mat_data)
def get_distance_distro(tracked_objects, sample_size=None, repeat=1, neighbours=0):
    '''
    Given a 2d array of coordinates, randomly sample from it and calculate
    pair-wise distances.
    tracked_objects: input 2d array. Each row is a coordinate.
    sample_size: the size of the random sample to be withdrawn; if None,
        calculate pair-wise distances over the whole input.
    repeat: number of random samples to be drawn.
    neighbours: number of nearest neighbours to include in the analysis.
    return: a 1d array of distances, pooled from all samples.
    '''
    if sample_size is None:
        sample_size = tracked_objects.shape[0]
    dist = []
    ind_array = np.arange(tracked_objects.shape[0])
    for i in range(repeat):
        np.random.shuffle(ind_array)
        selected_objects = tracked_objects[ind_array[:sample_size], :]
        if neighbours <= 0:
            dist.append(pdist(selected_objects))
        else:
            dist_all = squareform(pdist(selected_objects))
            dist_all.partition(neighbours)
            dist_all = dist_all[:, :neighbours + 1]
            dist.append(dist_all[dist_all > 0])
    dist = np.hstack(dist)
    return dist
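# A minimal usage sketch for get_distance_distro() above, with invented data:
# 200 random 2-D positions, subsampled twice, keeping only distances to the
# 5 nearest neighbours of each point. Assumes numpy as np and
# scipy.spatial.distance's pdist/squareform are imported.
positions = np.random.rand(200, 2)
distances = get_distance_distro(positions, sample_size=100, repeat=2, neighbours=5)
print(distances.shape, distances.mean())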
def getDistances(x, attr, var, cidx, didx, cheader): """ This creates the distance array for only discrete or continuous data with no missing data """ from scipy.spatial.distance import pdist, squareform #-------------------------------------------------------------------------- def pre_normalize(x): idx = 0 for i in cheader: cmin = attr[i][2] diff = attr[i][3] x[:,idx] -= cmin x[:,idx] /= diff idx += 1 return x #-------------------------------------------------------------------------- dtype = var['dataType'] numattr = var['NumAttributes'] if(dtype == 'discrete'): return squareform(pdist(x,metric='hamming')) if(dtype == 'mixed'): d_dist = squareform(pdist(x[:,didx],metric='hamming')) xc = pre_normalize(x[:,cidx]) c_dist = squareform(pdist(xc,metric='cityblock')) return np.add(d_dist, c_dist) / numattr else: #(dtype == 'continuous'): return squareform(pdist(pre_normalize(x),metric='cityblock'))
def _compute_AB(x, y, index):
    xa = np.atleast_2d(x)
    ya = np.atleast_2d(y)

    if xa.ndim > 2 or ya.ndim > 2:
        raise ValueError("x and y must be 1d or 2d array_like objects")

    if xa.shape[0] == 1:
        xa = xa.T
    if ya.shape[0] == 1:
        ya = ya.T

    if xa.shape[0] != ya.shape[0]:
        raise ValueError("x and y must have the same sample sizes")

    if index <= 0 or index > 2:
        raise ValueError("index must be in (0, 2]")

    # compute A
    a_kl = squareform(pdist(xa, 'euclidean')**index)
    a_k = np.mean(a_kl, axis=1).reshape(-1, 1)
    a_l = a_k.T
    a = np.mean(a_kl)
    A = a_kl - a_k - a_l + a

    # compute B
    b_kl = squareform(pdist(ya, 'euclidean')**index)
    b_k = np.mean(b_kl, axis=1).reshape(-1, 1)
    b_l = b_k.T
    b = np.mean(b_kl)
    B = b_kl - b_k - b_l + b

    return A, B
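# A sketch of how the doubly-centered matrices from _compute_AB() are typically
# combined into a distance correlation (standard formulas, consistent with
# distcorr() earlier in this section, but the example data are invented).
# Assumes numpy as np.
x = np.random.rand(50)
y = 2.0 * x + 0.1 * np.random.rand(50)
A, B = _compute_AB(x, y, index=1)
n = A.shape[0]
dcov2_xy = (A * B).sum() / (n * n)
dcov2_xx = (A * A).sum() / (n * n)
dcov2_yy = (B * B).sum() / (n * n)
dcor = np.sqrt(dcov2_xy) / np.sqrt(np.sqrt(dcov2_xx * dcov2_yy))
print(dcor)   # close to 1 for a strong monotone relationship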
def kernel_ndvi_outlier_search(band_subset_outlier, sample_k_vol, sample_k_geom, sample_c1, sample_cos_i, sample_slope, sample_ndvi, sample_topo_msk, sample_img_tag, idxRand_dict, hyObj_pointer_dict_list, image_smooth): wave_all_samples = np.empty((len(band_subset_outlier), 0), float) img_name_list = [x.file_name for x in hyObj_pointer_dict_list] group_dict = {} group_dict["img_name_list"] = img_name_list # print(band_subset_outlier) group_dict["band_subset"] = band_subset_outlier for i in range(len(hyObj_pointer_dict_list)): print(hyObj_pointer_dict_list[i].file_name) if hyObj_pointer_dict_list[i].file_type == "ENVI": if hyObj_pointer_dict_list[i].interleave == 'bsq': spec_data = hyObj_pointer_dict_list[i].data[:, idxRand_dict[i][0], idxRand_dict[i][1]].transpose() elif hyObj_pointer_dict_list[i].interleave == 'bil': spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], :, idxRand_dict[i][1]] # hyObj.interleave=='bip': else: spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], idxRand_dict[i][1], :] elif hyObj_pointer_dict_list[i].file_type == "HDF": spec_data = hyObj_pointer_dict_list[i].data[idxRand_dict[i][0], idxRand_dict[i][1], :] else: return None wave_samples = spec_data[:, band_subset_outlier] wave_samples = wave_samples / image_smooth[i][band_subset_outlier] sub_index_img_tag = (sample_img_tag == i + 1) sample_cos_i_sub = sample_cos_i[sub_index_img_tag] sample_slope_sub = sample_slope[sub_index_img_tag] sample_c1_sub = sample_c1[sub_index_img_tag] topo_mask_sub = (sample_cos_i_sub > COSINE_I_MIN_THRESHOLD) & (sample_slope_sub > SLOPE_MIN_THRESHOLD) for iband in range(len(band_subset_outlier)): wave_samples_band = wave_samples[:, iband] topo_coeff, _, _ = generate_topo_coeff_band(wave_samples_band, (wave_samples_band > REFL_MIN_THRESHOLD) & (wave_samples_band < REFL_MAX_THRESHOLD) & topo_mask_sub, sample_cos_i_sub, non_negative=True) correctionFactor = (sample_c1_sub + topo_coeff) / (sample_cos_i_sub + topo_coeff) correctionFactor = correctionFactor * topo_mask_sub + 1.0 * (1 - topo_mask_sub) wave_samples[:, iband] = wave_samples_band * correctionFactor wave_all_samples = np.hstack((wave_all_samples, wave_samples.T)) ndvi_mask = (sample_ndvi > 0.15) & (sample_ndvi <= 0.95) obs_mask = np.isfinite(sample_k_vol) & np.isfinite(sample_k_geom) temp_mask = (wave_all_samples[0] > REFL_MIN_THRESHOLD) & (wave_all_samples[0] < REFL_MAX_THRESHOLD) & (obs_mask) & (ndvi_mask) for iband in range(len(band_subset_outlier)): new_df = pd.DataFrame({'k_geom': sample_k_geom[temp_mask], 'k_vol': sample_k_vol[temp_mask], 'reflectance': wave_all_samples[iband, temp_mask], 'line_id': sample_img_tag[temp_mask], "NDVI": sample_ndvi[temp_mask]}) new_df['ndvi_cut_bins'] = pd.cut(new_df['NDVI'], bins=[0.15, 0.4, 0.7, 0.95], labels=['ndvi_1', 'ndvi_2', 'ndvi_3']) new_df['geom_cut_bins'] = pd.cut(new_df['k_geom'], bins=np.percentile(sample_k_geom[temp_mask], [5, 33, 67, 95]), # [5,33,67,95] #[5,25,50,75,95] labels=['k_geom_1', 'k_geom_2', 'k_geom_3']) # ,'k_geom_4' new_df['vol_cut_bins'] = pd.cut(new_df['k_vol'], bins=np.percentile(sample_k_vol[temp_mask], [5, 33, 67, 95]), # [5,25,50,75,95] # [5,33,67,95] labels=['k_vol_1', 'k_vol_2', 'k_vol_3']) # 'k_vol_4' new_df_bin_group_mean = new_df.groupby(['vol_cut_bins', 'geom_cut_bins', 'ndvi_cut_bins', 'line_id']).median() # mean() new_df_bin_group_mean.reset_index(inplace=True) n_bin = new_df_bin_group_mean.shape[0] // len(hyObj_pointer_dict_list) ss = new_df_bin_group_mean["reflectance"].values bin_avg_array = np.reshape(ss, (n_bin, 
len(hyObj_pointer_dict_list))) bin_mean = np.nanmedian(bin_avg_array, axis=1) inds = np.where(np.isnan(bin_avg_array)) # Place column means in the indices. Align the arrays using take bin_avg_array[inds] = np.take(bin_mean, inds[0]) bin_avg_array = bin_avg_array / bin_mean[:, np.newaxis] bin_avg_array = bin_avg_array[~np.isnan(bin_avg_array[:, 0])] # Y = pdist(bin_avg_array.T, 'seuclidean', V=None) Y = pdist(bin_avg_array.T, 'euclidean', V=None) # Y = pdist(bin_avg_array.T, 'canberra') print(Y) return_dict = {} # H_s = hierarchy.single(Y) H_s = hierarchy.complete(Y) T_ = hierarchy.fcluster(H_s, 1.2, criterion='distance') print("Cluster thres 1.2", T_) return_dict["Cluster thres 1.2"] = T_.tolist() T_ = hierarchy.fcluster(H_s, 1.0, criterion='distance') print("Cluster thres 1.0", T_) return_dict["Cluster thres 1.0"] = T_.tolist() T_ = hierarchy.fcluster(H_s, 0.85, criterion='distance') print("Cluster thres 0.85", T_) return_dict["Cluster thres 0.9"] = T_.tolist() return_dict["distance of metrics"] = Y.tolist() major_label_id = np.bincount(np.array(T_)).argmax() outlier_img_tag = (np.array(T_) != major_label_id) return_dict["outlier_image_bool"] = outlier_img_tag.astype(int).tolist() return_dict["outlier_count"] = int(np.count_nonzero(outlier_img_tag)) group_dict['b' + str(iband + 1)] = return_dict return group_dict
def _initialize_variogram_model( X, y, variogram_model, variogram_model_parameters, variogram_function, nlags, weight, coordinates_type, ): """Initializes the variogram model for kriging. If user does not specify parameters, calls automatic variogram estimation routine. Returns lags, semivariance, and variogram model parameters. Parameters ---------- X: ndarray float array [n_samples, n_dim], the input array of coordinates y: ndarray float array [n_samples], the input array of values to be kriged variogram_model: str user-specified variogram model to use variogram_model_parameters: list user-specified parameters for variogram model variogram_function: callable function that will be called to evaluate variogram model (only used if user does not specify variogram model parameters) nlags: int integer scalar, number of bins into which to group inter-point distances weight: bool boolean flag that indicates whether the semivariances at smaller lags should be weighted more heavily in the automatic variogram estimation coordinates_type: str type of coordinates in X array, can be 'euclidean' for standard rectangular coordinates or 'geographic' if the coordinates are lat/lon Returns ------- lags: ndarray float array [nlags], distance values for bins into which the semivariances were grouped semivariance: ndarray float array [nlags], averaged semivariance for each bin variogram_model_parameters: list parameters for the variogram model, either returned unaffected if the user specified them or returned from the automatic variogram estimation routine """ # distance calculation for rectangular coords now leverages # scipy.spatial.distance's pdist function, which gives pairwise distances # in a condensed distance vector (distance matrix flattened to a vector) # to calculate semivariances... if coordinates_type == "euclidean": d = pdist(X, metric="euclidean") g = 0.5 * pdist(y[:, None], metric="sqeuclidean") # geographic coordinates only accepted if the problem is 2D # assume X[:, 0] ('x') => lon, X[:, 1] ('y') => lat # old method of distance calculation is retained here... # could be improved in the future elif coordinates_type == "geographic": if X.shape[1] != 2: raise ValueError( "Geographic coordinate type only supported for 2D datasets.") x1, x2 = np.meshgrid(X[:, 0], X[:, 0], sparse=True) y1, y2 = np.meshgrid(X[:, 1], X[:, 1], sparse=True) z1, z2 = np.meshgrid(y, y, sparse=True) d = great_circle_distance(x1, y1, x2, y2) g = 0.5 * (z1 - z2)**2.0 indices = np.indices(d.shape) d = d[(indices[0, :, :] > indices[1, :, :])] g = g[(indices[0, :, :] > indices[1, :, :])] else: raise ValueError("Specified coordinate type '%s' is not supported." % coordinates_type) # Equal-sized bins are now implemented. The upper limit on the bins # is appended to the list (instead of calculated as part of the # list comprehension) to avoid any numerical oddities # (specifically, say, ending up as 0.99999999999999 instead of 1.0). # Appending dmax + 0.001 ensures that the largest distance value # is included in the semivariogram calculation. dmax = np.amax(d) dmin = np.amin(d) dd = (dmax - dmin) / nlags bins = [dmin + n * dd for n in range(nlags)] dmax += 0.001 bins.append(dmax) # This old binning method was experimental and doesn't seem # to work too well. Bins were computed such that there are more # at shorter lags. This effectively weights smaller distances more # highly in determining the variogram. As Kitanidis points out, # the variogram fit to the data at smaller lag distances is more # important. 
However, the value at the largest lag probably ends up # being biased too high for the larger values and thereby throws off # automatic variogram calculation and confuses comparison of the # semivariogram with the variogram model. # # dmax = np.amax(d) # dmin = np.amin(d) # dd = dmax - dmin # bins = [dd*(0.5**n) + dmin for n in range(nlags, 1, -1)] # bins.insert(0, dmin) # bins.append(dmax) lags = np.zeros(nlags) semivariance = np.zeros(nlags) for n in range(nlags): # This 'if... else...' statement ensures that there are data # in the bin so that numpy can actually find the mean. If we # don't test this first, then Python kicks out an annoying warning # message when there is an empty bin and we try to calculate the mean. if d[(d >= bins[n]) & (d < bins[n + 1])].size > 0: lags[n] = np.mean(d[(d >= bins[n]) & (d < bins[n + 1])]) semivariance[n] = np.mean(g[(d >= bins[n]) & (d < bins[n + 1])]) else: lags[n] = np.nan semivariance[n] = np.nan lags = lags[~np.isnan(semivariance)] semivariance = semivariance[~np.isnan(semivariance)] # a few tests the make sure that, if the variogram_model_parameters # are supplied, they have been supplied as expected... # if variogram_model_parameters was not defined, then estimate the variogram if variogram_model_parameters is not None: if variogram_model == "linear" and len( variogram_model_parameters) != 2: raise ValueError( "Exactly two parameters required for linear variogram model.") elif (variogram_model in [ "power", "spherical", "exponential", "gaussian", "hole-effect" ] and len(variogram_model_parameters) != 3): raise ValueError("Exactly three parameters required for " "%s variogram model" % variogram_model) else: if variogram_model == "custom": raise ValueError("Variogram parameters must be specified when " "implementing custom variogram model.") else: variogram_model_parameters = _calculate_variogram_model( lags, semivariance, variogram_model, variogram_function, weight) return lags, semivariance, variogram_model_parameters
def dist(self, X, Y=None):
    if Y is X or Y is None:
        d = scidist.pdist(X, self.metric)
        return scidist.squareform(d)
    else:
        return scidist.cdist(X, Y, self.metric)
labels_path = 'labels_comma.csv'

# load the .csv files
M_str, nrRows, nrCols = read_tadpole.load_csv_no_header(path_dataset_matrix)
Wrow, _, _ = read_tadpole.load_csv_no_header(path_dataset_affinity_matrix)
labels, _, _ = read_tadpole.load_csv_no_header(labels_path)

# parameters/preprocessing steps that do not change during the run
Wrow = preprocessing_dataset.str_to_float(Wrow)
M_init = preprocessing_dataset.normalization(M_str)
labels = preprocessing_dataset.str_to_float(labels)
M = np.concatenate((M_init, labels), axis=1)

# ADD A SIMILARITY MEASURE TO THE GRAPH
# Calculate all pairwise distances
distv = distance.pdist(M, metric='correlation')
# Convert to a square symmetric distance matrix
dist = distance.squareform(distv)
sigma = np.mean(dist)
# Get affinity from similarity matrix
sparse_graph = np.exp(-dist**2 / (2 * sigma**2))
# Wrow = Wrow * sparse_graph
Wrow = preprocessing_dataset.normalize_adj(Wrow)

# creation of a mask for the features: 1 for features and 0 for labels
M_features_ones = np.ones(M_init.shape)
M_labels_zeros = np.zeros(labels.shape)
mask_features = np.concatenate((M_features_ones, M_labels_zeros), axis=1)

# computation of the normalized laplacians
Lrow = csgraph.laplacian(Wrow, normed=True)
def __init__(self, points):
    self.points = points
    self.dm = squareform(pdist(points))
def CosineScore(M):
    cos_M = squareform(pdist(M, 'cosine'))
    alpha_cos = softmax(cos_M, axis=0)
    return np.sum(alpha_cos, axis=1)
def manhattenScore(M):
    man_M = squareform(pdist(M, 'cityblock'))
    alpha_man = softmax(man_M, axis=0)
    return np.sum(alpha_man, axis=1)
def euclideanScore(M):
    # Euclidean distance
    euc_M = squareform(pdist(M, 'euclidean'))
    alpha_euc = softmax(euc_M, axis=0)
    return np.sum(alpha_euc, axis=1)
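# A small usage sketch for the three score functions above, on invented data.
# Assumes numpy as np, scipy.spatial.distance's pdist/squareform, and that
# softmax is scipy.special.softmax (my assumption; any column-wise softmax
# accepting an axis argument would behave the same way).
M = np.random.rand(6, 4)      # 6 items described by 4 features
print(CosineScore(M))         # one aggregated score per item
print(manhattenScore(M))
print(euclideanScore(M))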
def process_batch(self, lines): """Helper function to convert raw lines into a mini-batch as a DotDict. """ batch_edges = [] batch_edges_values = [] batch_edges_target = [] # Binary classification targets (0/1) batch_nodes = [] batch_nodes_target = [ ] # Multi-class classification targets (`num_nodes` classes) batch_nodes_coord = [] batch_tour_nodes = [] batch_tour_len = [] for line_num, line in enumerate(lines): line = line.split(" ") # Split into list # Compute signal on nodes nodes = np.ones(self.num_nodes) # All 1s for TSP... # Convert node coordinates to required format nodes_coord = [] for idx in range(0, 2 * self.num_nodes, 2): nodes_coord.append([float(line[idx]), float(line[idx + 1])]) # Compute distance matrix W_val = squareform(pdist(nodes_coord, metric=self.metric)) # Compute adjacency matrix if self.num_neighbors == -1: W = np.ones((self.num_nodes, self.num_nodes)) # Graph is fully connected else: W = np.zeros((self.num_nodes, self.num_nodes)) # Determine k-nearest neighbors for each node knns = np.argpartition(W_val, kth=self.num_neighbors, axis=-1)[:, self.num_neighbors::-1] # Make connections for idx in range(self.num_nodes): W[idx][knns[idx]] = 1 np.fill_diagonal(W, 2) # Special token for self-connections # Convert tour nodes to required format # Don't add final connection for tour/cycle tour_nodes = [ int(node) - 1 for node in line[line.index('output') + 1:-1] ][:-1] # Compute node and edge representation of tour + tour_len tour_len = 0 nodes_target = np.zeros(self.num_nodes) edges_target = np.zeros((self.num_nodes, self.num_nodes)) for idx in range(len(tour_nodes) - 1): i = tour_nodes[idx] j = tour_nodes[idx + 1] nodes_target[ i] = idx # node targets: ordering of nodes in tour edges_target[i][j] = 1 edges_target[j][i] = 1 tour_len += W_val[i][j] # Add final connection of tour in edge target nodes_target[j] = len(tour_nodes) - 1 edges_target[j][tour_nodes[0]] = 1 edges_target[tour_nodes[0]][j] = 1 tour_len += W_val[j][tour_nodes[0]] # Concatenate the data batch_edges.append(W) batch_edges_values.append(W_val) batch_edges_target.append(edges_target) batch_nodes.append(nodes) batch_nodes_target.append(nodes_target) batch_nodes_coord.append(nodes_coord) batch_tour_nodes.append(tour_nodes) batch_tour_len.append(tour_len) # From list to tensors as a DotDict batch = DotDict() batch.edges = np.stack(batch_edges, axis=0) batch.edges_values = np.stack(batch_edges_values, axis=0) batch.edges_target = np.stack(batch_edges_target, axis=0) batch.nodes = np.stack(batch_nodes, axis=0) batch.nodes_target = np.stack(batch_nodes_target, axis=0) batch.nodes_coord = np.stack(batch_nodes_coord, axis=0) batch.tour_nodes = np.stack(batch_tour_nodes, axis=0) batch.tour_len = np.stack(batch_tour_len, axis=0) return batch
newick = final_tree.to_newick() tree = Phylo.read(StringIO(newick), 'newick') Phylo.draw_graphviz(tree, prog='neato') plt.savefig("%s.png" % name, dpi=200, bbox_inches='tight') X += np.random.normal(scale=0.01, size=X.shape) pca = PCA(2) pca.fit(X) # X = pca.transform(X) N, D = X.shape C = pdist(X) tree = to_tree(single(C)) def construct_node(snode): if snode.left is None and snode.right is None: return TreeLeaf(snode.get_id()) node = TreeNode() node.add_child(construct_node(snode.left)) node.add_child(construct_node(snode.right)) return node root = construct_node(tree) linkage_tree = Tree(root=root) plot_tree(linkage_tree, 'linkage_induced')
def add_point(self, newpt=[], newcontour=False): zpos = self.image.list_idx pts = np.array(self.points) if len(pts) >= 3: pts = pts[pts[:, 2] == zpos, :] if len(pts) < 3: if len(newpt) > 0: self.points.append([newpt[0], newpt[1], zpos, 0]) else: thr = 1000. if newcontour == False: cid = np.unique(pts[:, 3]) if len(newpt) > 0: #### add a point dst = np.array(scipydist.cdist([newpt], pts[:, :2])[0]) rg = np.arange(len(dst), dtype=int) + 1 for ci in cid: idx = np.where(pts[:, 3] == ci)[0] rg[idx[-1]] = idx[0] #print rg dst += dst[rg] mi = np.argmin(dst) ci = pts[mi, 3] pts = np.insert(pts, mi + 1, np.append(newpt, [zpos, ci]), axis=0) allpts = [] for ci in cid: #### a simple tsp solver... idx = np.where(pts[:, 3] == ci)[0] pp = pts[idx].copy() path = np.arange(len(pp), dtype=int) dmat = scipydist.squareform(scipydist.pdist(pp[:, :2])) dmat += np.eye(len(dmat)) * thr if len(pp) > 3: niter = 2000 else: niter = 0 calc_dist = lambda path: np.sum([ dmat[path[i], path[(i + 1) % len(path)]] for i in range(len(path)) ]) dst = calc_dist(path) nochange = 0 for k in range(niter): p0 = path.copy() i0, i1 = np.sort(np.random.randint(0, len(pp), 2)) if abs(i0 - i1) % len(path) < 2: continue path = np.hstack([ path[:i0 + 1], path[i0 + 1:i1 + 1][::-1], path[i1 + 1:] ]) d = calc_dist(path) if d >= dst: path = p0 nochange += 1 if nochange > 200: break else: dst = d nochange = 0 allpts.extend([[pp[i][0], pp[i][1], zpos, ci] for i in path]) self.points = [p for p in self.points if p[2] != zpos] self.points.extend(allpts) else: #### start a new contour ci = np.max(pts[:, 3]) + 1 self.points.append([newpt[0], newpt[1], zpos, ci]) np.savetxt("pts.save", self.points, fmt='%d')
def dRMSD(x, y):
    return norm(pdist(x) - pdist(y)) / ((len(x) * (len(x) - 1) / 2) ** 0.5)
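# A minimal usage sketch for dRMSD() above with invented coordinates; norm is
# assumed to be numpy.linalg.norm (or scipy.linalg.norm), and x, y are two
# equally sized (n_points, 3) configurations.
x = np.random.rand(30, 3)
y = x + 0.01 * np.random.randn(30, 3)
print(dRMSD(x, y))   # small value when the two distance matrices nearly agree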
for i, data in enumerate(dataloader, 0): #print('{}/{}'.format(i*bs, nsamples)) if i*bs > nsamples: break else: inputs, _ = data out = newmodel.forward(inputs.to(device)) if i == 0: Out = out.view(inputs.shape[0], -1).cpu().data else : Out = torch.cat((Out, out.view(inputs.shape[0], -1).cpu().data),0) Out = Out.detach() del out # normal ID dist = squareform(pdist(Out,method)) est = estimate(dist,verbose=verbose) id_ori = est[2] ID_original.append(id_ori) # pca data pca = PCA() Out = StandardScaler().fit_transform(Out) pca.fit(Out) # the n.of eigenvalues should be the minimum between the n. of features # and the n. of data points neigs = len(pca.singular_values_) # id given by the pca : 90 % of variance id_pc = get_pca_dim(pca.explained_variance_ratio_,th)
def metric(self, field): """ Compute metric(s) for a single field Parameters ---------- field : numpy array of shape (npx,npx) - npx is number of pixels Cloud mask field. Returns ------- D0 : float Mean geometric nearest neighbour distance between objects. scai : float Simple Convective Aggregation Index. """ cmlab, num = label(field, return_num=True, connectivity=self.con) regions = regionprops(cmlab) xC = [] yC = [] for i in range(num): props = regions[i] if props.area > self.areaMin: y0, x0 = props.centroid xC.append(x0) yC.append(y0) pos = np.vstack((np.asarray(xC), np.asarray(yC))).T nCl = pos.shape[0] # print('Number of regions: ',pos.shape[0],'/',num) if pos.shape[0] < 1: print("No sufficiently large cloud objects, returning nan") return float("nan"), float("nan") if self.bc == "periodic": dist_sq = np.zeros(nCl * (nCl - 1) // 2) # to match the result of pdist for d in range(field.ndim): box = field.shape[d] // 2 pos_1d = pos[:, d][:, np.newaxis] dist_1d = sd.pdist(pos_1d) dist_1d[dist_1d > box * 0.5] -= box dist_sq += dist_1d**2 dist = np.sqrt(dist_sq) else: dist = sd.pdist(pos) D0 = gmean(dist) Nmax = field.shape[0] * field.shape[1] / 2 scai = num / Nmax * D0 / self.L * 1000 # Force SCAI to zero if there is only 1 region (completely aggregated) # This is not strictly consistent with the metric (as D0 is # objectively undefined), but is consistent with its spirit if pos.shape[0] == 1: scai = 0 if self.plot: plt.imshow(field, "gray") plt.title("scai: " + str(round(scai, 3))) plt.show() return D0, scai
def DB_cluster(density_clean, distance_cutoff_percent=0.02, delta_cutoff=0.5, interactive=False): distance_mtx_condensed = pdist(density_clean[:, 0:-1]) density = density_clean[:, -1] cluster_center_index = [] num_datapoint = len(density) cluster = np.full(num_datapoint, -1) num_cluster = 0 distance_cutoff_index = math.ceil(distance_cutoff_percent * len(distance_mtx_condensed)) distance_cutoff = np.sort(distance_mtx_condensed)[distance_cutoff_index] rho, rho_order, nearest_neighbor, delta = calculate_rho_delta( distance_mtx_condensed, density, distance_cutoff) if interactive: global fig, axis, col fig, axis = plt.subplots(dpi=200) mask = delta > delta_cutoff color = np.array([1, 0, 0, 1] * num_datapoint).reshape( -1, 4) #original poitns: all red for index, decider in enumerate(mask): if decider: color[index] = [0, 1, 0, 1] #color those above threshold gree col = axis.scatter(rho, delta, c=color, marker='.', picker=True) axis.set_title("Decision Graph", fontsize='xx-large') axis.set_ylabel(r"$\delta$", fontsize='x-large') axis.set_xlabel(r"$\rho$", fontsize='x-large') fig.canvas.mpl_connect('pick_event', onpick3) plt.show() for index, point_color in enumerate(col.get_facecolors()): point_color = point_color.flatten() if not point_color[0]: #if green, meaning selected num_cluster += 1 cluster[index] = num_cluster cluster_center_index.append(index) plt.close('all') else: for i in range(num_datapoint): if delta[i] >= delta_cutoff: num_cluster += 1 cluster[i] = num_cluster cluster_center_index.append(i) for i in range(num_datapoint): index = rho_order[i] if cluster[index] == -1: cluster[index] = cluster[nearest_neighbor[index]] assert (not np.any(cluster == -1)) return rho, delta, cluster, cluster_center_index, distance_mtx_condensed, distance_cutoff
def compute_ssm(X, metric="seuclidean"):
    """Computes the self-similarity matrix of X."""
    D = distance.pdist(X, metric=metric)
    D = distance.squareform(D)
    D /= D.max()
    return 1 - D
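# A minimal usage sketch for compute_ssm() above: rows of X are per-frame
# feature vectors (invented here), and the result is an n_frames x n_frames
# self-similarity matrix with values in [0, 1] and 1.0 on the diagonal.
# Assumes numpy as np and scipy.spatial.distance imported as distance.
X = np.random.rand(100, 12)        # e.g. 100 frames of 12-D features
S = compute_ssm(X, metric="seuclidean")
print(S.shape, S.min(), S.max())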
def proclus(X, k=2, l=3, minDeviation=0.1, A=30, B=3, niters=30, seed=1234, verboseFlag=True): """ Run PROCLUS on a database to obtain a set of clusters and dimensions associated with each one. Parameters: ---------- - X: the data set - k: the desired number of clusters - l: average number of dimensions per cluster - minDeviation: for selection of bad medoids - A: constant for initial set of medoids - B: a smaller constant than A for the final set of medoids - niters: maximum number of iterations for the second phase - seed: seed for the RNG - verboseFlag: True/False flag for verbosity """ np.random.seed(seed) N, d = X.shape if B > A: raise Exception("B has to be smaller than A.") if l < 2: raise Exception("l must be >=2.") ############################### # 1.) Initialization phase ############################### # first find a superset of the set of k medoids by random sampling idxs = np.arange(N) np.random.shuffle(idxs) S = idxs[0:(A * k)] M = greedy(X, S, B * k) ############################### # 2.) Iterative phase ############################### BestObjective = np.inf # choose a random set of k medoids from M: Mcurr = np.random.permutation(M)[0:k] # M current Mbest = None # Best set of medoids found D = squareform(pdist(X)) # precompute the euclidean distance matrix it = 0 # iteration counter L = [] # locality sets of the medoids, i.e., points within delta_i of m_i. Dis = [] # important dimensions for each cluster assigns = [] # cluster membership assignments while True: it += 1 L = [] for i in range(len(Mcurr)): mi = Mcurr[i] # compute delta_i, the distance to the nearest medoid of m_i: di = D[mi, np.setdiff1d(Mcurr, mi)].min() # compute L_i, points in sphere centered at m_i with radius d_i L.append(np.where(D[mi] <= di)[0]) # find dimensions: Dis = findDimensions(X, k, l, L, Mcurr) # form the clusters: assigns = assignPoints(X, Mcurr, Dis) # evaluate the clusters: ObjectiveFunction = evaluateClusters(X, assigns, Dis, Mcurr) badM = [] # bad medoids Mold = Mcurr.copy() if ObjectiveFunction < BestObjective: BestObjective = ObjectiveFunction Mbest = Mcurr.copy() # compute the bad medoids in Mbest: badM = computeBadMedoids(X, assigns, Dis, Mcurr, minDeviation) if verboseFlag is True: print("bad medoids:") print(badM) if len(badM) > 0: # replace the bad medoids with random points from M: if verboseFlag is True: print("old mcurr:") print(Mcurr) Mavail = np.setdiff1d(M, Mbest) newSel = np.random.choice(Mavail, size=len(badM), replace=False) Mcurr = np.setdiff1d(Mbest, badM) Mcurr = np.union1d(Mcurr, newSel) if verboseFlag is True: print("new mcurr:") print(Mcurr) if verboseFlag is True: print("finished iter: %d" % it) if np.allclose(Mold, Mcurr) or it >= niters: break if verboseFlag is True: print("finished iterative phase...") ############################### # 3.) 
Refinement phase ############################### # compute a new L based on assignments: L = [] for i in range(len(Mcurr)): mi = Mcurr[i] L.append(np.where(assigns == mi)[0]) Dis = findDimensions(X, k, l, L, Mcurr) assigns = assignPoints(X, Mcurr, Dis) # handle outliers: # smallest Manhattan segmental distance of m_i to all (k-1) # other medoids with respect to D_i: deltais = np.zeros(k) for i in range(k): minDist = np.inf for j in range(k): if j != i: dist = manhattanSegmentalDist(X[Mcurr[i]], X[Mcurr[j]], Dis[i]) if dist < minDist: minDist = dist deltais[i] = minDist # mark as outliers the points that are not within delta_i of any m_i: for i in range(len(assigns)): clustered = False for j in range(k): d = manhattanSegmentalDist(X[Mcurr[j]], X[i], Dis[j]) if d <= deltais[j]: clustered = True break if not clustered: # print "marked an outlier" assigns[i] = -1 return (Mcurr, Dis, assigns)
def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. This method takes either a vector array or a distance matrix, and returns a distance matrix. If the input is a vector array, the distances are computed. If the input is a distances matrix, it is returned instead. This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array. If Y is given (default is None), then the returned matrix is the pairwise distance between the arrays from both X and Y. Valid values for metric are: - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']. These metrics support sparse matrix inputs. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] See the documentation for scipy.spatial.distance for details on these metrics. These metrics do not support sparse matrix inputs. Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are valid scipy.spatial.distance metrics), the scikit-learn implementation will be used, which is faster and has support for sparse matrices (except for 'cityblock'). For a verbose description of the metrics from scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics function. Read more in the :ref:`User Guide <metrics>`. Parameters ---------- X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ [n_samples_a, n_features] otherwise Array of pairwise distances between samples, or a feature array. Y : array [n_samples_b, n_features], optional An optional second feature array. Only allowed if metric != "precomputed". metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is "precomputed", X is assumed to be a distance matrix. Alternatively, if metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays from X as input and return a value indicating the distance between them. n_jobs : int The number of jobs to use for the computation. This works by breaking down the pairwise matrix into n_jobs even slices and computing them in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. If using a scipy.spatial.distance metric, the parameters are still metric dependent. See the scipy docs for usage examples. Returns ------- D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] A distance matrix D such that D_{i, j} is the distance between the ith and jth vectors of the given matrix X, if Y is None. If Y is not None, then D_{i, j} is the distance between the ith array from X and the jth array from Y. 
""" if (metric not in _VALID_METRICS and not callable(metric) and metric != "precomputed"): raise ValueError("Unknown metric %s. " "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X elif metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): func = partial(_pairwise_callable, metric=metric, **kwds) else: if issparse(X) or issparse(Y): raise TypeError("scipy distance metrics do not" " support sparse matrices.") dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None X, Y = check_pairwise_arrays(X, Y, dtype=dtype) if n_jobs == 1 and X is Y: return distance.squareform(distance.pdist(X, metric=metric, **kwds)) func = partial(distance.cdist, metric=metric, **kwds) return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
def merge_tracklets(video_id, tracks, obj_id=0, obj_sim_thr=0.9): """Merge tracklets based on feature similarity """ # get list of boxes for each track track_boxes = collections.defaultdict(list) for fn in tracks: for x1, y1, x2, y2, tid in tracks[fn][obj_id]: track_boxes[tid].append( [int(fn), int(x1), int(y1), int(x2), int(y2)]) tids = list(track_boxes.keys()) # load frames, compute features frames = sth_dataset.load_video_frames(video_id) track_feats = track_histogram(frames, track_boxes) # compute pair-wise distances to obtain merge candidates feats = np.array(list(track_feats.values())) similarity = 1 - squareform(pdist(feats, metric='cosine')) # compute similarity pairs cliques = [] idx1, idx2 = np.where(similarity - np.eye(len(track_feats)) > obj_sim_thr) for i1, i2 in zip(idx1, idx2): t1, t2 = tids[i1], tids[i2] new = True for clq in cliques: if t1 in clq and t2 in clq: new = False elif t1 in clq and t2 not in clq: new = False clq.append(t2) elif t1 not in clq and t2 in clq: new = False clq.append(t1) if new: cliques.append([t1, t2]) # convert tids to cliq ids tid2cliqid = {} for c, clq in enumerate(cliques): for tid in clq: tid2cliqid[tid] = c # fill in the singleton tracks for tid in tids: if tid not in tid2cliqid: tid2cliqid[tid] = len( tid2cliqid ) # should technically be a new cliq-id, but this works :) # update track ids for fn in tracks: tids_in_fn = [] keep = [] for k, box in enumerate(tracks[fn][obj_id]): this_tid = tid2cliqid[box[-1]] if this_tid in tids_in_fn: # ignore duplicated tid pass else: tracks[fn][obj_id][k][-1] = this_tid tids_in_fn.append(this_tid) keep.append(k) # delete duplicated tids tracks[fn][obj_id] = tracks[fn][obj_id][keep, :] return tracks
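merge_tracklets depends on project-specific helpers (sth_dataset.load_video_frames, track_histogram) that are not shown here. The standalone sketch below isolates just the merge-candidate step it performs, building a cosine-similarity matrix with squareform(pdist(...)) and thresholding it; the feature array is made up for illustration.

import numpy as np
from scipy.spatial.distance import pdist, squareform

feats = np.random.RandomState(1).rand(6, 16)   # stand-in for per-track histograms
obj_sim_thr = 0.9

similarity = 1 - squareform(pdist(feats, metric='cosine'))
# subtract the identity so a track is never matched with itself
idx1, idx2 = np.where(similarity - np.eye(len(feats)) > obj_sim_thr)
candidate_pairs = [(i, j) for i, j in zip(idx1, idx2) if i < j]
print(candidate_pairs)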
def pairwise_distances(X, Y=None, metric="euclidean", **kwds): """ Compute the distance matrix from a vector array X and optional Y. This method takes either a vector array or a distance matrix, and returns a distance matrix. If the input is a vector array, the distances are computed. If the input is a distances matrix, it is returned instead. This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array. If Y is given (default is None), then the returned matrix is the pairwise distance between the arrays from both X and Y. Please note that support for sparse matrices is currently limited to those metrics listed in pairwise.pairwise_distance_functions. Valid values for metric are: - from scikits.learn: ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] See the documentation for scipy.spatial.distance for details on these metrics. Note in the case of 'euclidean' and 'cityblock' (which are valid scipy.spatial.distance metrics), the values will use the scikits.learn implementation, which is faster and has support for sparse matrices. For a verbose description of the metrics from scikits.learn, see the __doc__ of the sklearn.pairwise.distance_metrics function. Parameters ---------- X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ [n_samples_a, n_features] otherwise Array of pairwise distances between samples, or a feature array. Y : array [n_samples_b, n_features] A second feature array only if X has shape [n_samples_a, n_features]. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric parameter, or a metric listed in pairwise.pairwise_distance_functions. If metric is "precomputed", X is assumed to be a distance matrix and must be square. Alternatively, if metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays from X as input and return a value indicating the distance between them. `**kwds` : optional keyword parameters Any further parameters are passed directly to the distance function. If using a scipy.spatial.distance metric, the parameters are still metric dependent. See the scipy docs for usage examples. Returns ------- D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] A distance matrix D such that D_{i, j} is the distance between the ith and jth vectors of the given matrix X, if Y is None. If Y is not None, then D_{i, j} is the distance between the ith array from X and the jth array from Y. """ if metric == "precomputed": if X.shape[0] != X.shape[1]: raise ValueError("X is not square!") return X elif metric in pairwise_distance_functions: return pairwise_distance_functions[metric](X, Y, **kwds) elif callable(metric): # Check matrices first (this is usually done by the metric). X, Y = check_pairwise_arrays(X, Y) n_x, n_y = X.shape[0], Y.shape[0] # Calculate distance for each element in X and Y.
D = np.zeros((n_x, n_y), dtype='float') for i in range(n_x): start = 0 if X is Y: start = i for j in range(start, n_y): # Kernel assumed to be symmetric. D[i][j] = metric(X[i], Y[j], **kwds) if X is Y: D[j][i] = D[i][j] return D else: # Note: the distance module doesn't support sparse matrices! if type(X) is csr_matrix: raise TypeError("scipy distance metrics do not" " support sparse matrices.") if Y is None: return distance.squareform(distance.pdist(X, metric=metric, **kwds)) else: if type(Y) is csr_matrix: raise TypeError("scipy distance metrics do not" " support sparse matrices.") return distance.cdist(X, Y, metric=metric, **kwds)
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=35, PLOT=False): ''' ARGUMENTS: - fileName: the name of the WAV file to be analyzed - numOfSpeakers the number of speakers (clusters) in the recording (<=0 for unknown) - mtSize (opt) mid-term window size - mtStep (opt) mid-term window step - stWin (opt) short-term window size - LDAdim (opt) LDA dimension (0 for no LDA) - PLOT (opt) 0 for not plotting the results 1 for plottingy ''' [Fs, x] = audioBasicIO.readAudioFile(fileName) x = audioBasicIO.stereo2mono(x) Duration = len(x) / Fs [ Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerAll")) [ Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 ] = aT.loadKNNModel(os.path.join("data", "knnSpeakerFemaleMale")) [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5)) MidTermFeatures2 = numpy.zeros( (MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1])) for i in range(MidTermFeatures.shape[1]): curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1 curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i] MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001 MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001 MidTermFeatures = MidTermFeatures2 # TODO # SELECT FEATURES: #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20]; # SET 0A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100]; # SET 0B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96, # 97,98, 99,100]; # SET 0C iFeaturesSelect = [ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 ] # SET 1A #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 1B #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 1C #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]; # SET 2A #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100]; # SET 2B #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100]; # SET 2C #iFeaturesSelect = range(100); # SET 3 #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010 MidTermFeatures = MidTermFeatures[iFeaturesSelect, :] (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T]) MidTermFeaturesNorm = MidTermFeaturesNorm[0].T numOfWindows = MidTermFeatures.shape[1] # remove outliers: DistancesAll = numpy.sum(distance.squareform( distance.pdist(MidTermFeaturesNorm.T)), axis=0) MDistancesAll = numpy.mean(DistancesAll) 
iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0] # TODO: Combine energy threshold for outlier removal: #EnergyMin = numpy.min(MidTermFeatures[1,:]) #EnergyMean = numpy.mean(MidTermFeatures[1,:]) #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0 #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0] #print iNonOutLiers perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows MidTermFeaturesNormOr = MidTermFeaturesNorm MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers] # LDA dimensionality reduction: if LDAdim > 0: #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin)); # extract mid-term features with minimum step: mtWinRatio = int(round(mtSize / stWin)) mtStepRatio = int(round(stWin / stWin)) mtFeaturesToReduce = [] numOfFeatures = len(ShortTermFeatures) numOfStatistics = 2 #for i in range(numOfStatistics * numOfFeatures + 1): for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append([]) for i in range(numOfFeatures): # for each of the short-term features: curPos = 0 N = len(ShortTermFeatures[i]) while (curPos < N): N1 = curPos N2 = curPos + mtWinRatio if N2 > N: N2 = N curStFeatures = ShortTermFeatures[i][N1:N2] mtFeaturesToReduce[i].append(numpy.mean(curStFeatures)) mtFeaturesToReduce[i + numOfFeatures].append( numpy.std(curStFeatures)) curPos += mtStepRatio mtFeaturesToReduce = numpy.array(mtFeaturesToReduce) mtFeaturesToReduce2 = numpy.zeros( (mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1])) for i in range(mtFeaturesToReduce.shape[1]): curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1 curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2 [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1) [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2) mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i] mtFeaturesToReduce2[ mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001 mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1)::, i] = P2 + 0.0001 mtFeaturesToReduce = mtFeaturesToReduce2 mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :] #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010 (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T]) mtFeaturesToReduce = mtFeaturesToReduce[0].T #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0) #MDistancesAll = numpy.mean(DistancesAll) #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0] #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2] Labels = numpy.zeros((mtFeaturesToReduce.shape[1], )) LDAstep = 1.0 LDAstepRatio = LDAstep / stWin #print LDAstep, LDAstepRatio for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio) clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis( n_components=LDAdim) clf.fit(mtFeaturesToReduce.T, Labels) MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T if numOfSpeakers <= 0: sRange = range(2, 10) else: sRange = [numOfSpeakers] clsAll = [] silAll = [] centersAll = [] for iSpeakers in sRange: k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers) k_means.fit(MidTermFeaturesNorm.T) cls = k_means.labels_ means = k_means.cluster_centers_ # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T)) clsAll.append(cls) centersAll.append(means) silA = [] silB = [] 
for c in range(iSpeakers ): # for each speaker (i.e. for each extracted cluster) clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float( len(cls)) if clusterPerCent < 0.020: silA.append(0.0) silB.append(0.0) else: MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c] # get subset of feature vectors Yt = distance.pdist( MidTermFeaturesNormTemp.T ) # compute average distance between samples that belong to the cluster (a values) silA.append(numpy.mean(Yt) * clusterPerCent) silBs = [] for c2 in range( iSpeakers ): # compute distances from samples of other clusters if c2 != c: clusterPerCent2 = numpy.nonzero( cls == c2)[0].shape[0] / float(len(cls)) MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2] Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T) silBs.append( numpy.mean(Yt) * (clusterPerCent + clusterPerCent2) / 2.0) silBs = numpy.array(silBs) silB.append( min(silBs) ) # ... and keep the minimum value (i.e. the distance from the "nearest" cluster) silA = numpy.array(silA) silB = numpy.array(silB) sil = [] for c in range(iSpeakers): # for each cluster (speaker) sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001)) # compute silhouette silAll.append(numpy.mean(sil)) # keep the AVERAGE SILHOUETTE #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5))) imax = numpy.argmax(silAll) # position of the maximum silhouette value nSpeakersFinal = sRange[imax] # optimal number of clusters # generate the final set of cluster labels # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window) cls = numpy.zeros((numOfWindows, )) for i in range(numOfWindows): j = numpy.argmin(numpy.abs(i - iNonOutLiers)) cls[i] = clsAll[imax][j] # Post-process method 1: hmm smoothing for i in range(1): startprob, transmat, means, cov = trainHMM_computeStatistics( MidTermFeaturesNormOr, cls) hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag") # hmm training hmm.startprob_ = startprob hmm.transmat_ = transmat hmm.means_ = means hmm.covars_ = cov cls = hmm.predict(MidTermFeaturesNormOr.T) # Post-process method 2: median filtering: cls = scipy.signal.medfilt(cls, 13) cls = scipy.signal.medfilt(cls, 11) sil = silAll[imax] # final silhouette classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)] # load ground-truth if available gtFile = fileName.replace('.wav', '.segments') # path of the annotation file if os.path.isfile(gtFile): # if ground truth exists [segStart, segEnd, segLabels] = readSegmentGT(gtFile) # read GT data flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep) # convert to flags if PLOT: fig = plt.figure() if numOfSpeakers > 0: ax1 = fig.add_subplot(111) else: ax1 = fig.add_subplot(211) ax1.set_yticks(numpy.array(range(len(classNames)))) ax1.axis((0, Duration, -1, len(classNames))) ax1.set_yticklabels(classNames) ax1.plot(numpy.array(range(len(cls))) * mtStep + mtStep / 2.0, cls) if os.path.isfile(gtFile): if PLOT: ax1.plot( numpy.array(range(len(flagsGT))) * mtStep + mtStep / 2.0, flagsGT, 'r') purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization( cls, flagsGT) print("{0:.1f}\t{1:.1f}".format(100 * purityClusterMean, 100 * puritySpeakerMean)) if PLOT: plt.title( "Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format( 100 * purityClusterMean, 100 * puritySpeakerMean)) if PLOT: plt.xlabel("time (seconds)") #print sRange, silAll if numOfSpeakers <= 0: plt.subplot(212) plt.plot(sRange, silAll) plt.xlabel("number of clusters")
plt.ylabel("average clustering's silhouette") plt.show() return cls
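The cluster-count selection above builds a silhouette-style score from within-cluster pdist distances and between-cluster cdist distances. The following is a compact, hedged restatement of that idea on generic inputs; the original additionally weights the terms by cluster size and skips very small clusters.

import numpy as np
from scipy.spatial import distance

def mean_silhouette(features, labels):
    # features: (n_features, n_windows), labels: (n_windows,)
    # Rough silhouette-style score; assumes at least two clusters.
    scores = []
    for c in np.unique(labels):
        Xc = features[:, labels == c].T
        a = np.mean(distance.pdist(Xc)) if Xc.shape[0] > 1 else 0.0
        b = min(np.mean(distance.cdist(Xc, features[:, labels == c2].T))
                for c2 in np.unique(labels) if c2 != c)
        scores.append((b - a) / (max(a, b) + 1e-5))
    return float(np.mean(scores))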
'sparse_fct': 'global_sparse', 'smooth_param': 2, 'init': 'convex', 'neg_time': False, 'verbose': 0, 'maxcount': 50 #, #'sparse_param':sparseness } print("analysing files in {}".format(basepath)) for sparseness in np.arange(0.1, 1.000001, 0.1): ts_path = os.path.join( basepath, '_'.join([ "nnmf", str(config_dict['num_components']), "sm{}".format(config_dict['smooth_param']), config_dict['init'], 'sp{:02.0f}'.format(sparseness * 10), datafilename ])) decomposition = ia.TimeSeries() decomposition.load(ts_path) signal = ia.TrialMean()(ia.CutOut(response_window)(decomposition)) mode_cor = ia.CalcStimulusDrive()(signal) mask = mode_cor._series.squeeze() < 0.5 if np.sum(mask) > 1: #if there are stimulus driven components selected_modes = ia.SelectObjects()(decomposition, mask) cor = np.nanmax(1 - pdist(selected_modes.base._series, 'correlation')) else: cor = np.nanmax(1 - pdist(decomposition.base._series, 'correlation')) print("{}\t{}".format(sparseness, cor))
datax=dataText_features[Trmask,:] clusterslabels=[ClusterLabel[i] for i in dayindexes] GTclusterslabels=numpy.unique(clusterslabels,return_inverse=True,return_counts=True) dayTrlabels=numpy.asarray(ClusterLabel)[Trmask] #kmeans = KMeans(n_clusters=len(GTclusterslabels[0]), random_state=0).fit(datax) #db = DBSCAN(eps=2, min_samples=2).fit(datax) pred_y=[] for x1 in range(datax.shape[0]): if sum(datax[x1,:]==0)==datax.shape[1]: continue for x2 in range(x1+1,datax.shape[0]): if sum(datax[x2,:]==0)==datax.shape[1]: continue d = distance.pdist(numpy.vstack((datax[x1,:],datax[x2,:])), metric='cosine')[0] if numpy.isnan(d): print(distance.pdist(numpy.vstack((datax[x1,:],datax[x2,:])), metric='cosine')) print(x1, x2) print(datax.shape) print(Trainingindexes) print(sum(datax[x1,:]==0)) # sum(numpy.isnan(datax[x1,:])) break if dayTrlabels[x1]==dayTrlabels[x2]: outfile.append(str(d)+","+str(1)) else: outfile.append(str(d)+","+str(-1)) pred_y.append(d) dayLabelsList=[] for x1 in range(dayTrlabels.shape[0]):
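The nested loop above computes cosine distances one pair at a time; assuming datax as defined above, the same distances (excluding all-zero rows, which would produce NaN) can be obtained in a single vectorized call:

import numpy
from scipy.spatial import distance

nonzero = ~numpy.all(datax == 0, axis=1)    # drop all-zero rows (NaN cosine distance)
dist_matrix = distance.squareform(distance.pdist(datax[nonzero], metric='cosine'))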
def plot_heatmap( self, kind="final", min_freq=0.01, threshold=2, name=True, indirect=True, figsize=None, max_number_factors=5, aspect=1, cmap="RdBu_r", **kwargs, ): """Plot clustered heatmap of predicted motif activity. Parameters ---------- kind : str, optional Which data type to use for plotting. Default is 'final', which will plot the result of the rank aggregation. Other options are 'freq' for the motif frequencies, or any of the individual activities such as 'rf.score'. min_freq : float, optional Minimum frequency of motif occurrence. threshold : float, optional Minimum activity (absolute) of the rank aggregation result. name : bool, optional Use factor names instead of motif names for plotting. indirect : bool, optional Include indirect factors (computationally predicted or non-curated). Default is True. max_number_factors : int, optional Truncate the list of factors to this maximum size. figsize : tuple, optional Tuple of figure size (width, height). aspect : int, optional Aspect ratio for tweaking the plot. cmap : str, optional Color paletter to use, RdBu_r by default. kwargs : other keyword arguments All other keyword arguments are passed to sns.heatmap Returns ------- cg : ClusterGrid A seaborn ClusterGrid instance. """ filt = np.any(np.abs(self.result) >= threshold, 1) if hasattr(self, "freq"): filt = filt & np.any(np.abs(self.freq.T) >= min_freq, 1) else: filt = filt & (self.counts.sum() / self.counts.shape[0] > min_freq) idx = self.result.loc[filt].index if idx.shape[0] == 0: logger.warning("Empty matrix, try lowering the threshold") return if idx.shape[0] >= 100: logger.warning("The filtered matrix has more than 100 rows.") logger.warning( "It might be worthwhile to increase the threshold for visualization" ) if kind == "final": data = self.result elif kind == "freq": if hasattr(self, "freq"): data = self.freq.T cmap = "Reds" else: raise ValueError( "frequency plot only works with maelstrom output from clusters" ) elif kind in self.activity: data = self.activity[kind] if kind in ["hypergeom.count", "mwu.score"]: cmap = "Reds" else: raise ValueError("Unknown dtype") m = data.loc[idx] if "vmax" in kwargs: vmax = kwargs.pop("vmax") else: vmax = max(abs(np.percentile(m, 1)), np.percentile(m, 99)) if "vmin" in kwargs: vmin = kwargs.pop("vmin") else: vmin = -vmax if name: m["factors"] = [ self.motifs[n].format_factors( max_length=max_number_factors, html=False, include_indirect=indirect, extra_str=",..", ) for n in m.index ] m = m.set_index("factors") h, w = m.shape if figsize is None: figsize = (4 + m.shape[1] / 4, 1 + m.shape[0] / 3) fig = plt.figure(figsize=figsize) npixels = 30 g = GridSpec( 2, 1, height_ratios=(fig.get_figheight() * fig.dpi - npixels, npixels) ) ax1 = fig.add_subplot(g[0, :]) ax2 = fig.add_subplot(g[1, :]) ax2.set_title("aggregated z-score") dm = pdist(m, metric="correlation") hc = linkage(dm, method="ward") leaves = dendrogram(hc, no_plot=True)["leaves"] cg = sns.heatmap( m.iloc[leaves], ax=ax1, cbar_ax=ax2, cbar_kws={"orientation": "horizontal"}, cmap=cmap, linewidths=1, vmin=vmin, vmax=vmax, **kwargs, ) plt.setp(cg.axes.xaxis.get_majorticklabels(), rotation=90) plt.tight_layout() # cg.ax_col_dendrogram.set_visible(False) # plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) return cg
def visualize_maelstrom(outdir, sig_cutoff=3, pfmfile=None): config = MotifConfig() if pfmfile is None: pfmfile = config.get_default_params().get("motif_db", None) pfmfile = os.path.join(config.get_motif_dir(), pfmfile) mapfile = pfmfile.replace(".pwm", ".motif2factors.txt") if os.path.exists(mapfile): m2f = pd.read_csv( mapfile, sep="\t", names=["motif", "factors"], index_col=0, comment="#" ) m2f["factors"] = m2f["factors"].str[:50] else: motifs = [m.id for m in read_motifs(pfmfile)] m2f = pd.DataFrame({"factors": motifs}, index=motifs) sig_fname = os.path.join(outdir, "final.out.txt") df_sig = pd.read_table(sig_fname, index_col=0, comment="#") f = np.any(df_sig >= sig_cutoff, 1) vis = df_sig[f] if vis.shape[0] == 0: logger.info("No motifs reach the threshold, skipping visualization.\n") return # cluster rows row_linkage = hierarchy.linkage(pdist(vis, metric="euclidean"), method="complete") idx = hierarchy.leaves_list(row_linkage) plt.figure() vis = safe_join(vis, m2f).set_index("factors") # size of figure size = [2 + vis.shape[1] * 0.4, 1.8 + vis.shape[0] * 0.3] cg = sns.heatmap( vis.iloc[idx], cmap="viridis", yticklabels=True, cbar_kws={"orientation": "horizontal"}, ) _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0) plt.title("Motif Relevance") plt.tight_layout() plt.savefig(os.path.join(outdir, "motif.relevance.png"), dpi=300) freq_fname = os.path.join(outdir, "motif.freq.txt") if os.path.exists(freq_fname): df_freq = pd.read_table(freq_fname, index_col=0, comment="#") df_freq = df_freq.T vis_freq = df_freq.loc[vis.iloc[idx].index] vis_freq = safe_join(vis_freq, m2f).set_index("factors") plt.figure(figsize=size) cg = sns.heatmap( vis_freq, cmap="viridis", yticklabels=True, vmin=0, vmax=0.2, cbar_kws={"orientation": "horizontal"}, ) # idx = cg.dendrogram_row.reordered_ind _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0) plt.title("Motif Frequency") plt.tight_layout() plt.savefig(os.path.join(outdir, "motif.frequency.png"), dpi=300) plt.figure(figsize=size) bla = vis_freq.min(1) bla[bla < 0.01] = 0.01 cg = sns.heatmap( np.log2(vis_freq.apply(lambda x: x / bla, 0)), yticklabels=True, vmin=-5, vmax=5, cbar_kws={"orientation": "horizontal"}, ) # idx = cg.dendrogram_row.reordered_ind _ = plt.setp(cg.yaxis.get_majorticklabels(), rotation=0) plt.title("Motif Enrichment") plt.tight_layout() plt.savefig(os.path.join(outdir, "motif.enrichment.png"), dpi=300)
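Both heatmap helpers above reorder rows by hierarchical clustering before plotting. The pattern in isolation, with a random data frame standing in for the motif activity matrix:

import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

df = pd.DataFrame(np.random.RandomState(2).rand(8, 4))
row_linkage = hierarchy.linkage(pdist(df.values, metric="euclidean"), method="complete")
order = hierarchy.leaves_list(row_linkage)   # dendrogram leaf order
df_clustered = df.iloc[order]                # rows reordered for the heatmap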
def vector_dispersion(vectors): distances = pdist(vectors, metric="cosine") dispersion = np.arccos(1.0 - distances.max()) return dispersion
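vector_dispersion returns the angle (in radians) corresponding to the largest pairwise cosine distance; a small sanity check, assuming the function as defined above:

import numpy as np

# two orthogonal unit vectors have cosine distance 1, so the dispersion is pi/2
vectors = np.array([[1.0, 0.0], [0.0, 1.0]])
assert np.isclose(vector_dispersion(vectors), np.pi / 2)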
def _execute_map(cls, ctx, op): from scipy.spatial.distance import pdist, cdist inputs, device_id, xp = as_same_device( [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True) if xp is cp: # pragma: no cover raise NotImplementedError( '`pdist` does not support running on GPU yet') with device(device_id): inputs_iter = iter(inputs) a = next(inputs_iter) if op.b is not None: b = next(inputs_iter) else: b = None kw = dict() if op.p is not None: kw['p'] = op.p if op.w is not None: kw['w'] = next(inputs_iter) if op.v is not None: kw['V'] = next(inputs_iter) if op.vi is not None: kw['VI'] = next(inputs_iter) metric = op.metric if op.metric is not None else op.metric_func if b is None: # one input, pdist on same chunk dists = pdist(a, metric=metric, **kw) i_indices, j_indices = xp.triu_indices(a.shape[0], k=1) i_indices += op.a_offset j_indices += op.a_offset else: # two inputs, pdist on different chunks dists = cdist(a, b, metric=metric, **kw).ravel() mgrid = \ xp.mgrid[op.a_offset: op.a_offset + a.shape[0], op.b_offset: op.b_offset + b.shape[0]] i_indices, j_indices = mgrid[0].ravel(), mgrid[1].ravel() out_row_sizes = xp.arange(op.n - 1, -1, -1) out_row_cum_sizes = xp.empty((op.n + 1, ), dtype=int) out_row_cum_sizes[0] = 0 xp.cumsum(out_row_sizes, out=out_row_cum_sizes[1:]) indices = out_row_cum_sizes[i_indices] + j_indices - \ (op.n - out_row_sizes[i_indices]) # save as much memory as possible del i_indices, j_indices, out_row_sizes, out_row_cum_sizes out_cum_size = xp.cumsum(op.out_sizes) out = op.outputs[0] for i in range(len(op.out_sizes)): start_index = out_cum_size[i - 1] if i > 0 else 0 end_index = out_cum_size[i] to_filter = (indices >= start_index) & (indices < end_index) downside_indices = indices[to_filter] - start_index downside_dists = dists[to_filter] ctx[out.key, str(i)] = (downside_indices, downside_dists)
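The chunked map above reconstructs positions in the condensed distance vector from (row, column) pairs. For reference, the standard closed form for that condensed index over n points (i < j), checked against squareform; this is a generic property of pdist, not part of the operator above.

import numpy as np
from scipy.spatial.distance import pdist, squareform

def condensed_index(n, i, j):
    # index of pair (i, j), i < j, in the condensed vector returned by pdist
    return n * i - i * (i + 1) // 2 + (j - i - 1)

X = np.random.RandomState(3).rand(7, 2)
d = pdist(X)
D = squareform(d)
assert np.isclose(d[condensed_index(7, 2, 5)], D[2, 5])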
def create(patterns): rdm = RDM() rdm.utv = pdist(patterns, 'correlation') rdm.square = squareform(rdm.utv) return rdm
u_cols = list(set([l.rsplit("_", 1)[0] for l in list(counts.columns)])) cols = list(counts.columns) ss = [] for uc in u_cols: cs = [c for c in cols if c.startswith(uc)] ss.append(counts[cs].sum(axis=1).rename(uc)) dc = pd.concat(ss, axis=1) return dc collapsed_counts = collapse_counts(counts) lut = dict(zip(list(set([c[:3] for c in collapsed_counts.columns])), "rbg")) row_colors = [lut[c[:3]] for c in collapsed_counts.columns] # legend_TN = [mpatches.Patch(color=c, label=l) for (list(set([c[:3] for c in collapsed_counts.columns]))] distances = pdist(collapsed_counts.T.values, metric='euclidean') dist_matrix = squareform(distances) dist_df = pd.DataFrame( dist_matrix, columns=collapsed_counts.columns, index=collapsed_counts.columns) sns.clustermap(dist_df) pairings_05hr = [['col_c_05h', 'col_w_05h'], ['lym_c_05h', 'lym_w_05h'], ['cer_c_05h', 'cer_w_05h']] pairings_6hr = [['col_c_6h', 'col_w_6h'], ['lym_c_6h', 'lym_w_6h'], ['cer_c_6h', 'cer_w_6h']] pairings_to_lym_05hr = [['col_c_05h', 'lym_w_05h'], ['col_w_05h', 'lym_w_05h'],
# @Time : 2020/3/4 16:06 # @Author : hqjiang # @File : distance_points.py import numpy as np from scipy.spatial.distance import pdist, squareform x = np.array([[0, 1], [1, 0], [2, 0]]) print(x) # distance from every point in x to every other point in x d = squareform(pdist(x, 'euclidean')) # Euclidean distance print(d)
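For reference, pdist itself returns the condensed form (one entry per unordered pair); squareform expands it into the symmetric matrix printed above. Continuing with the same x:

# condensed form: 3 points give 3 pairwise distances d(0,1), d(0,2), d(1,2)
cond = pdist(x, 'euclidean')
print(cond)        # approx. [1.4142, 2.2361, 1.0]
print(len(cond))   # 3 == 3 * (3 - 1) / 2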