def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
    """Performs clustering according to the given parameters.

    @datatype - numeric/binary
    @row_distance/column_distance - see the DISTANCES variable
    @row_linkage/column_linkage - see the LINKAGES variable
    @axis - row/both
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        self.distance_vector = fastcluster.pdist(self.data, row_distance)

        if self.datatype == "numeric" and row_distance not in DISTANCES[self.datatype]:
            raise Exception("".join(["When clustering numeric data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))
        elif (self.datatype == "binary" or self.datatype == "nominal") and row_distance not in DISTANCES[self.datatype]:
            raise Exception("".join(["When clustering binary or nominal data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))

        self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

    if self.missing_value is not False:
        self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)

    self.column_clustering = []
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original or self.datatype == "nominal":
        self.data = self.original_data
def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
    """Performs clustering according to the given parameters.

    @data_type - numeric/binary
    @row_distance/column_distance - see the DISTANCES variable
    @row_linkage/column_linkage - see the LINKAGES variable
    @axis - row/both
    """
    print("Clustering rows:", row_distance, row_linkage)
    self.data_type = data_type
    self.clustering_axis = axis
    row_linkage = str(row_linkage)

    if row_linkage in RAW_LINKAGES:
        self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)
    else:
        self.distance_vector = fastcluster.pdist(self.data, row_distance)

        if data_type in DISTANCES and row_distance not in DISTANCES[data_type]:
            raise Exception("".join(["When clustering ", data_type, " data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])]))
        elif data_type not in DISTANCES:
            raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())]))

        self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

    self.column_clustering = []
    if axis == "both" and len(self.data[0]) > 2:
        print("Clustering columns:", column_distance, column_linkage)
        self.__cluster_columns__(column_distance, column_linkage)

    if self.write_original:
        self.data = self.original_data
    return
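# The two cluster_data() variants above rely on module-level lookup tables
# (DISTANCES, RAW_LINKAGES, LINKAGES) that are not included in these snippets.
# A minimal sketch of what such tables could look like; the names and values
# below are illustrative placeholders, not the project's actual definitions.
DISTANCES_EXAMPLE = {
    "numeric": ["euclidean", "sqeuclidean", "cityblock", "cosine", "correlation"],
    "binary": ["hamming", "jaccard", "dice", "russellrao"],
    "nominal": ["hamming"],
}
# "Raw" linkages would be the ones handed straight to fastcluster.linkage()
# together with a metric, instead of a precomputed pdist vector.
RAW_LINKAGES_EXAMPLE = ["ward", "centroid", "median"]
LINKAGES_EXAMPLE = ["single", "complete", "average", "weighted"] + RAW_LINKAGES_EXAMPLE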
def hierarchical(self, nclusters, linkage_method, noise=False):
    if noise:
        matrix = self.distance_matrix.add_noise()
    else:
        matrix = self.distance_matrix

    linkmat = linkage(squareform(matrix), linkage_method)
    linkmat_size = len(linkmat)

    if nclusters <= 1:
        br_top = linkmat[linkmat_size - nclusters][2]
    else:
        br_top = linkmat[linkmat_size - nclusters + 1][2]

    if nclusters >= len(linkmat):
        br_bottom = 0
    else:
        br_bottom = linkmat[linkmat_size - nclusters][2]

    threshold = 0.5 * (br_top + br_bottom)
    T = fcluster(linkmat, threshold, criterion='distance')
    return Partition(T)
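# hierarchical() above cuts the tree halfway between the merge that first
# produces nclusters groups and the next merge (which would reduce it to
# nclusters - 1). A self-contained sketch of that idea on toy data, checked
# against fcluster's 'maxclust' criterion (Partition and self.distance_matrix
# belong to the surrounding class and are not used here).
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist

rng = np.random.RandomState(0)
points = np.vstack([rng.normal(loc, 0.1, size=(10, 2)) for loc in (0.0, 5.0, 10.0)])
Z = linkage(pdist(points), 'average')

k = 3
heights = Z[:, 2]
# Cutting between the merge that leaves k groups and the following merge
# yields exactly k flat clusters (for k > 1).
threshold = 0.5 * (heights[-k] + heights[-(k - 1)])
by_threshold = fcluster(Z, threshold, criterion='distance')
by_maxclust = fcluster(Z, k, criterion='maxclust')
assert len(set(by_threshold)) == len(set(by_maxclust)) == k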
def saveLinkage(distanceMatrix):
    # link = linkage(distanceMatrix, 'ward')
    link = fastcluster.linkage(distanceMatrix, method='ward')  # D-distance matrix
    afile = open(structfolder + 'wardlinkage.pkl', 'wb')
    pickle.dump(link, afile)
    afile.close()
    return link
def test_basic_clustering(self): data = [ [1.0, 2.0], [2.0, 1.0], [2.1, 1.1], [2, 1.1], [1.0, 2.1], ] data = np.array(data) dist = fastcluster.pdist(data) result = fastcluster.linkage(dist).tolist() assert_that(int(result[0][0])).is_equal_to(0) assert_that(int(result[0][1])).is_equal_to(4) assert_that(result[0][2]).is_close_to(0.1, 0.00001) assert_that(int(result[0][3])).is_equal_to(2) assert_that(int(result[1][0])).is_equal_to(1) assert_that(int(result[1][1])).is_equal_to(3) assert_that(result[1][2]).is_close_to(0.1, 0.00001) assert_that(int(result[1][3])).is_equal_to(2) assert_that(int(result[2][0])).is_equal_to(2) assert_that(int(result[2][1])).is_equal_to(6) assert_that(result[2][2]).is_close_to(0.1, 0.00001) assert_that(int(result[2][3])).is_equal_to(3) assert_that(int(result[3][0])).is_equal_to(5) assert_that(int(result[3][1])).is_equal_to(7) assert_that(result[3][2]).is_close_to(1.34536, 0.00001) assert_that(int(result[3][3])).is_equal_to(5)
def run_entity_model(cdev, cprc): print '____________________________________________________' print 'running entity model' hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc) print 'removed', len(cdev)- len(hdev), 'documents', len(hdev), 'left' voc = build_voc(entcorp, 2) ent_vectorizer = CountVectorizer(vocabulary = voc) E = ent_vectorizer.fit_transform(hdev) Eclean, emapping = filter_rare(E, 0) E_dense = np.matrix(Eclean).astype('float') E_scaled = preprocessing.scale(E_dense) E_normalized = preprocessing.normalize(E_scaled, norm='l2') EMatrix = pairwise_distances(E_normalized, metric='cosine') EL = fastcluster.linkage(EMatrix, method='average') flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance') ec = organize_clusters(flat_eclust, th = 3) ecf = [] for cl in ec: ecf.append([hmapping[emapping[t]] for t in cl]) print 'detected', len(ecf), 'entity clusters' return ecf, voc
def run_ngram_model(cdev, cprc): print '____________________________________________________' print 'running n-gram model' wcorp = [] for i in cprc: wcorp.append(' '.join(cprc[i]['words'])) vectorizer = CountVectorizer(analyzer='word', binary=True, min_df=max(int(len(wcorp)*0.0005), 5), ngram_range=(2,3)) X = vectorizer.fit_transform(wcorp) Xclean, mapping = filter_rare(X) Xdense = np.matrix(Xclean).astype('float') X_scaled = preprocessing.scale(Xdense) X_normalized = preprocessing.normalize(X_scaled, norm='l2') textMatrix = pairwise_distances(X_normalized, metric='cosine') L = fastcluster.linkage(textMatrix, method='average') flat_textclust = hierarchy.fcluster(L, 0.5, 'distance') ttc = organize_clusters(flat_textclust) ncf = [] for cl in ttc: ncf.append([mapping[t] for t in cl]) print 'detected', len(ncf), 'n-gram clusters' return ncf
def cluster(self): # We cluster for each argument independently! retval = ClusterResult() curOffset = 0 argNum = 0 for symbolsForArg in self.contentProvider.getSourceAPISymbols(): D = self._calculateDistanceMatrix(symbolsForArg) curOffset = len(retval.clusterIdToDatapoint.keys()) if len(symbolsForArg) == 0: argNum += 1 continue if len(symbolsForArg) == 1: retval.register(curOffset, symbolsForArg[0], argNum) argNum += 1 continue Z = linkage(D, method=self.linkageMethod) clustering = fcluster(Z, self.maxDistInCluster, criterion = 'distance') retval.registerSet(symbolsForArg, clustering, curOffset, argNum) argNum += 1 return retval
def hclust(self): link_file = self.datafile + '.link.npy' if os.path.isfile(link_file) and os.path.getmtime(link_file) >= os.path.getmtime(self.datafile): self.link_matrix = np.load(link_file) else: blast_score = self.normalized.as_matrix() self.link_matrix = fastcluster.linkage(blast_score, method='average', metric='correlation', preserve_input=False) del blast_score np.save(link_file, self.link_matrix) self.gene_num = self.normalized.shape[0] self.node_num = self.gene_num + self.link_matrix.shape[0] self.parent_tree = np.array(np.arange(self.node_num)) self.leaf_num = np.array([1] * self.gene_num + [0] * (self.node_num - self.gene_num)) for i in range(self.link_matrix.shape[0]): assert(self.parent_tree[self.link_matrix[i, 0]] == int(self.link_matrix[i, 0])) assert(self.parent_tree[self.link_matrix[i, 1]] == int(self.link_matrix[i, 1])) assert(self.leaf_num[self.gene_num + i] == 0) self.parent_tree[self.link_matrix[i, 0]] = self.gene_num + i self.parent_tree[self.link_matrix[i, 1]] = self.gene_num + i self.leaf_num[i + self.gene_num] = self.leaf_num[self.link_matrix[i, 0]] + \ self.leaf_num[self.link_matrix[i, 1]]
def training_predict(X, K):
    """
    Get unique masks and cluster indices on the training set.

    Parameters
    ----------
    X : (N, F) ndarray of boolean

    Returns
    -------
    umasks : (UK, F) ndarray of bool
    cluster_ind : (N,) ndarray of int
        Each cluster ind is [0, K'), with K' <= K, or [0, UK)
        if K == -1 or K >= UK.
    """
    umasks = tc.mask_distribution.get_unique_masks(X)
    UK = umasks.shape[0]

    if K < 0 or K >= UK:
        cluster_ind = np.zeros(X.shape[0], dtype=int)
        for i in range(1, UK):
            cluster_ind[(X == umasks[i]).all(1)] = i
    else:
        Z = fastcluster.linkage(X, method="single", metric="hamming")
        cluster_ind = fcluster(Z, K, criterion="maxclust") - 1

    return umasks, cluster_ind
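# A stand-alone sketch of the clustering branch above: grouping boolean mask
# rows into at most K clusters with single linkage on Hamming distance. The
# tc.mask_distribution helpers used above belong to the surrounding project
# and are not reproduced here.
import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster

masks = np.array([[1, 0, 0],
                  [1, 0, 1],
                  [0, 1, 1],
                  [0, 1, 1],
                  [1, 0, 0]], dtype=bool)
K = 2
Z = fastcluster.linkage(masks, method="single", metric="hamming")
cluster_ind = fcluster(Z, K, criterion="maxclust") - 1   # zero-based labels, as above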
def writeClusters(results):
    threshold = 0.9

    results = numpy.fromiter(results, dtype=[('pairs', 'i8', 2), ('score', 'f4', 1,)])
    i_to_id, condensed_distances, N = condensedDistance(results)
    linkages = fastcluster.linkage(condensed_distances, method='ward')
    partition = hcluster.fcluster(linkages, threshold, criterion='inconsistent')

    clusters = {}
    for (i, cluster_id) in enumerate(partition):
        clusters.setdefault(cluster_id, []).append(i_to_id[i])

    i = 0
    for cluster in clusters.values():
        images = []
        for index in cluster:
            image_name = all_images[index]
            image_path = os.path.join(imagedir, image_name)
            cluster_path = 'clustered_images/{0}'.format(str(i))
            # There must be a better way to do this
            try:
                os.mkdir(cluster_path)
            except OSError:
                for f in os.listdir(cluster_path):
                    try:
                        os.remove(os.path.join(cluster_path, f))
                    except OSError:
                        pass
            print('writing %s' % image_name)
            with open(image_path, 'rb') as inp:
                with open(os.path.join('clustered_images', str(i), image_name), 'wb') as outp:
                    outp.write(inp.read())
        i += 1
def test_all():
    D2 = D.copy()
    for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']:
        Z2 = fc.linkage(D, method)
        if np.any(D2 != D):
            raise AssertionError('Input array was corrupted.')
        test(Z2, method)
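# Related to the corruption check above: fastcluster.linkage() accepts a
# preserve_input flag (several snippets here pass preserve_input=False).
# When it is False the condensed distance array may be overwritten in place
# to save memory, so keep a copy if the distances are still needed.
# A minimal illustration:
import numpy as np
import fastcluster

D_demo = np.random.rand(10 * 9 // 2)            # condensed distances for 10 points
D_backup = D_demo.copy()

fastcluster.linkage(D_demo, method='average', preserve_input=True)
assert np.array_equal(D_demo, D_backup)          # input left untouched

fastcluster.linkage(D_demo, method='average', preserve_input=False)
# D_demo can no longer be relied upon here; use D_backup instead.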
def hist_per_stagione(start=1992, end=2012): stagione=(all_labels > start) & (all_labels < end) dist_selected=dist[ix_(stagione,stagione)] Z=linkage(squareform(dist_selected),method='complete') n=choose_p(Z) c=fcluster(Z,n,criterion='maxclust')-1 label_anni=all_labels[stagione] #order by first appearance! first_appearance=[] for i in range(0,n): first_appearance.append(min(label_anni[c==i])) order1=[index for key,index in sorted(zip(first_appearance,range(0,n)))] order2=[index for key,index in sorted(zip(order1,range(0,n)))] order=array(order2) c=order[c] #draw scatter plot scatter(label_anni,c,s=100,c=c) #grid(b=True,axis='y') yticks(range(0,n+1)) xlim((min(label_anni)-0.5,max(label_anni)+0.5)) ax=gca() for i in range(1993,2011+1): ax.add_line(Line2D([i+7./12,i+7./12],[0,n+1],linestyle='--')) show()
def hierarchical_clust(d, cluster_method='average'):
    if VERBOSE:
        print 'Doing hierarchical clustering using fastcluster!'
    # some might say this function is redundant
    # d should be a distance vector
    Z = fastcluster.linkage(d, method=cluster_method)
    return Z
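# Possible usage of the wrapper above (it expects a condensed distance vector
# and assumes the module-level VERBOSE flag it checks is defined):
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster

points = np.random.rand(50, 4)
d = pdist(points)                                   # condensed distance vector
Z = hierarchical_clust(d, cluster_method='average')
labels = fcluster(Z, t=5, criterion='maxclust')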
def clusterAndDendogrgam(Metrix): out = fastcluster.linkage(Metrix, method="single", metric="euclidean", preserve_input=True) plt.plot() dend = augmented_dendrogram( out, p=30, truncate_mode=None, color_threshold=None, get_leaves=True, orientation="top", labels=None, count_sort=False, distance_sort=False, show_leaf_counts=True, no_plot=False, no_labels=False, color_list=None, leaf_font_size=None, leaf_rotation=None, leaf_label_func=None, no_leaves=False, show_contracted=False, link_color_func=None, ) plt.show() return out
def clusterHeatmap(df, title, row_label_map, col_label_map, colormap=my_cmap, cluster_rows=False, cluster_columns=False, cluster_data=None, row_dendrogram=False, column_dendrogram=False, width=30, height=20, vmin=-3, vmax=3, distmethod="correlation", colorbar=True, colorbar_shrink=0.2, label_values=False): cm = pylab.get_cmap(colormap) cm.set_bad("0.9") # do clustering if cluster_data is None: cluster_data = df # cluster the same data that we are plotting matplotlib.rcParams['figure.figsize'] = [width, height] # pylab.figsize(20, 10) pylab.title(title) # pylab.text(0,-5,str(datetime.date.today())) # ylabels = [genesym[geneid] for geneid in pt.axes[0][Z['leaves']]] # xlabels = pt.axes[1][cZ['leaves']] orderedVal = df if cluster_rows: distances = scipy.cluster.hierarchy.distance.pdist(cluster_data.values, distmethod) rowY = fastcluster.linkage(distances) rowZ = scipy.cluster.hierarchy.dendrogram(rowY, orientation='right', no_plot=True) orderedVal = df.reindex(index=df.axes[0][rowZ['leaves']]) if cluster_columns: coldist = scipy.cluster.hierarchy.distance.pdist(df.values.transpose(), distmethod) cY = scipy.cluster.hierarchy.linkage(coldist) cZ = scipy.cluster.hierarchy.dendrogram(cY, no_plot=True) orderedVal = orderedVal.reindex(columns=df.axes[1][cZ['leaves']]) # row labels if row_label_map is not None: pylab.yticks(range(0, len(orderedVal.index)), [row_label_map[i] for i in orderedVal.index]) else: pylab.yticks(range(0, len(orderedVal.index)), orderedVal.index) pylab.xticks(range(0, len(orderedVal.columns)), orderedVal.columns, rotation=90) if col_label_map is not None: pylab.xticks(range(0, len(orderedVal.columns)), [col_label_map[i] for i in orderedVal.columns]) if label_values: cmatrix = orderedVal.as_matrix() for x in range(cmatrix.shape[0]): for y in range(cmatrix.shape[1]): if cmatrix[x, y] >= 0: pylab.text(y, x, "%.1f" % cmatrix[x,y], horizontalalignment='center', verticalalignment='center') #orderedVal = orderedVal[:,] pylab.tick_params(direction="out") pylab.imshow(orderedVal, interpolation="nearest", cmap=cm, aspect='auto', norm=None, vmin=vmin, vmax=vmax) if colorbar: pylab.colorbar(shrink=colorbar_shrink)
def test_fastcluster_other_method(self):
    import fastcluster

    kws = self.default_kws.copy()
    kws["method"] = "average"
    linkage = fastcluster.linkage(self.x_norm.T, method="average", metric="euclidean")

    p = mat._DendrogramPlotter(self.x_norm, **kws)

    npt.assert_array_equal(p.linkage, linkage)
def heatmap_cor( x, vec, minval, maxval ): # Compute and plots heatmap & dendrogram. norm,corr,dist=analyse.all_corr(vec) print 'statrting to cluster...' fig = plt.figure(figsize=(8,8)) ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) z=fastcluster.linkage(dist, method='complete') #z=fastcluster.linkage(norm,metric='euclidean', method='ward') print 'clustering done, drawing the dendogram' Z1 = sch.dendrogram(z, labels=x,orientation='right') del norm del dist plt.yticks(fontsize=8) #ax1.set_yticks([]) ticks = ax1.get_xticks() #/ max(ax1.get_xticks()) ticks=map(float,ticks) ticks = ['%.2f' % (a/2.) for a in ticks] ax1.set_xticklabels(ticks) # Plot distance matrix. axmatrix = fig.add_axes([0.4,0.1,0.5,0.6]) axmatrix.set_xticks([]) axmatrix.set_yticks([]) axmatrix.xaxis.tick_top() axmatrix.set_frame_on(False) idx1 = Z1['leaves'] idx2 = Z1['leaves'] xx=[] for i in idx1: xx.append(x[int(i)]) D = corr[idx1,:] D = D[:,idx2] print 'heatmap' im = axmatrix.pcolor(D, cmap=plt.cm.RdYlBu,edgecolor='k',) plt.xticks(fontsize=5) plt.yticks([]) xx=[] for i in idx1: xx.append(x[int(i)]) #plt.yticks(np.arange(len(x)),xx,fontsize = 12) plt.xticks(np.arange(len(x)),xx) plt.xticks(rotation=90) plt.xticks(fontsize=8) axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) plt.colorbar(im, cax=axcolor) fig.show() return xx
def __cluster_columns__(self, column_distance, column_linkage):
    columns = list(zip(*self.data))
    self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(self.original_data, self.data_order)
    if self.header:
        self.header = self.__reorder_data__([self.header], self.data_order)[0]
    return
def test(): n = np.random.randint(2,100) # Part 1: distance matrix input N = n*(n-1)//2 D = np.random.rand(N) # Insert a single NaN value pos = np.random.randint(N) D[pos] = np.nan for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does not contain a NaN, but a NaN occurs # as an updated distance. for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.randint(2,13) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass return True
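# The test above relies on fastcluster raising FloatingPointError when a NaN
# appears in the input or as an updated distance. A small guard sketch for
# callers that prefer to fail early with their own message:
import numpy as np
import fastcluster

X = np.random.rand(20, 3)
X[3, 1] = np.nan                                 # simulate a corrupted observation

if not np.all(np.isfinite(X)):
    raise ValueError("input contains NaN or infinite values; clean it before clustering")
Z = fastcluster.linkage_vector(X, method='ward')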
def __init__(self, metric, trajectories, method='single', precomputed_values=None):
    """Initialize a hierarchical clusterer using the supplied distance metric and method.

    Method should be one of the fastcluster linkage methods, namely 'single',
    'complete', 'average', 'weighted', 'centroid', 'median', or 'ward'.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    trajectories : list of Trajectory
        Data to cluster
    method : {'single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward'}
    precomputed_values :
        Used internally to implement load_from_disk()

    Notes
    -----
    This is implemented with the fastcluster library, which can be downloaded
    from CRAN http://cran.r-project.org/web/packages/fastcluster/
    """
    if precomputed_values is not None:
        precomputed_z_matrix, traj_lengths = precomputed_values
        if isinstance(precomputed_z_matrix, np.ndarray) and precomputed_z_matrix.shape[1] == 4:
            self.Z = precomputed_z_matrix
            self.traj_lengths = traj_lengths
            return
        else:
            raise Exception('Something is wrong')

    if not isinstance(metric, metrics.AbstractDistanceMetric):
        raise TypeError('%s is not an abstract distance metric' % metric)
    if method not in self.allowable_methods:
        raise ValueError("%s not in %s" % (method, str(self.allowable_methods)))
    if isinstance(trajectories, md.Trajectory):
        trajectories = [trajectories]
    elif isinstance(trajectories, types.GeneratorType):
        trajectories = list(trajectories)

    self.traj_lengths = np.array([len(t) for t in trajectories])

    # self.ptrajs = [self.metric.prepare_trajectory(traj) for traj in self.trajectories]
    logger.info('Preparing...')
    flat_trajectory = concatenate_trajectories(trajectories)
    pflat_trajectory = metric.prepare_trajectory(flat_trajectory)

    logger.info('Getting all to all pairwise distance matrix...')
    dmat = metric.all_pairwise(pflat_trajectory)
    logger.info('Done with all2all')

    self.Z = fastcluster.linkage(dmat, method=method, preserve_input=False)
    logger.info('Got Z matrix')
def cluster(data_vecs, method='average', metric='cosine', save=True):
    print "Calculating the linkage matrix, metric = {0}, method = {1}".format(metric, method)
    links = fc.linkage(data_vecs, metric=metric, method=method)
    # if save:
    #     print "Saving the model to: results/" + filename + "/linkage"
    #     file = open('results/' + filename + '_linkage', 'wb')
    #     pickle.dump(links, file)
    #     file.close()
    return links
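# The helper above hands the raw observation matrix straight to fastcluster,
# which computes the pairwise cosine distances itself. An equivalent two-step
# form with an explicitly precomputed condensed vector (the commented-out
# pickling above is project-specific and omitted):
import numpy as np
import fastcluster
from scipy.spatial.distance import pdist

data_vecs = np.random.rand(100, 20)
links_direct = fastcluster.linkage(data_vecs, metric='cosine', method='average')
links_precomputed = fastcluster.linkage(pdist(data_vecs, metric='cosine'), method='average')
# Both should describe the same tree, up to floating-point and tie-breaking differences.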
def test_fastcluster_non_euclidean(self):
    import fastcluster

    kws = self.default_kws.copy()
    kws['metric'] = 'cosine'
    kws['method'] = 'average'
    linkage = fastcluster.linkage(self.x_norm.T, method=kws['method'], metric=kws['metric'])

    p = mat._DendrogramPlotter(self.x_norm, **kws)

    npt.assert_array_equal(p.linkage, linkage)
def cluster(self, cluster_count = None, cluster_radius = 10.0): x = self.x nx = x.shape[0] D=pdist(x) l = fc.linkage(D,'single') l0 = numpy.hstack((x,x, numpy.zeros((nx,1)), numpy.ones((nx,1)))) self._ct = ClusterTree(l0, l) self._ct.find_groups(cluster_radius) self._ct.sort_groups()
def buildClusters(featMatrix): distanceMatrix = pdist(featMatrix[:,:], metric='jaccard') pickleSave(structsfolder + 'distanceMatrix.pkl', distanceMatrix) # ed = euclidean_distances(featMatrix[1:100,:], featMatrix[1:100,:]) linkage = fastcluster.linkage(distanceMatrix, method='ward') # D-distance matrix # fc = fcluster(link, 30, criterion='maxclust') #R = dendrogram(link, color_threshold=0.3, leaf_font_size=6) #pylab.savefig( "/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/featMatrix.png" ) distanceMatrix = None pickleSave(structsfolder + 'linkage.pkl', linkage)
def perform_clustering(self, kwargs):
    """
    Performs the hierarchical clustering step and the clustering step.
    If the hierarchical matrix is given, then it just calculates the clusters
    for a given cutoff. If we call the algorithm a second time it will use
    the last matrix.
    """
    """
    Gets a condensed matrix and calculates the clustering. One can use diverse
    methodologies to do this clustering...
    With preserve_input=False the matrix is destroyed while clustering, but it
    saves memory.
    The metric is not needed in this case, as we are giving the function the
    calculated matrix.
    The method is the method used to determine distances when fusing clusters.
    Methods are described in:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    """
    try:
        cutoff = kwargs["cutoff"]
    except KeyError:
        cutoff = None

    try:
        hie_mat = kwargs["hie_mat"]
    except KeyError:
        hie_mat = None

    try:
        method = kwargs["method"]
    except KeyError:
        method = 'complete'

    if hie_mat != None:
        self.hie_mat = hie_mat
        # print "[HIERARCHICAL] Matrix provided."
    else:
        if self.hie_mat == None:
            # self.hie_mat = fast_hcluster.linkage(condensed_matrix, method='centroid', metric='euclidean', preserve_input=False)
            # print "[HIERARCHICAL] Calculating Matrix"
            # self.hie_mat = fastclust.linkage(self.condensed_matrix.get_data(), method = method)
            self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method=method)
        # else:
        #     print "[HIERARCHICAL] Matrix was already stored"

    algorithm_details = "Hierarchical with " + method + " method (cutoff = " + str(cutoff) + ")"

    if cutoff != None:
        # Then apply the cutoff, this doesn't work much as expected
        # print "[HIERARCHICAL] getting clustering." + algorithm_details
        group_list = hcluster.fcluster(self.hie_mat, cutoff)
        # print "[HIERARCHICAL] Clustering done." + algorithm_details
        # Then let's generate the clusters
        clusters = gen_clusters_from_class_list(group_list)
        return Clustering(clusters, details=algorithm_details)
    else:
        return None
def complete_linkage(dm):
    """
    Perform complete linkage hierarchical clustering on a distance matrix.

    Args:
        dm (numpy.array): Distance matrix

    Returns:
        (object): fastcluster complete linkage hierarchical clustering object
    """
    return linkage(dm, 'complete')
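# Usage sketch for complete_linkage() above (assuming the bare `linkage` it
# calls is fastcluster's or SciPy's linkage). Both libraries treat a 2-D input
# as observation vectors, so a square distance matrix is condensed with
# squareform first; if the caller already has a condensed vector it can be
# passed as-is.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import fcluster

points = np.random.rand(30, 5)
square_dm = squareform(pdist(points))        # full, symmetric distance matrix
Z = complete_linkage(squareform(square_dm))  # condense back before linkage
labels = fcluster(Z, t=4, criterion='maxclust')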
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200, method='ap'): """Define clusters given the similarity matrix and the threshold.""" n, labels = connected_components(similarity_matrix, directed=False) prev_max_clust = 0 print("connected components: %d" % n) clusters = labels.copy() if method == 'dbscan': ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1) if method == 'ap': ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter, preference='median') for i in range(n): idxs = np.where(labels == i)[0] if idxs.shape[0] > 1: sm = similarity_matrix[idxs][:, idxs] sm += sm.T + scipy.sparse.eye(sm.shape[0]) # Hierarchical clustering if method == 'hc': dists = squareform(1 - sm.toarray()) links = fastcluster.linkage(dists, method='ward') try: clusters_ = fcluster(links, threshold, 'distance') except ValueError as err: logging.critical(err) clusters_ = np.zeros(1, dtype=int) # DBSCAN elif method == 'dbscan': db = ap.fit(1. - sm.toarray()) # Number of clusters in labels, ignoring noise if present. clusters_ = db.labels_ # n_clusters_ = len(set(clusters_)) - int(0 in clusters_) # AffinityPropagation # ap = AffinityPropagation(affinity='precomputed') elif method == 'ap': db = ap.fit(sm) clusters_ = db.labels_ else: raise ValueError("clustering method %s unknown" % method) if np.min(clusters_) == 0: clusters_ += 1 clusters_ += prev_max_clust clusters[idxs] = clusters_ prev_max_clust = max(clusters_) else: # connected component contains just 1 element prev_max_clust += 1 clusters[idxs] = prev_max_clust return np.array(extra.flatten(clusters))
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them into a list of records
    that all refer to the same entity based on a given threshold

    Keyword arguments:
        threshold -- number between 0 and 1 (default is .5). Lowering the number
                     will increase precision, raising it will increase recall
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            (i_to_id, condensed_distances) = condensedDistance(sub_graph)
            N = max(i_to_id) + 1

            linkage = fastcluster.linkage(condensed_distances, method='centroid', preserve_input=False)

            partition = hcluster.fcluster(linkage, threshold, criterion='distance')

            clusters = {}
            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            cophenetic_distances = hcluster.cophenet(linkage)

            for cluster_id, items in clusters.iteritems():
                if len(items) > 1:
                    score = clusterConfidence(items, cophenetic_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[item] for item in items), 1 - score)

            cluster_id += max(partition) + 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = tuple(ids), score
            cluster_id += 1

    return clustering.values()
def _hclust(self, nclusters, method, noise=False):
    """
    :param nclusters: Number of clusters to return
    :param method: single, complete, average, ward, weighted, centroid or median
        (http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)
    :param noise: Add Gaussian noise to the distance matrix prior to clustering
        (bool, default=False)
    :return: Partition object describing clustering
    """
    matrix = self.get_dm(noise)
    linkmat = fastcluster.linkage(squareform(matrix), method)
    return _hclust(linkmat, nclusters)
from machinelearning import datasetselection, featureselection import machinelearning.dataclasses as dc import pickle from operator import itemgetter import matplotlib.pyplot as plt import scipy.cluster.hierarchy as hierarchy import scipy.spatial.distance as distance import fastcluster # load distance matrix with open('gene_gene_matrix_euclidean_distance_from_projection.pickle', 'rb') as fr: gene_gene = pickle.load(fr) # prefer ward linkage for euclidean distance or at least this case lnk = fastcluster.linkage(distance.squareform(gene_gene.matrix, checks=False), 'ward') si = hierarchy.leaves_list(lnk).astype('int64') # load projection with open('gene_atb_matrix_2d_dnn_projection.pickle', 'rb') as fr: gene_proj = pickle.load(fr) if ~(gene_proj.rowlabels == gene_gene.rowlabels).all(): raise ValueError('genes not aligned') gene_proj.reorder(si, 0) ordered_genes = gene_proj.rowlabels.copy() del gene_gene, lnk, si # select datasets dataset_info = datasetselection.finddatasets(getalllevels=True) included_datasetabbrevs = { 'clinvar', 'dbgap_cleaned', 'gad', 'gadhighlevel_cleaned', 'gobp', 'gocc',
def diff_exp_clusters(cluster_expression_df, cluster_sizes, file_format): n_clusters = len(cluster_sizes) cluster_sum_umi = np.vstack([ cluster_sizes[c] * cluster_expression_df[f'Cluster {c} mean UMI'].values for c in range(n_clusters) ]) cluster_ssq_umi = np.vstack([ cluster_sizes[c] * (cluster_expression_df[f'Cluster {c} std UMI'].values**2 + cluster_expression_df[f'Cluster {c} mean UMI'].values**2) for c in range(n_clusters) ]) Z = fastcluster.linkage(cluster_sum_umi, method='average', metric='cosine') fig = matplotlib.figure.Figure(figsize=(12, 12)) ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) scipy.cluster.hierarchy.dendrogram(Z, ax=ax, color_threshold=0, above_threshold_color='grey') ax.set_title('Hierarchical structure of cell-type clusters') ax.set_xlabel('Cluster Label') ax.tick_params(labelleft='off') FigureCanvasAgg(fig).print_figure(file_format.format('dendrogram', 'png')) root, rd = scipy.cluster.hierarchy.to_tree(Z, rd=True) def de(lbl_1, lbl_2, group1, group2): print(f'Comparing {group1} to {group2}') group1_n_cells = sum(cluster_sizes[c] for c in group1) group2_n_cells = sum(cluster_sizes[c] for c in group2) group1_mean = cluster_sum_umi[group1, :].sum(axis=0) / group1_n_cells group2_mean = cluster_sum_umi[group2, :].sum(axis=0) / group2_n_cells mean_diff = group1_mean - group2_mean group1_var = (cluster_ssq_umi[group1, :].sum(axis=0) / group1_n_cells - group1_mean**2) group2_var = (cluster_ssq_umi[group2, :].sum(axis=0) / group2_n_cells - group2_mean**2) pooled_sd = np.sqrt(group1_var / group1_n_cells + group2_var / group2_n_cells) z_scores = np.zeros_like(pooled_sd) nz = pooled_sd > 0 z_scores[nz] = np.nan_to_num(mean_diff[nz] / pooled_sd[nz]) # t-test p_vals = np.clip( (1 - stats.norm.cdf(np.abs(z_scores))) * 2 * z_scores.shape[0], 0, 1) df = pd.DataFrame(OrderedDict([('z', z_scores), ('p', p_vals), ('group1', group1_mean), ('group2', group2_mean)]), index=cluster_expression_df.index) df = df[df['p'] < 0.001] df['diff'] = df['group1'] - df['group2'] df.sort_values('diff', ascending=False, inplace=True) name = f'differential_gene_expression_{lbl_1}_v_{lbl_2}' df.to_csv(file_format.format(name, 'csv')) for i in range(0, 2 * n_clusters - 1): if i >= n_clusters: left_child = rd[i].get_left() left_clusters = (left_child.pre_order(lambda x: x.id)) right_child = rd[i].get_right() right_clusters = (right_child.pre_order(lambda x: x.id)) # don't calculate if it's redundant with a 1-vs-all comp if i == 2 * n_clusters - 2 and (len(left_clusters) == 1 or len(right_clusters) == 1): continue de(left_child.id, right_child.id, left_clusters, right_clusters) if i < 2 * n_clusters - 2: below = rd[i].pre_order(lambda x: x.id) above = [j for j in range(len(cluster_sizes)) if j not in below] # don't calculate redundant comparison if len(above) == 1: continue de(i, 'all', below, above) group_list = [(i, rd[i].pre_order(lambda x: x.id)) for i in range(0, 2 * n_clusters - 1)] group_list[-1] = ('total', group_list[-1][1]) return group_list
def test_adjusted_rand_performance(self): # Arrange n = 100 np.random.seed(seed = 8455624) x = np.random.normal(n, 2, (n, 2)) A = linkage(x, 'centroid') B = linkage(x, 'ward') # Act similarity_times = [] sklearn_times = [] fcluster_times = [] for repitition in range(100): start = perf_counter() metrics = similarity_metrics(A, B) ar_similarity = metrics.adjusted_rand() end = perf_counter() similarity_times.append(end-start) ar_sklearn = [] sklearn_time = 0 fcluster_time = 0 excluded_results = 0 for i in range(n - 1, 1, -1): start = perf_counter() fcluster_a = fcluster(A, i, 'maxclust') fcluster_b = fcluster(B, i, 'maxclust') end = perf_counter() fcluster_time += (end - start) start = perf_counter() ar = adjusted_rand_score(fcluster_a, fcluster_b) end = perf_counter() sklearn_time += (end - start) # fcluster takes maxclust rather than an exact number of clusters # most of the time it will create exactly maxclust, but for the occassions # that it doesn't the results are are not comparable so ignore them if (len(np.unique(fcluster_a)) != i) or (len(np.unique(fcluster_b)) != i): excluded_results += 1 ar_sklearn.append(ar_similarity[len(ar_sklearn)]) else: ar_sklearn.append(ar) sklearn_times.append(sklearn_time) fcluster_times.append(fcluster_time) ar_sklearn = np.array(ar_sklearn) idx = ar_sklearn != np.nan # Assert self.assertEqual(len(ar_sklearn), len(ar_similarity)) assert_almost_equal(ar_similarity, ar_sklearn) self.assertEqual(4, excluded_results) # double-check that we haven't excluded everything print("\nSimilarity average time: ", np.average(similarity_times)) print("\nSklearn average time: ", np.average(sklearn_times)) print("\nFCluster average time: ", np.average(fcluster_times))
plt.title('Image show of the correlation lattice - longitudinal') plt.figure() plt.imshow(dfC[0, 1::2, :]) plt.colorbar() plt.title('Image show of the correlation lattice - lattitudinal') # plt.figure() # plt.imshow(y) # plt.title('Distance matrix') # render_component_single(gfC.d[0, :, :], gfC.lats, gfC.lons, False, None, "Neighbor correlation") print("Clustering ...") plt.figure() Z = fastcluster.linkage(ytri, method = 'single') print("Plotting dendrogram ...") dendrogram(Z, 7, 'level') max_d = np.amax(Z[:,2]) print("Maximum distance is %g" % max_d) my_d = max_d / 2 cont = True while cont: f = fcluster(Z, my_d, 'distance') print f.shape, my_d if np.amax(f) > 30: my_d = (max_d + my_d) * 0.5 elif np.amax(f) < 10: my_d = my_d - (max_d - my_d) / max_d else:
def consensus(self, k, density_threshold_str='0.5', local_neighborhood_size=0.30, show_clustering=False, skip_density_and_return_after_stats=False, close_clustergram_fig=True): merged_spectra = load_df_from_npz(self.paths['merged_spectra'] % k) norm_counts = sc.read(self.paths['normalized_counts']) if skip_density_and_return_after_stats: density_threshold_str = '2' density_threshold_repl = density_threshold_str.replace('.', '_') density_threshold = float(density_threshold_str) n_neighbors = int(local_neighborhood_size * merged_spectra.shape[0] / k) # Rescale topics such to length of 1. l2_spectra = (merged_spectra.T / np.sqrt( (merged_spectra**2).sum(axis=1))).T if not skip_density_and_return_after_stats: # Compute the local density matrix (if not previously cached) topics_dist = None if os.path.isfile(self.paths['local_density_cache'] % k): local_density = load_df_from_npz( self.paths['local_density_cache'] % k) else: # first find the full distance matrix topics_dist = squareform(fast_euclidean(l2_spectra.values)) # partition based on the first n neighbors partitioning_order = np.argpartition(topics_dist, n_neighbors + 1)[:, :n_neighbors + 1] # find the mean over those n_neighbors (excluding self, which has a distance of 0) distance_to_nearest_neighbors = topics_dist[ np.arange(topics_dist.shape[0])[:, None], partitioning_order] local_density = pd.DataFrame( distance_to_nearest_neighbors.sum(1) / (n_neighbors), columns=['local_density'], index=l2_spectra.index) save_df_to_npz(local_density, self.paths['local_density_cache'] % k) del (partitioning_order) del (distance_to_nearest_neighbors) density_filter = local_density.iloc[:, 0] < density_threshold l2_spectra = l2_spectra.loc[density_filter, :] kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1) kmeans_model.fit(l2_spectra) kmeans_cluster_labels = pd.Series(kmeans_model.labels_ + 1, index=l2_spectra.index) # Find median usage for each gene across cluster median_spectra = l2_spectra.groupby(kmeans_cluster_labels).median() # Normalize median spectra to probability distributions. 
median_spectra = (median_spectra.T / median_spectra.sum(1)).T # Compute the silhouette score stability = silhouette_score(l2_spectra.values, kmeans_cluster_labels, metric='euclidean') # Obtain the reconstructed count matrix by re-fitting the usage matrix and computing the dot product: usage.dot(spectra) refit_nmf_kwargs = yaml.load(open(self.paths['nmf_run_parameters']), Loader=yaml.FullLoader) refit_nmf_kwargs.update( dict(n_components=k, H=median_spectra.values, update_H=False)) _, rf_usages = self._nmf(norm_counts.X, nmf_kwargs=refit_nmf_kwargs) rf_usages = pd.DataFrame(rf_usages, index=norm_counts.obs.index, columns=median_spectra.index) rf_pred_norm_counts = rf_usages.dot(median_spectra) # Compute prediction error as a frobenius norm if sp.issparse(norm_counts.X): prediction_error = ((norm_counts.X.todense() - rf_pred_norm_counts)**2).sum().sum() else: prediction_error = ((norm_counts.X - rf_pred_norm_counts)**2).sum().sum() consensus_stats = pd.DataFrame( [k, density_threshold, stability, prediction_error], index=[ 'k', 'local_density_threshold', 'stability', 'prediction_error' ], columns=['stats']) if skip_density_and_return_after_stats: return consensus_stats save_df_to_npz( median_spectra, self.paths['consensus_spectra'] % (k, density_threshold_repl)) save_df_to_npz( rf_usages, self.paths['consensus_usages'] % (k, density_threshold_repl)) save_df_to_npz( consensus_stats, self.paths['consensus_stats'] % (k, density_threshold_repl)) save_df_to_text( median_spectra, self.paths['consensus_spectra__txt'] % (k, density_threshold_repl)) save_df_to_text( rf_usages, self.paths['consensus_usages__txt'] % (k, density_threshold_repl)) # Compute gene-scores for each GEP by regressing usage on Z-scores of TPM tpm = sc.read(self.paths['tpm']) tpm_stats = load_df_from_npz(self.paths['tpm_stats']) if sp.issparse(tpm.X): norm_tpm = (np.array(tpm.X.todense()) - tpm_stats['__mean'].values) / tpm_stats['__std'].values else: norm_tpm = (tpm.X - tpm_stats['__mean'].values) / tpm_stats['__std'].values usage_coef = fast_ols_all_cols(rf_usages.values, norm_tpm) usage_coef = pd.DataFrame(usage_coef, index=rf_usages.columns, columns=tpm.var.index) save_df_to_npz( usage_coef, self.paths['gene_spectra_score'] % (k, density_threshold_repl)) save_df_to_text( usage_coef, self.paths['gene_spectra_score__txt'] % (k, density_threshold_repl)) # Convert spectra to TPM units, and obtain results for all genes by running last step of NMF # with usages fixed and TPM as the input matrix norm_usages = rf_usages.div(rf_usages.sum(axis=1), axis=0) refit_nmf_kwargs.update(dict(H=norm_usages.T.values, )) _, spectra_tpm = self._nmf(tpm.X.T, nmf_kwargs=refit_nmf_kwargs) spectra_tpm = pd.DataFrame(spectra_tpm.T, index=rf_usages.columns, columns=tpm.var.index) save_df_to_npz( spectra_tpm, self.paths['gene_spectra_tpm'] % (k, density_threshold_repl)) save_df_to_text( spectra_tpm, self.paths['gene_spectra_tpm__txt'] % (k, density_threshold_repl)) if show_clustering: if topics_dist is None: topics_dist = squareform(fast_euclidean(l2_spectra.values)) # (l2_spectra was already filtered using the density filter) else: # (but the previously computed topics_dist was not!) 
topics_dist = topics_dist[ density_filter.values, :][:, density_filter.values] spectra_order = [] for cl in sorted(set(kmeans_cluster_labels)): cl_filter = kmeans_cluster_labels == cl if cl_filter.sum() > 1: cl_dist = squareform(topics_dist[cl_filter, :][:, cl_filter]) cl_dist[ cl_dist < 0] = 0 #Rarely get floating point arithmetic issues cl_link = linkage(cl_dist, 'average') cl_leaves_order = leaves_list(cl_link) spectra_order += list( np.where(cl_filter)[0][cl_leaves_order]) else: ## Corner case where a component only has one element spectra_order += list(np.where(cl_filter)[0]) from matplotlib import gridspec import matplotlib.pyplot as plt width_ratios = [0.5, 9, 0.5, 4, 1] height_ratios = [0.5, 9] fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios))) gs = gridspec.GridSpec(len(height_ratios), len(width_ratios), fig, 0.01, 0.01, 0.98, 0.98, height_ratios=height_ratios, width_ratios=width_ratios, wspace=0, hspace=0) dist_ax = fig.add_subplot(gs[1, 1], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) D = topics_dist[spectra_order, :][:, spectra_order] dist_im = dist_ax.imshow(D, interpolation='none', cmap='viridis', aspect='auto', rasterized=True) left_ax = fig.add_subplot(gs[1, 0], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) left_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape( -1, 1), interpolation='none', cmap='Spectral', aspect='auto', rasterized=True) top_ax = fig.add_subplot(gs[0, 1], xscale='linear', yscale='linear', xticks=[], yticks=[], xlabel='', ylabel='', frameon=True) top_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape( 1, -1), interpolation='none', cmap='Spectral', aspect='auto', rasterized=True) hist_gs = gridspec.GridSpecFromSubplotSpec(3, 1, subplot_spec=gs[1, 3], wspace=0, hspace=0) hist_ax = fig.add_subplot(hist_gs[0, 0], xscale='linear', yscale='linear', xlabel='', ylabel='', frameon=True, title='Local density histogram') hist_ax.hist(local_density.values, bins=np.linspace(0, 1, 50)) hist_ax.yaxis.tick_right() xlim = hist_ax.get_xlim() ylim = hist_ax.get_ylim() if density_threshold < xlim[1]: hist_ax.axvline(density_threshold, linestyle='--', color='k') hist_ax.text(density_threshold + 0.02, ylim[1] * 0.95, 'filtering\nthreshold\n\n', va='top') hist_ax.set_xlim(xlim) hist_ax.set_xlabel( 'Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering' % (sum(~density_filter), len(density_filter), 100 * (~density_filter).mean())) fig.savefig(self.paths['clustering_plot'] % (k, density_threshold_repl), dpi=250) if close_clustergram_fig: plt.close(fig)
def generate_heatmap( dataframe: pd.DataFrame, color_dataframe: pd.DataFrame, histo_dataframe: pd.DataFrame, color_map: dict = None, cluster: bool = True, figsize: tuple = (10, 15)) -> plt.Figure: fig = plt.figure(figsize=figsize) if cluster: linkage = fastcluster.linkage(dataframe.T, "complete", metric="correlation", preserve_input=True) dendrogram_row_ratio = 2 else: linkage = None # Make row smaller without dendrogram dendrogram_row_ratio = 0.1 max_rows = 4 if color_map is not None else 3 # The bar plot is actually on the third row # FIXME: Handle long labels (ratios) if color_map is not None: bar_ratio = 0.25 * len(color_map) height_ratios = [dendrogram_row_ratio, 0.25, bar_ratio, 15] else: height_ratios = [dendrogram_row_ratio, 0.25, 15] gs = grid.GridSpec(max_rows, 2, height_ratios=height_ratios, width_ratios=[0.2, 15]) dendro_ax = fig.add_subplot(gs[0, 1], axisbg="white") # Dendrogram plt.setp(dendro_ax.get_yticklabels(), visible=False) pathway_ax = fig.add_subplot(gs[-1, 0]) # Pathway heatmap_ax = fig.add_subplot(gs[-1, 1], sharey=pathway_ax) # Heatmap # Con gridspec e' necessario fare questo in modo che gli assi Y # non siano visibili plt.setp(heatmap_ax.get_yticklabels(), visible=False) if linkage is not None: leaf_ax = fig.add_subplot(gs[1, 1], sharex=dendro_ax) dendro = sch.dendrogram( linkage, ax=dendro_ax, no_labels=False, labels=dataframe.columns, leaf_rotation=90, ) # Reorder dataframe according to the labels in the leaves dataframe = dataframe[dendro["ivl"]] # Leaf node labels # Put labels in the right order! histo_dataframe = histo_dataframe.loc[dendro["ivl"]] # TRICK: Given that printing labels screws layout because they add an # x axis, we generate a specific axis only with the text, iterating on # the locations of the labels of the dendrogram. After the new text is # in place, we remove the labels from the dendrogram. for leafname, leafcoord in zip(dendro["ivl"], dendro_ax.xaxis.get_ticklocs()): leaf_ax.text(leafcoord, 0.99, leafname, rotation=90, horizontalalignment="center") else: set_axis_parameters(heatmap_ax, dataframe, False) leaf_ax = fig.add_subplot(gs[1, 1], sharex=heatmap_ax) dataframe = dataframe.loc[:, histo_dataframe.index] set_axis_parameters(leaf_ax, dataframe, False) for index, leafcoord in enumerate(leaf_ax.xaxis.get_ticklocs()): leaf_ax.text(leafcoord, 0.99, dataframe.columns[index], rotation=90, horizontalalignment="center") clean_axis(leaf_ax) leaf_ax.grid(False) leaf_ax.axis('off') clean_axis(dendro_ax) if color_map is not None: subgrids = len(color_map) gs_inside = grid.GridSpecFromSubplotSpec( subgrids, 1, subplot_spec=gs[2, 1], height_ratios=[1 for item in color_map]) bars = list() for index, group in enumerate(sorted(color_map)): column = color_map[group] bar_ax = fig.add_subplot(gs_inside[index]) clean_axis(bar_ax) create_colorbar(histo_dataframe, bar_ax, False, column, labels=False) bar_ax.text(-0.25, 0.5, group, horizontalalignment="right", verticalalignment="center") bars.append(bar_ax) create_colorbar(color_dataframe, pathway_ax) cmap, norm = create_colormap() dataframe = dataframe.loc[color_dataframe.index] heatmap1 = heatmap_ax.pcolor(dataframe, cmap=cmap, edgecolors="black", alpha=1, norm=norm) set_axis_parameters(heatmap_ax, dataframe, False) cax = fig.add_axes([-0.05, 1.025, 0.15, 0.025]) cbar = fig.colorbar(heatmap1, cax=cax, orientation="horizontal", ticks=range(9)) cbar.solids.set_edgecolor("face") gs.tight_layout(fig) return fig, cax
def test_dendrogram_plotting():
    c.Z = linkage(iris['data'], c.algorithm)
    labels = np.random.rand(1, iris['data'].shape[0])[0]
    c.dendrogram = c._calculate_dendrogram(labels)
    assert len(labels) == len(c.dendrogram['ivl'])
try: boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k] except: boosted_wdfVoc[k] = wdfVoc[k] print "sorted wdfVoc*boost_entity:" print sorted( ((v,k) for k,v in boosted_wdfVoc.iteritems()), reverse=True) ''' #Hclust: fast hierarchical clustering with fastcluster #X is samples by features #distMatrix is sample by samples distances distMatrix = pairwise_distances(X_normalized, metric='cosine') #cluster tweets print "fastcluster, average, cosine" L = fastcluster.linkage(distMatrix, method='average') #for dt in [0.3, 0.4, 0.5, 0.6, 0.7]: #for dt in [0.5]: dt = 0.5 print "hclust cut threshold:", dt # indL = sch.fcluster(L, dt, 'distance') indL = sch.fcluster(L, dt * distMatrix.max(), 'distance') #print "indL:", indL freqTwCl = Counter(indL) print "n_clusters:", len(freqTwCl) print(freqTwCl) # print "silhoutte: ", metrics.silhouette_score(distMatrix, indL, metric="precomputed") allowSiloutte = False for freqTwClkey, freqTwClCount in freqTwCl.iteritems(): if (freqTwClCount > 1):
def cluster_array_to_k_groups(R, k):
    Z = fastcluster.linkage(R, method='average', metric='euclidean', preserve_input=True)
    import tree
    tr = tree.Tree(Z=Z)
    X = tr.representatives(n_picks=k, l_keep_members=True)
    return X
img_ds.RasterCount), # number of bands gdal_array.GDALTypeCodeToNumericTypeCode( img_ds.GetRasterBand(1).DataType)) # data type code #print img.shape # warning: that assumed that the raster bands were all the same type (should be true) # reshape the image band by band for b in range(img.shape[2]): img[:, :, b] = img_ds.GetRasterBand(b + 1).ReadAsArray() # reshape image again to match expected format for scikit-learn new_shape = (img.shape[0] * img.shape[1], img.shape[2]) X = img[:, :, :img.shape[2]].reshape(new_shape) # use fastcluster.linkage instead of scipy.cluster.hierarchy.linkage print "calculating linkage.." Z = fc.linkage(X, 'average') # https://en.wikipedia.org/wiki/UPGMA print "calculating dendrogram.." fig = plt.figure(figsize=(10, 10)) # 25, 10 plt.title('hierarchical clustering dendrogram') rotate = False plt.ylabel('distance' if (not rotate) else 'index') plt.xlabel('index' if (not rotate) else 'distance') dn = dendrogram( Z, #truncate_mode='lastp', #p = n_clusters, leaf_rotation=0. if rotate else 90., show_contracted=True, orientation='right' if rotate else 'top',
def cluster(self, method='average', metric='euclidean', l_row=True, l_col=True):
    if l_row:
        self.Zr = fastcluster.linkage(self.data, method=method, metric=metric, preserve_input=True)
        #left_dendrogram=clst.dendrogram(Zr, orientation='left')
    if l_col:
        self.Zc = fastcluster.linkage(self.data.T, method=method, metric=metric, preserve_input=True)
def prog_linkage(X, n_cluster):
    hclust = linkage(X, method='single')
    labels = fcluster(hclust, t=n_cluster, criterion="maxclust")
    return labels - 1
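# Quick usage sketch for prog_linkage() above (assumes `linkage` and `fcluster`
# are the SciPy/fastcluster functions imported in that module); the returned
# labels are zero-based:
import numpy as np

X = np.random.rand(40, 3)
labels = prog_linkage(X, n_cluster=4)
assert labels.min() == 0 and labels.max() <= 3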
pickle_fp = os.path.join(PICKLE_DATA, "%s.pkl" % run_name) [pair_indexs, data_dict, traj_list, gene_pair_names ] = convert_data_into_np_array(stage_data_dir_name, INDEX_RANGE, pickle_fp, load=False, OFF_SET=OFF_SET, include_sox_and_t=include_sox_and_t, Filter=False) metric = "directed_hausdorff_plus_pair_wise_euclidean" # pair_wise_euclidean_distance distance_fp = os.path.join(NPY_DATA, "%s_%s.npy" % (run_name, metric)) calc_distance_matrix(distance_fp, traj_list) cm = plt.get_cmap('gist_rainbow') p_dist = np.load(distance_fp) Z = fc.linkage(p_dist, method="ward") distance_threshold = 800 if log_transformed else 10 labels = fcluster(Z, t=distance_threshold, criterion="distance") - 1 [ passed_traj_list, passed_labels, passed_pair_indexs, passed_gene_pair_names ] = filter_cluster(traj_list, labels, pair_indexs, gene_pair_names) CLUSTER_PLOT_CMAP = "gist_rainbow" passed_labels = plot_cluster(passed_traj_list, passed_labels, run_name, FIGURE_FORMAT, color_palette=None, log_transformed=log_transformed, cmap=CLUSTER_PLOT_CMAP)
def uhc_cluster(cosmic_list, ref_sig):
    spectra = [list(ref_sig.values())]  # so ref signature is value 0
    for sig in cosmic_list:
        spectra.append(list(cosmic_list[sig].values()))
    return linkage(spectra, method='ward', metric='cosine')
def hacluster(y):
    """ Wrapper for the Hierarchical Clustering algorithm from fastcluster """
    z = fastcluster.linkage(y, method='single')
    return z
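# hacluster() above simply forwards to fastcluster single linkage; y may be a
# condensed distance vector or an observation matrix, since fastcluster.linkage
# accepts both. Sketch:
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster, leaves_list

obs = np.random.rand(25, 6)
z = hacluster(pdist(obs))                  # condensed distances
order = leaves_list(z)                     # dendrogram leaf ordering
flat = fcluster(z, t=3, criterion='maxclust')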
def validation(M, df_encoded, results, Z, method, min_K, max_K, automatic=None, pp=None, gap=None, Tp=None):
    # Relies on module-level names defined elsewhere in this project:
    # indexes, external_indexes, internal_indexes, min_indexes,
    # cluster_indices and clustereval.
    ###########################################################################
    # HOW MANY CLUSTERS?
    # Bootstrap method - sampling without replacement.
    ###########################################################################
    # dictionary to store all computed indexes for each number of clusters k = min_K, ..., max_K - 1
    nn_history = defaultdict(dict)
    trees = defaultdict(dict)
    dicio_statistics = {k: {} for k in range(min_K, max_K)}
    for k in range(min_K, max_K):
        for index in indexes:
            dicio_statistics[k][index] = []
        c_assignments_original = cut_tree(Z, k)
        # list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
        trees[k] = partition_original

    # for each bootstrap sample
    for i in range(M):
        # sample 75% of the rows of the original data, without replacement
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        # get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        # extract the scores of the sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # hierarchical clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        # for each number of clusters k = min_K, ..., max_K - 1
        for k, partition in trees.items():
            c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
            # list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
            # compute the external cluster indexes between the original and bootstrap partitions
            computed_indexes = clustereval.calculate_external(partition, partition_bootstrap)
            for pos, index in enumerate(external_indexes):
                dicio_statistics[k][index].append(computed_indexes[pos])

    # internal indexes are computed once, on the original partition
    for k, partition in trees.items():
        calc_idx = clustereval.calculate_internal(results[['patient1', 'patient2', 'score']],
                                                  partition, k, trees[max_K - 1])
        for index in internal_indexes:
            dicio_statistics[k][index].append(calc_idx[index])

    ###########################################################################
    # DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k that collects the most maxima of the
    # average clustering indices. That k should also have a low standard
    # deviation - ideally the minimum over all k's, or at least a value that is
    # still low compared to the others.
    ###########################################################################
    # dataframe that stores the clustering index averages for each k
    col = indexes.copy()
    col.extend(['k', 'k_score_avg'])
    df_avgs = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')
    # dataframe that stores the clustering index standard deviations for each k
    df_stds = pd.DataFrame(index=range(min_K, max_K), columns=col, dtype='float')

    # compute the means and standard deviations
    for k in range(min_K, max_K):
        df_avgs.loc[k, 'k'] = k
        df_stds.loc[k, 'k'] = k
        for index in indexes:
            if index not in internal_indexes:
                df_avgs.loc[k, index] = mean(dicio_statistics[k][index])
                df_stds.loc[k, index] = stdev(dicio_statistics[k][index])
            else:
                df_avgs.loc[k, index] = dicio_statistics[k][index][0]
                df_stds.loc[k, index] = dicio_statistics[k][index][0]
        df_avgs.loc[k, 'k_score_avg'] = 0
        df_stds.loc[k, 'k_score_std'] = 0

    # weights given to each clustering index (currently uniform over all indexes)
    weights = {index: 1 / len(indexes) for index in indexes}

    # find the extreme value of each clustering index, locate the k at which it occurs,
    # and score each k as the sum of the weights of the indexes it wins
    columns = df_avgs.columns
    analyzed_columns = columns[2:-3]
    for column in analyzed_columns:
        if column in min_indexes:
            idx_min = df_avgs[column].idxmin()
            df_avgs.loc[idx_min, 'k_score_avg'] += weights[column]
            continue
        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max, 'k_score_avg'] += weights[column]

    # final number of clusters, chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()

    if automatic == 0 or automatic == 1:
        # table of the average external index values
        fig1 = plt.figure(figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels1 = external_indexes.copy()
        colLabels1.append('k')
        cell_text1 = []
        for row in range(len(df_avgs)):
            cell_text1.append(df_avgs.iloc[row, list(range(len(external_indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of eleven external indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        the_table = plt.table(cellText=cell_text1, colLabels=colLabels1, loc='center', cellLoc='center')
        fig1.text(0.1, 0.01,
                  "R = Rand, AR = Adjusted Rand, FM = Fowlkes and Mallows, J = Jaccard, AW = Adjusted Wallace, "
                  "VD = Van Dongen, H = Huberts, H' = Huberts Normalized, F = F-Measure, "
                  "VI = Variation of information, MS = Minkowski", fontsize=5)
        pp.savefig(fig1)

        # table of the average internal index values
        fig2 = plt.figure(3, figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        colLabels2 = internal_indexes.copy()
        colLabels2.append('k')
        cell_text2 = []
        for row in range(len(df_avgs)):
            cell_text2.append(df_avgs.iloc[row, list(range(len(external_indexes), len(indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of six internal indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text2, colLabels=colLabels2, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig2)

        # bar chart of the Adjusted Rand standard deviation per number of clusters
        fig3 = plt.figure(4)
        df_stds.loc[:, 'AR'].plot.bar(figsize=(15, 8), color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link'
                  % (gap, Tp, method), fontsize=25)
        plt.xlabel('Number of clusters', labelpad=20, fontsize=15)
        plt.ylabel('Standard deviation', labelpad=10, fontsize=15)
        plt.xticks(size=20)
        plt.yticks(size=20)
        pp.savefig(fig3)

    return [df_avgs, df_stds, final_k]
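# A self-contained sketch of the same bootstrap idea, for readers without the
# project's module-level globals (indexes, clustereval, cluster_indices, ...):
# resample 75% of the rows, re-cluster, and score each candidate k by the mean
# adjusted Rand index against the full-data partition. The helper name
# pick_k_by_bootstrap and the single-index scoring are illustrative only, not
# part of the original code.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree
from scipy.spatial.distance import pdist
from sklearn.metrics import adjusted_rand_score


def pick_k_by_bootstrap(X, min_K=2, max_K=6, M=50, method='ward', seed=None):
    rng = np.random.default_rng(seed)
    Z_full = linkage(pdist(X), method)
    full_labels = {k: cut_tree(Z_full, k).ravel() for k in range(min_K, max_K)}
    scores = {k: [] for k in range(min_K, max_K)}
    for _ in range(M):
        idx = np.sort(rng.choice(len(X), int(0.75 * len(X)), replace=False))
        Z_boot = linkage(pdist(X[idx]), method)
        for k in range(min_K, max_K):
            boot_labels = cut_tree(Z_boot, k).ravel()
            # compare the bootstrap partition with the full partition restricted to the sampled rows
            scores[k].append(adjusted_rand_score(full_labels[k][idx], boot_labels))
    return max(scores, key=lambda k: np.mean(scores[k]))


print(pick_k_by_bootstrap(np.random.default_rng(0).normal(size=(60, 4))))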
def __call__(self, X): return linkage(X, method='weighted')
M[:, 1] = ys
scales = np.exp(np.linspace(np.log(Lmin), np.log(Lmax), Nscales))
del MASS, Observable
print(' Scale Loop \n')
for scale in scales:
    print(' This scale : %f\t \n' % scale)
    l = 1.1 * scale / float(ds.length_unit.in_units('pc'))
    ls = 1.1 * scale
    distance = sch.distance.pdist(M)  # vector of (100 choose 2) pairwise distances
    Link = fastcluster.linkage(distance, method='complete')
    ind = sch.fcluster(Link, l, 'distance')
    xcm = []
    ycm = []
    for j in set(ind):
        temp_mass = mass[ind == j].sum()
        if temp_mass > 10:
            xcm.append(np.average(xs[ind == j], weights=mass[ind == j]))
            ycm.append(np.average(ys[ind == j], weights=mass[ind == j]))
        del temp_mass
    print(' Number of regions :%d\t \n' % len(xcm))
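# Toy, self-contained version of the per-scale clump extraction above
# (complete linkage cut at a length threshold, then mass-weighted centres of
# mass); the random positions and masses are stand-ins for the simulation data.
import numpy as np
import fastcluster
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

rng = np.random.default_rng(1)
pts = rng.normal(size=(200, 2))           # stand-in for particle positions
mass = rng.uniform(0.1, 5.0, size=200)    # stand-in for particle masses

for scale in (0.5, 1.0, 2.0):
    link = fastcluster.linkage(pdist(pts), method='complete')
    ind = sch.fcluster(link, 1.1 * scale, 'distance')
    # keep only regions above a minimal total mass, as in the snippet above
    centres = [np.average(pts[ind == j], axis=0, weights=mass[ind == j])
               for j in set(ind) if mass[ind == j].sum() > 10]
    print(scale, len(centres))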
def cluster_validation(M, method, k, partition_found, df_encoded, results):
    # dictionary to store all computed indexes for each cluster
    dicio_cluster_validation = {k: {} for k in range(1, k + 1)}
    for k in range(1, k + 1):
        dicio_cluster_validation[k]['jaccard'] = []
        dicio_cluster_validation[k]['dice'] = []
        dicio_cluster_validation[k]['asymmetric'] = []

    # assess cluster stability for K=k, the number of clusters chosen before
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3 / 4) * len(df_encoded)), replace=False)
        idx = np.sort(idx)
        # get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap, columns=['patient1', 'patient2'])
        # extract the scores of the sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1', 'patient2'])
        # hierarchical clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)
        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)
        # match each original cluster with its most similar bootstrap cluster
        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            for i in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1], partition_bootstrap[i - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])
            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(max(aux_asymmetric))

    # obtain the median, average and standard deviation of the cluster external indexes for each cluster
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    cluster_sizes = []
    for k in range(1, k + 1):
        jaccard_cluster_median.append(round(median(dicio_cluster_validation[k]['jaccard']), 3))
        dice_median.append(round(median(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_median.append(round(median(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_avg.append(round(mean(dicio_cluster_validation[k]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_avg.append(round(mean(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_std.append(round(stdev(dicio_cluster_validation[k]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_std.append(round(stdev(dicio_cluster_validation[k]['asymmetric']), 3))
        cluster_sizes.append(len(partition_found[k - 1]))
        table.append([str(k) + ' (' + str(len(partition_found[k - 1])) + ')',
                      jaccard_cluster_median[k - 1], dice_median[k - 1], asymmetric_median[k - 1],
                      jaccard_cluster_avg[k - 1], dice_avg[k - 1], asymmetric_avg[k - 1],
                      jaccard_cluster_std[k - 1], dice_std[k - 1], asymmetric_std[k - 1]])

    headers = ['Cluster Number', 'J_median', 'D_median', 'A_median',
               'J_avg', 'D_avg', 'A_avg', 'J_std', 'D_std', 'A_std']
    print(tabulate(table, headers))
    cluster_stability = [jaccard_cluster_median, dice_median, asymmetric_median,
                         jaccard_cluster_avg, dice_avg, asymmetric_avg,
                         jaccard_cluster_std, dice_std, asymmetric_std, cluster_sizes]
    return cluster_stability
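# Hypothetical, self-contained sketch of the per-cluster stability measure used
# above: every original cluster is matched to its most similar bootstrap
# cluster and the maximum Jaccard coefficient is averaged over bootstraps. The
# helper names jaccard and cluster_stability are invented for the example.
import numpy as np


def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0


def cluster_stability(original_partition, bootstrap_partitions):
    """original_partition: list of member-index lists; bootstrap_partitions: list of such partitions."""
    stability = []
    for cluster in original_partition:
        best = [max(jaccard(cluster, bc) for bc in boot) for boot in bootstrap_partitions]
        stability.append(float(np.mean(best)))
    return stability


orig = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
boots = [[[0, 1, 2], [4, 5, 6, 7], [8, 9]],
         [[0, 1, 3], [4, 5], [6, 7, 8, 9]]]
print(cluster_stability(orig, boots))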
def color_palette(frame_bgr, mask=None, mask_index=None, n_merge_steps=100, image_size=400.0,
                  seeds_model=None, n_pixels=400, n_merge_per_lvl=10, mask_inverse=False,
                  normalization_lower_bound=100.0, seeds_input_width=600, use_lab=True,
                  show_seed=False, seed_labels=None) -> PaletteAsset:
    """
    Computes a hierarchical color palette as generated by VIAN; does not keep the original tree.

    :param frame_bgr: A frame in bgr uint8; float32 is currently not allowed since OpenCV may crash on it
    :param mask: An optional mask of labels
    :param mask_index: The label which the palette should be computed on
    :param mask_inverse: If true, all labels but the given mask_index will be used
    :param n_merge_steps: Number of merge steps to return (approximately), this is restricted by the
    :param image_size: Image size to compute on
    :param seeds_model: The SEEDS model can optionally be given as argument to avoid initialization for each image
    :param n_pixels: Number of super pixels to compute (approximately)
    :param n_merge_per_lvl: After the first 10 merges, every n-th depth to store in the result
    :param normalization_lower_bound: Minimal number of pixels to keep a cluster
    :param seeds_input_width: Input width for the SEEDS model
    :param use_lab: If false, RGB will be used for the average computation instead of Lab
    :param show_seed: If true, the SEEDS output will be shown in OpenCV; make sure to call cv2.waitKey() to see the result
    :param seed_labels: Optional precomputed SEEDS label map; if given, the SEEDS model is not run
    :return: PaletteAsset
    """
    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2LAB)
    if seeds_input_width < frame.shape[0]:
        rx = seeds_input_width / frame.shape[0]
        frame = cv2.resize(frame, None, None, rx, rx, cv2.INTER_CUBIC)

    if seed_labels is None:
        if seeds_model is None:
            seeds_model = PaletteExtractorModel(frame, n_pixels=n_pixels, num_levels=4)
        labels = seeds_model.forward(frame, 200).astype(np.uint8)
    else:
        labels = seed_labels

    if show_seed:
        cv2.imshow("SEED", cv2.cvtColor(seeds_model.labels_to_avg_color_mask(frame, labels), cv2.COLOR_LAB2BGR))

    fx = image_size / frame.shape[0]
    frame = cv2.resize(frame, None, None, fx, fx, cv2.INTER_CUBIC)
    labels = cv2.resize(labels, None, None, fx, fx, cv2.INTER_NEAREST)
    frame_bgr = cv2.resize(frame_bgr, None, None, fx, fx, cv2.INTER_CUBIC)

    if mask is not None:
        mask = cv2.resize(mask, (labels.shape[1], labels.shape[0]), interpolation=cv2.INTER_NEAREST)
        if mask_inverse:
            labels[np.where(mask == mask_index)] = 255
        else:
            labels[np.where(mask != mask_index)] = 255
        bins = np.unique(labels)
        bins = np.delete(bins, np.where(bins == 255))
    else:
        bins = np.unique(labels)

    data = []
    hist = np.histogram(labels, bins=bins)

    # regions smaller than normalization_f pixels are dropped
    normalization_f = np.amin(hist[0])
    if normalization_f < normalization_lower_bound:
        normalization_f = normalization_lower_bound

    labels_list = []
    colors_list = []
    all_cols = []
    all_labels = []
    for i, bin in enumerate(hist[0]):
        if bin < normalization_f:
            continue
        lbl = hist[1][i]
        if use_lab:
            avg_color = np.round(cv2.cvtColor(
                np.array([[np.mean(frame[np.where(labels == lbl)], axis=0)]], dtype=np.uint8),
                cv2.COLOR_LAB2BGR)[0, 0]).astype(np.uint8)
        else:
            avg_color = np.round(np.mean(frame_bgr[np.where(labels == lbl)], axis=0)).astype(np.uint8)

        labels_list.append(lbl)
        colors_list.append(avg_color)
        # duplicate each region's colour proportionally to its (normalized) pixel count
        data.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_cols.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_labels.extend([lbl] * int(np.round(bin / normalization_f)) * 2)

    data = np.array(data)
    Z = linkage(data, 'ward')
    tree, merge_dists = to_cluster_tree(Z, all_labels, all_cols, n_merge_steps, n_merge_per_lvl)
    return PaletteAsset(tree, merge_dists)
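# PaletteExtractorModel and to_cluster_tree are VIAN-specific; a stripped-down
# sketch of the underlying idea (Ward linkage over region colours, cut at a few
# levels to obtain coarser palettes) could look like this. The random
# region_colors stand in for the SEEDS superpixel averages.
import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree

rng = np.random.default_rng(0)
region_colors = rng.integers(0, 256, size=(40, 3)).astype(float)  # Lab (or BGR) region averages

Z = linkage(region_colors, method='ward')
for n_colors in (2, 5, 10):
    labels = cut_tree(Z, n_colors).ravel()
    palette = np.array([region_colors[labels == c].mean(axis=0) for c in range(n_colors)])
    print(n_colors, np.round(palette).astype(int))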
if len(corp) > 4:
    wordCorps.append(corp)
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''train and pick trained word vec'''
dirs = "C:\\Users\\Administrator.NBJXUEJUN-LI\\Desktop\\project\\Python\\NLP\\savedObject\\CompCorpus\\"
slm = pickle.load(open(dirs + "slm.pkl", "rb"))

'''perform kmeans cluster without normalize'''
from sklearn.cluster import KMeans
TopicNums = 10
wordNums = slm.wordvec.shape[0]
kmeansFit = KMeans(n_clusters=TopicNums)
kmeansFit.fit(slm.wordvec)

'''perform hierarchical cluster'''
import fastcluster
result = fastcluster.linkage(X=slm.wordvec, method='single', metric='euclidean', preserve_input=False)

'''compute word depth'''
# walk the linkage matrix bottom-up; clustStruct[cidx][w] counts how many merges lie
# between word w and internal node cidx, so the last (root) node holds each word's depth
clustStruct = {}
for ridx in range(result.shape[0]):
    cidx = int(ridx + wordNums)
    clustStruct.setdefault(cidx, np.zeros(wordNums, dtype=int))
    for i in [0, 1]:
        code = int(result[ridx][i])
        if code < wordNums:
            clustStruct[cidx][code] += 1
        else:
            clustStruct[cidx] += (clustStruct[code] + (clustStruct[code] != 0).astype('int'))
wordDepth = clustStruct[max(clustStruct.keys())]
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''compute the word degree within sentence co-occurrence across docs'''
sentWindow = 3
def cluster(dupes: numpy.ndarray, cluster_threshold: float = 0.5, max_components: int = 30000,
            id_to_match: str = None) -> Clusters:
    """
    Takes in a list of duplicate pairs and clusters them into a list of records
    that all refer to the same entity, based on a given threshold.

    `https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html`

    Args:
        dupes: (np.array)[tuple(list[str], float)]
            A list of tuples, where each tuple contains an id pair and a
            probability that they are a match:
            id_pair_tuple: ([record_id_1, record_id_2], prob)
            dtype: np.dtype([('pairs', '<U256', 2), ('score', 'f4', 1)])
        cluster_threshold: (float) number between 0 and 1 (default is .5).
            Lowering the number will increase precision, raising it will
            increase recall.
    """
    distance_threshold = cluster_threshold
    score_threshold = 1 - cluster_threshold
    dupe_sub_graphs = connected_components(dupes, max_components)

    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensed_distance(sub_graph)
            logger.debug(f"{condensed_distances}")
            linkage = fastcluster.linkage(condensed_distances, method='centroid', preserve_input=True)
            partition = hcluster.fcluster(linkage, distance_threshold, criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)
            logger.debug(f"Partition: {partition}")
            logger.debug(f"Linkage: {linkage}")
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)
            logger.info(f"Clusters: {clusters}")

            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    logger.info(f"Cluster Ids and scores: {tuple(i_to_id[i] for i in cluster)}, {scores}")
                    ids = [i_to_id[i] for i in cluster]
                    if id_to_match in ids and id_to_match is not None:
                        yield tuple(ids), scores
                    elif id_to_match is None:
                        yield tuple(ids), scores
        else:
            (ids, score), = sub_graph
            if score > score_threshold and id_to_match in ids and id_to_match is not None:
                yield tuple(ids), (score,) * 2
            elif score > score_threshold and id_to_match is None:
                yield tuple(ids), (score,) * 2
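# connected_components, condensed_distance and confidences are project helpers;
# the sketch below shows only the core step they feed into: turn pairwise match
# probabilities into distances (1 - p), build a condensed distance vector, and
# cut a centroid linkage at the distance threshold. All data here is made up.
import numpy as np
import fastcluster
import scipy.cluster.hierarchy as hcluster
from scipy.spatial.distance import squareform

ids = ['a', 'b', 'c', 'd']
prob = {('a', 'b'): 0.95, ('a', 'c'): 0.90, ('b', 'c'): 0.92,
        ('a', 'd'): 0.05, ('b', 'd'): 0.10, ('c', 'd'): 0.02}

n = len(ids)
dist = np.ones((n, n)) - np.eye(n)          # unknown pairs default to distance 1
for (u, v), p in prob.items():
    i, j = ids.index(u), ids.index(v)
    dist[i, j] = dist[j, i] = 1.0 - p

Z = fastcluster.linkage(squareform(dist), method='centroid', preserve_input=True)
labels = hcluster.fcluster(Z, 0.5, criterion='distance')
print(dict(zip(ids, labels)))               # 'a', 'b' and 'c' should share a label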
def __call__(self, X): return linkage(X, method='average')
# Fragment: `index`, `i` and `fname` come from an earlier loop over the
# .distances files that assigns each PDB id a matrix position.
fname = os.path.basename(fname)
if fname.endswith('.distances'):
    pdb = fname[:-10]
    index[pdb] = i

length = len(index)
npArray = np.zeros((length, length))
for filename in glob.glob('/home/lmt72/PDBdistances/*.distances'):
    distanceFile = open(filename)
    filename = os.path.basename(filename)
    if filename.endswith('.distances'):
        pdb = filename[:-10]
        for line in distanceFile:
            data = line.split()
            secondProtein = data[0]
            distance = float(data[1].strip())
            i = index[pdb]
            npArray[i, index[secondProtein]] = distance
            npArray[index[secondProtein], i] = distance

names = ['' for x in range(length)]
for (name, i) in index.items():
    names[i] = name
print(npArray)

# cluster rows and columns and record the dendrogram leaf order for later plotting
Z1 = fcl.linkage(npArray, method='average')
l1 = sch.leaves_list(Z1)
D = npArray[l1]
Z2 = fcl.linkage(npArray.T, method='average')
l2 = sch.leaves_list(Z2)
D = D[:, l2]
pickle.dump((npArray, D, Z1, names), open("clusterstate.pickle", 'wb'), -1)
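# Small self-contained version of the reordering trick used above: cluster the
# (symmetric) distance matrix with average linkage and permute its rows and
# columns into dendrogram leaf order, so similar entries end up adjacent in a
# heatmap. The random matrix stands in for the PDB distance matrix.
import numpy as np
import fastcluster as fcl
import scipy.cluster.hierarchy as sch

rng = np.random.default_rng(2)
pts = rng.normal(size=(8, 3))
dmat = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)

order = sch.leaves_list(fcl.linkage(dmat, method='average'))
reordered = dmat[order][:, order]
print(order)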
def __call__(self, X): return linkage(X, method='ward')
def __call__(self, X): return linkage(X, method='complete')
num_points_per_bb = 50
data_generator = DataGenerator(bounding_boxes, num_points_per_bb, window)
points = data_generator.load_points_from_csv('points.csv')
#points = data_generator.generate_points()

canvas = Canvas(window, width=1024, height=768, bg='white')
colors = ['white', 'yellow', 'cyan', 'red', 'blue', 'brown', 'green']

np_points = np.zeros((len(points), 2))
for i in range(len(points)):
    np_points[i][0] = points[i].x
    np_points[i][1] = points[i].y

start_time = time.time()
np_clusters = fastcluster.linkage(np_points, method='single', metric='euclidean')
print(np_clusters)

clusters = []
for i in range(len(points)):
    cluster = Cluster(i)
    cluster.populate([points[i]])
    clusters.append(cluster)


def get_cluster_by_id(id):
    for i in range(len(clusters)):
        if clusters[i].id == id:
            return i
def __call__(self, X): return linkage(X, method='single')
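# The one-line __call__ methods above presumably belong to a small family of
# wrapper classes, one per linkage method; the reconstruction below is an
# assumption about that structure, not the original code, and it uses scipy's
# linkage (the snippets do not show which linkage function they import).
import numpy as np
from scipy.cluster.hierarchy import linkage


class LinkageMethod:
    method = None

    def __call__(self, X):
        return linkage(X, method=self.method)


class Single(LinkageMethod): method = 'single'
class Complete(LinkageMethod): method = 'complete'
class Average(LinkageMethod): method = 'average'
class Weighted(LinkageMethod): method = 'weighted'
class Ward(LinkageMethod): method = 'ward'


# usage: pass an instance wherever a "linkage callable" is expected
Z = Ward()(np.random.default_rng(0).normal(size=(10, 3)))
print(Z.shape)  # (9, 4)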
def __initial_match(self, candidate_list: (np.ndarray, np.generic), min_pts=2, t=50, criterion='distance'):
    # TODO group matching for non-grouped users
    # 1 : dbscan algorithm + gps based movement vector alignment -> clear!
    # 2 : acceleration -> let's discuss
    """Performs initial clustering on candidate_list (n x T x 2 numpy array) and returns group lists.

    Parameters
    ----------
    candidate_list : array of shape (n_samples, n_time_steps, 2), pairs of latitude and longitude
    min_pts : minimum members of a group for the HDBSCAN algorithm
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the threshold to apply when
        forming flat clusters. For 'maxclust' or 'maxclust_monocrit' criteria, this would be the
        maximum number of clusters requested.
    criterion : str, optional
        The criterion to use in forming flat clusters. This can be any of the following values:

        ``inconsistent`` : If a cluster node and all its descendants have an inconsistent value less
        than or equal to `t`, then all its leaf descendants belong to the same flat cluster. When no
        non-singleton cluster meets this criterion, every node is assigned to its own cluster. (Default)

        ``distance`` : Forms flat clusters so that the original observations in each flat cluster have
        no greater a cophenetic distance than `t`.

        ``maxclust`` : Finds a minimum threshold ``r`` so that the cophenetic distance between any two
        original observations in the same flat cluster is no more than ``r`` and no more than `t` flat
        clusters are formed.

        ``monocrit`` : Forms a flat cluster from a cluster node c with index i when ``monocrit[j] <= t``.
        For example, to threshold on the maximum mean distance as computed in the inconsistency matrix R
        with a threshold of 0.8 do::

            MR = maxRstat(Z, R, 3)
            cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

        ``maxclust_monocrit`` : Forms a flat cluster from a non-singleton cluster node ``c`` when
        ``monocrit[i] <= r`` for all cluster indices ``i`` below and including ``c``. ``r`` is minimized
        such that no more than ``t`` flat clusters are formed. monocrit must be monotonic. For example,
        to minimize the threshold t on maximum inconsistency values so that no more than 3 flat clusters
        are formed, do::

            MI = maxinconsts(Z, R)
            cluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)

    Returns
    ----------
    groups : list of shape (n_clusters, n_members)

    Examples
    ----------
    >>> candidate_list = np.array([...])  # shape (5, 3, 2), labels of candidate_list = [0, 1, 0, 1, 0]
    >>> groups = [[0, 2, 4], [1, 3]]
    """
    assert isinstance(candidate_list, (np.ndarray, np.generic))
    num_of_data, num_time_steps, _ = candidate_list.shape
    # cluster on the last time step's GPS position of every candidate
    X = np.array([candidate_list[i, num_time_steps - 1, :] for i in range(num_of_data)])
    rads = np.radians(X)  # [N, 2]

    # Clustering with gps-data of one time step;
    # 'haversine' clusters using distances computed from (lat, long) given in radians
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_pts, min_samples=2, metric='haversine')
    labels = clusterer.fit_predict(rads)
    print('Before trajectory clustering, labels are ', labels)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    groups = []
    for ulb in range(n_clusters_):
        groups.append([])
    for i, lb in enumerate(labels):
        if lb == -1:
            continue
        groups[lb].append(i)
    total_n_clusters = n_clusters_

    # Group refinement considering the users' trajectories
    for nc in range(n_clusters_):
        group_member_mask = (labels == nc)
        group_members = candidate_list[group_member_mask]
        pdist = tdist.pdist(group_members.transpose([0, 2, 1]), metric="sspd", type_d="spherical")
        Z = fc.linkage(pdist, method="ward")
        sub_labels = sch.fcluster(Z, t, criterion=criterion) - 1
        unique_sub_labels = len(set(sub_labels))
        if unique_sub_labels == 1:
            continue
        for ad in range(unique_sub_labels - 1):
            groups.append([])
        member_indices = list(compress(range(len(group_member_mask)), group_member_mask))
        for sb in range(unique_sub_labels):
            sub_group_mask = (sub_labels == sb)
            sub_member_indices = list(compress(range(len(sub_group_mask)), sub_group_mask))
            # Noise case
            if len(sub_member_indices) == 1:
                groups[nc].remove(member_indices[sub_member_indices[0]])
                labels[member_indices[sub_member_indices[0]]] = -1
                continue
            for m in range(len(sub_member_indices)):
                # remove from the wrong group
                groups[nc].remove(member_indices[sub_member_indices[m]])
                # add to the refined group
                groups[total_n_clusters].append(member_indices[sub_member_indices[m]])
                labels[member_indices[sub_member_indices[m]]] = total_n_clusters
            total_n_clusters += 1

    print('After trajectory clustering, labels are ', labels)
    return groups.copy()
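# Self-contained sketch of the first stage only (the trajectory refinement
# above additionally needs the traj_dist package and real tracks): HDBSCAN on
# last-known positions with the haversine metric, which expects coordinates in
# radians. The coordinates below are made up.
import numpy as np
import hdbscan

latlon = np.array([[48.8566, 2.3522], [48.8570, 2.3519], [48.8568, 2.3525],
                   [40.7128, -74.0060], [40.7130, -74.0055], [40.7127, -74.0061]])
labels = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=2,
                         metric='haversine').fit_predict(np.radians(latlon))
print(labels)  # two groups expected, one per city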