def _guide_tree(self, dist_matrix): """ @summary: Build a guide tree from the distance matrix @param dist_matrix: The distance matrix @type dist_matrix: numpy.ndarray @return: Pycluster similarity tree @rtype: Pycluster.cluster.Tree @author: Woon Wai Keen @author: Vladimir Likic """ n = len(dist_matrix) print " -> Clustering %d pairwise alignments." % (n * (n - 1)), tree = Pycluster.treecluster(dist_matrix, method='a') print '\n' print tree # x = 1 # for i in list(tree): # print (i, x) # x += 1 #return different for of tree perhaps , list within list ** print "Done" return tree
def DoClustering(self, nclusters=30):
    '''Main clustering function

    Builds a feature matrix from the ``JuvenileMass``, ``AdultMass`` and
    ``FunctionalGroup`` attributes of every node in ``self._gx``, clusters the
    nodes and stores the resulting cluster id on each node as the ``cluster``
    attribute.

    The hierarchical tree is built only once (guarded by ``self._tree_done``);
    later calls just cut the cached tree at a different level.

    :param nclusters: number of clusters to cut the tree into
    :raises Exception: if a cluster contains more than one functional group
    '''
    gx = self._gx
    func = self._scale_function

    nid, jm, am, fg = zip(*[(x,
                             gx.node[x]['JuvenileMass'],
                             gx.node[x]['AdultMass'],
                             gx.node[x]['FunctionalGroup'])
                            for x in gx.node.keys()])

    data = np.c_[func(jm), func(am)]
    if self._normalize_data == True:
        data = whiten(data)
    # The functional group becomes a third, heavily weighted feature --
    # presumably so that nodes of different groups are never merged.
    data = np.c_[data, 1000 * np.array(fg)]

    # NOTE(review): only the hierarchical algorithm assigns clusters_ids; any
    # other value of self._algorithm fails below with a NameError.
    if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:
        if not self._tree_done:
            distance_matrix = self._distance_matrix
            # Explicit None/length test: the truth value of a numpy array
            # with more than one element is ambiguous and raises ValueError.
            if distance_matrix is not None and len(distance_matrix) > 0:
                self._tree = pc.treecluster(distancematrix=distance_matrix)
            else:
                self._tree = pc.treecluster(data)
            self._tree_done = True
            self._data = data
            self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)

    self._clusters_ids = clusters_ids
    self._nclusters = len(np.unique(self._clusters_ids))

    cluster_attrib = dict(zip(nid, clusters_ids))
    nx.set_node_attributes(gx, 'cluster', cluster_attrib)
    self._gx = gx

    # Sanity check: every cluster must hold exactly one functional group.
    # Group once per cluster instead of rescanning the graph for every node.
    groups_by_cluster = {}
    for cid, group in zip(clusters_ids, fg):
        groups_by_cluster.setdefault(cid, set()).add(group)
    for groups in groups_by_cluster.values():
        if len(groups) != 1:
            raise Exception(
                'Many functional groups inside the same cluster!!!!!! A CRASH JUST HAPPENED, just joking!!!!'
            )
def DoClustering(self, nclusters=30, distance_matrix=None):
    """Cluster the nodes hierarchically and record each node's cluster.

    The tree is built on the first call only (guarded by ``self._tree_done``);
    subsequent calls reuse it and merely cut it at ``nclusters``.

    :param nclusters: number of clusters to cut the tree into
    :param distance_matrix: optional precomputed distance matrix; when absent
        (``None`` or empty) the tree is built from the node features instead
    """
    # Avoid working two times
    if not self._tree_done:
        # Only rows with a non-negative ID take part in the clustering.
        df_nc = self._df_nodes[self._df_nodes['ID'] >= 0].copy()

        # `.values` is the same accessor used for FunctionalGroup below and
        # replaces DataFrame.as_matrix(), which newer pandas no longer has.
        data = df_nc[['JuvenileMass', 'AdultMass']].values
        data = self._scale_function(data)
        if self._normalize_data == True:
            data = whiten(data)
        # Functional group is appended as an extra, heavily weighted feature.
        data = np.c_[data, 100. * df_nc.FunctionalGroup.values]

        # Explicit None/length test: `if distance_matrix:` raises ValueError
        # for a numpy array holding more than one element.
        if distance_matrix is not None and len(distance_matrix) > 0:
            self._tree = pc.treecluster(distancematrix=distance_matrix)
        else:
            self._tree = pc.treecluster(data)

        self._data = data
        self._tree_done = True

    self.FillClusterIndividualData(self._tree.cut(nclusters))
def DoClustering(self, nclusters=30):
    '''Main clustering function

    Builds a feature matrix from the ``JuvenileMass``, ``AdultMass`` and
    ``FunctionalGroup`` attributes of every node in ``self._gx``, clusters the
    nodes and stores the resulting cluster id on each node as the ``cluster``
    attribute.

    The hierarchical tree is built only once (guarded by ``self._tree_done``);
    later calls just cut the cached tree at a different level.

    :param nclusters: number of clusters to cut the tree into
    :raises Exception: if a cluster contains more than one functional group
    '''
    gx = self._gx
    func = self._scale_function

    nid, jm, am, fg = zip(*[(x,
                             gx.node[x]['JuvenileMass'],
                             gx.node[x]['AdultMass'],
                             gx.node[x]['FunctionalGroup'])
                            for x in gx.node.keys()])

    data = np.c_[func(jm), func(am)]
    if self._normalize_data == True:
        data = whiten(data)
    # The functional group becomes a third, heavily weighted feature --
    # presumably so that nodes of different groups are never merged.
    data = np.c_[data, 1000 * np.array(fg)]

    # NOTE(review): only the hierarchical algorithm assigns clusters_ids; any
    # other value of self._algorithm fails below with a NameError.
    if self._algorithm == Aggregation._HIERARCHICAL_CLUSTERING:
        if not self._tree_done:
            distance_matrix = self._distance_matrix
            # Explicit None/length test: the truth value of a numpy array
            # with more than one element is ambiguous and raises ValueError.
            if distance_matrix is not None and len(distance_matrix) > 0:
                self._tree = pc.treecluster(distancematrix=distance_matrix)
            else:
                self._tree = pc.treecluster(data)
            self._tree_done = True
            self._data = data
            self._nodes_ids = nid
        clusters_ids = self._tree.cut(nclusters)

    self._clusters_ids = clusters_ids
    self._nclusters = len(np.unique(self._clusters_ids))

    cluster_attrib = dict(zip(nid, clusters_ids))
    nx.set_node_attributes(gx, 'cluster', cluster_attrib)
    self._gx = gx

    # Sanity check: every cluster must hold exactly one functional group.
    # Group once per cluster instead of rescanning the graph for every node.
    groups_by_cluster = {}
    for cid, group in zip(clusters_ids, fg):
        groups_by_cluster.setdefault(cid, set()).add(group)
    for groups in groups_by_cluster.values():
        if len(groups) != 1:
            raise Exception('Many functional groups inside the same cluster!!!!!! A CRASH JUST HAPPENED, just joking!!!!')
def generate_network_clusters(G):
    """Partition the nodes of G by hierarchical clustering on geodesic distances.

    G has to consist of a single connected component; otherwise a
    NetworkXError is raised. The returned dictionary maps a cluster count c
    (from 1 up to the number of nodes in the clustering tree) to the list of
    cluster ids obtained by cutting the tree into c clusters.
    """
    # Refuse to work on a graph that is split into several components.
    components = NX.component.connected_component_subgraphs(G)
    if len(components) > 1:
        raise NX.NetworkXError('G must be single component! Extract main component...')

    # Cluster on the precomputed distance matrix rather than on raw data.
    geodesic = get_dist_matrix(G)
    tree = PC.treecluster(data=None, distancematrix=geodesic, method='m')

    # One partition per possible cut of the hierarchy; cuts start at 1.
    last_cut = len(tree)
    return dict((cut, tree.cut(cut).tolist())
                for cut in range(1, last_cut + 1))
def tree_cluster_test(data,real_labels, outputfile = None): start = time.time() tree = Pycluster.treecluster(data, method='m') ks = range(25,50,1) if outputfile != None: f = open(outputfile,'w') f.write(out_result_header()) for k in ks: print 'hierachical clustering whn k=%d' % k predicted = tree.cut(k).tolist() if outputfile != None: f.write(out_result(predicted,k, real_labels)) elasped = time.time() - start print 'hierarchical clustering time: %.3f' % (elasped/float(len(ks)))
def tree_cluster_test(data, real_labels, outputfile=None): start = time.time() tree = Pycluster.treecluster(data, method='m') ks = range(25, 50, 1) if outputfile != None: f = open(outputfile, 'w') f.write(out_result_header()) for k in ks: print 'hierachical clustering whn k=%d' % k predicted = tree.cut(k).tolist() if outputfile != None: f.write(out_result(predicted, k, real_labels)) elasped = time.time() - start print 'hierarchical clustering time: %.3f' % (elasped / float(len(ks)))
def hierarchical(flat_data, data, nclusters, method, distance):
    """
    Hierarchical clustering

    Builds a tree from the values of flat_data, cuts it into nclusters
    clusters and returns a mapping from cluster id to the list of items of
    data that were assigned to that cluster (ids and items are paired up in
    iteration order).
    """
    vectors = flat_data.values()
    dendrogram = pc.treecluster(
        data=vectors,
        mask=None,
        weight=None,
        transpose=0,
        method=method,
        dist=distance,
        distancematrix=None,
    )
    assignment = dendrogram.cut(nclusters)

    grouped = defaultdict(list)
    for label, member in zip(assignment, data):
        grouped[label].append(member)
    return grouped
def _guide_tree(self, dist_matrix): """ @summary: Build a guide tree from the distance matrix @param dist_matrix: The distance matrix @type dist_matrix: numpy.ndarray @return: Pycluster similarity tree @rtype: Pycluster.cluster.Tree @author: Woon Wai Keen @author: Vladimir Likic """ n = len(dist_matrix) print " -> Clustering %d pairwise alignments." % (n * (n - 1)), tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a') print "Done" return tree
def _guide_tree(self, dist_matrix): """ @summary: Build a guide tree from the distance matrix @param dist_matrix: The distance matrix @type dist_matrix: numpy.ndarray @return: Pycluster similarity tree @rtype: Pycluster.cluster.Tree @author: Woon Wai Keen @author: Vladimir Likic """ n = len(dist_matrix) print " -> Clustering %d pairwise alignments." % (n*(n-1)), tree = Pycluster.treecluster(distancematrix=dist_matrix, method='a') print "Done" return tree
def diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, mindiff=0, minenr=3, minfreq=0.01):
    """Plot frequency and enrichment heatmaps plus logos for selected motifs.

    Motifs are first filtered (see the `filters` list below), the survivors
    are ordered by hierarchical clustering when there are at least three of
    them, and a three-column figure (motif logos, log2 enrichment, frequency)
    is saved to `outfile`. Nothing is plotted when no motif passes the
    filters.

    motifs   -- motif identifiers; also used as keys into `pwms`
    pwms     -- mapping from motif identifier to an object with a to_img() method
    names    -- list of sample names, one per column of `freq`
    freq     -- per-motif, per-sample frequencies
                (assumes 2-D, motifs x samples -- TODO confirm against caller)
    counts   -- per-motif, per-sample counts, same layout as `freq`
    bgfreq   -- per-motif background frequency
    bgcounts -- not used in this function
    outfile  -- path handed to plt.savefig
    mindiff  -- minimum spread (max - min) of log2 enrichment across samples
    minenr   -- minimum log2 enrichment required in at least one sample
    minfreq  -- minimum frequency required in at least one sample
    """
    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]
    nbar = 5
    freq = np.array(freq)
    counts = np.array(counts)
    # One-element rows turn bgfreq into a column so it lines up with the rows
    # of freq in the division and in the hstack further down.
    bgfreq = np.array([[x] for x in bgfreq])
    # log2 enrichment of every sample over the background
    enr = np.log2(np.divide(freq, bgfreq))
    filt = np.ones(len(enr), dtype="bool")
    # A motif is kept only if ALL of these hold:
    filters = [
        np.sum(enr > minenr, 1) > 0,  # enriched enough in at least one sample
        np.sum(freq > minfreq, 1) > 0,  # frequent enough in at least one sample
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,  # differs enough between samples
        np.sum(counts > 2, 1) > 0,  # more than 2 counts in at least one sample
    ]
    for f in filters:
        filt = np.logical_and(filt, f)
        # Report how many motifs are left after each successive filter
        print "Filter: ", sum(filt)
    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]
    # Dump the surviving motifs with their values to stderr
    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(m, f, b, e))
    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        # Order the rows by hierarchical clustering of the frequencies;
        # with fewer than three motifs the original order is kept.
        tree = Pycluster.treecluster(freq, method="m", dist="c")
        ind = sort_tree(tree, np.arange(len(motifs)))
    else:
        ind = np.arange(len(freq))
    # Figure size grows with the number of samples (width) and motifs (height)
    fig = plt.figure(figsize=((5 + 0.75 * len(names)) * 3, (0.3 * len(motifs) + 1.5) * 3))
    # nbar thin rows on top hold the colorbars, followed by one taller row per
    # motif plus three spare rows; the columns are logos, enrichment, frequency.
    gs = GridSpec(
        len(motifs) + 3 + nbar,
        3,
        height_ratios=[1] * nbar + [3] * (len(motifs) + 3),
        width_ratios=w_ratio[plot_order]
    )
    # Colormaps
    c1 = mpl.cm.RdBu
    c2 = mpl.cm.Blues  ##create_colormap("white", "blue")
    ### Frequency plot ###
    # Create axis
    ax = plt.subplot(gs[nbar:-3, plot_order[2]])
    # Plot frequencies
    vmin = 0
    vmax = 0.3
    # Sample frequencies with the background appended as the last column
    pfreq = np.hstack((freq, bgfreq))
    ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)
    sm = plt.cm.ScalarMappable(cmap=c2, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))
    # Show percentages
    for y, row in enumerate(pfreq[ind]):
        for x, val in enumerate(row):
            # Draw the text in the colour of the opposite end of the colormap:
            # cells in the upper half of the range get the vmin colour, the
            # rest the vmax colour.
            v = vmax
            if val >= (vmin + ((vmax - vmin) / 2)):
                v = vmin
            plt.text(x + 0.5, y + 0.5, "{:.1%}".format(val), ha="center", va="center", color=sm.to_rgba(v))
    # Hide most labels
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    plt.setp(ax.get_yticklabels(), visible=False)
    # Set the X labels
    ticks = np.arange(len(names) + 1) + 0.5
    # `names` must be a list for this concatenation to work
    plt.xticks(ticks, names + ["background"], rotation=30, ha="right")
    ax.set_ylim(0, len(motifs))
    # Title
    plt.title("Frequency")
    # Colorbar
    # presumably required so that colorbar() accepts a mappable that was never
    # given any data -- confirm against the matplotlib version in use
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[2]])
    cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation="horizontal")
    cb.ax.set_xticklabels(["0%", "30%"])
    #### Enrichment plot
    ax = plt.subplot(gs[nbar:-3, plot_order[1]])
    vmin = -10
    vmax = 10
    ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
    for y, row in enumerate(enr[ind]):
        for x, val in enumerate(row):
            # Black text by default; white in the top and bottom eighth of the
            # colour range.
            col = "black"
            if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                col = "white"
            elif val <= (vmin + ((vmax - vmin) / 8.0)):
                col = "white"
            plt.text(x + 0.5, y + 0.5, "{:.1f}".format(val), ha="center", va="center", color=col)
    ticks = np.arange(len(names)) + 0.5
    plt.xticks(ticks, names, rotation=30, ha="right")
    # plt.setp(plt.xticks()[1], rotation=30)
    # for label in labels:
    #     label.set_rotation(30)
    # Motif names label the rows of the enrichment panel
    ticks = np.arange(len(motifs)) + 0.5
    plt.yticks(ticks, motifs[ind])
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    ax.set_ylim(0, len(motifs))
    # Title
    plt.title("Enrichment (log2)")
    # Colorbar
    sm = plt.cm.ScalarMappable(cmap=c1, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[1]])
    cb = fig.colorbar(sm, cax=cax, ticks=[vmin, 0, vmax], orientation="horizontal")
    cb.ax.set_xticklabels([vmin, 0, vmax])
    #### Motif logos
    # The order is reversed relative to the heatmaps because grid rows are
    # filled from the top while the heatmap rows are drawn from y = 0 upwards.
    for i, motif in enumerate(motifs[ind][::-1]):
        ax = plt.subplot(gs[i + nbar, plot_order[0]])
        axes_off(ax)
        # Render the logo to a temporary PNG and read it back as an image
        tmp = NamedTemporaryFile(dir=mytmpdir(), suffix=".png")
        pwms[motif].to_img(tmp.name, format="PNG", height=6)
        ax.imshow(plt.imread(tmp.name), interpolation="none")
    # plt.show()
    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
strand = "-" else: strand = "+" regions[k] = (chrom, start, end, gene, strand) n = len(set(labels)) labels[labels == j] = i for k in range(j + 1, n): labels[labels == k] = k - 1 (i,j) = mirror_clusters(data, labels) ind = labels.argsort() # Other cluster implementation # centres, labels, dist = kmeanssample(clus, options.numclusters, len(clus) / 10, metric=cl, maxiter=200, verbose=1, delta=0.00001) elif cluster_type == "h": print "Hierarchical clustering" tree = Pycluster.treecluster(clus, method="m", dist=METRIC) labels = tree.cut(options.numclusters) ind = sort_tree(tree, arange(len(regions))) else: ind = arange(len(regions)) labels = zeros(len(regions)) f = open("{0}_clusters.bed".format(outfile), "w") for (chrom,start,end,gene,strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]): if not gene: f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster+1, strand)) else: f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster+1, strand)) f.close() if not cluster_type == "k": labels = None
def heatmap(args): datafiles = args.datafiles for x in args.datafiles: if not os.path.isfile(x): print "ERROR: Data file '{0}' does not exist".format(x) sys.exit(1) for x in args.datafiles: if '.bam' in x and not os.path.isfile("{0}.bai".format(x)): print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format( x) pysam.index(x) # Options Parser featurefile = args.featurefile datafiles = [x.strip() for x in args.datafiles] tracks = [os.path.basename(x) for x in datafiles] titles = [os.path.splitext(x)[0] for x in tracks] colors = parse_colors(args.colors) bgcolors = parse_colors(args.bgcolors) outfile = args.outfile extend_up = args.extend extend_down = args.extend fragmentsize = args.fragmentsize cluster_type = args.clustering[0].lower() merge_mirrored = args.merge_mirrored bins = (extend_up + extend_down) / args.binsize rmdup = args.rmdup rpkm = args.rpkm rmrepeats = args.rmrepeats ncpus = args.cpus distancefunction = args.distancefunction[0].lower() dynam = args.graphdynamics fontsize = args.textfontsize # Check for mutually exclusive parameters if dynam: if merge_mirrored: print "ERROR: -m and -g option CANNOT be used together" sys.exit(1) if distancefunction == 'e': print 'Dynamics can only be identified using Pearson correlation as metric.' print 'Assigning metric to Pearson correlation' distancefunction = 'p' # Warning about too much files if (len(tracks) > 4): print "Warning: Running fluff with too many files might make you system use enormous amount of memory!" 
# Method of clustering if (args.pick != None): pick = [i - 1 for i in split_ranges(args.pick)] if not all(i <= len(tracks) - 1 for i in pick): sys.stderr.write( "You picked a non-existent file for clustering.\n") sys.exit(1) else: pick = range(len(datafiles)) if not cluster_type in ["k", "h", "n"]: sys.stderr.write("Unknown clustering type!\n") sys.exit(1) # Number of clusters if cluster_type == "k" and not args.numclusters >= 2: sys.stderr.write("Please provide number of clusters!\n") sys.exit(1) # Distance function if not distancefunction in ["e", "p"]: sys.stderr.write("Unknown distance function!\n") sys.exit(1) else: if distancefunction == "e": METRIC = cfg.DEFAULT_METRIC print "Euclidean distance method" else: METRIC = "c" print "Pearson distance method" ## Get scale for each track tscale = [1.0 for track in datafiles] # Function to load heatmap data def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None): if guard is None: guard = [] # Calculate the profile data data = {} regions = [] print "Loading data" try: # Load data in parallel pool = multiprocessing.Pool(processes=ncpus) jobs = [] for datafile in datafiles: jobs.append( pool.apply_async(load_heatmap_data, args=(featurefile, datafile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard))) for job in jobs: track, regions, profile, guard = job.get() data[track] = profile except Exception as e: sys.stderr.write("Error loading data in parallel, trying serial\n") sys.stderr.write("Error: {}\n".format(e)) for datafile in datafiles: track, regions, profile, guard = load_heatmap_data( featurefile, datafile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) data[track] = profile return data, regions, guard # -g : Option to try and get dynamics # Extend features 1kb up/down stream # Cluster them in one bin # Cluster them in one bin guard = [] 
amount_bins = bins extend_dyn_up = extend_up extend_dyn_down = extend_down if dynam: # load the data once to get the features which extend below 0 guard = check_data(featurefile, extend_dyn_up, extend_dyn_down) extend_dyn_up = 1000 extend_dyn_down = 1000 amount_bins = 1 # Load data for clustering data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) # Normalize norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE) clus = hstack([ norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick) ]) # Clustering if cluster_type == "k": print "K-means clustering" ## K-means clustering # PyCluster labels, _, nfound = Pycluster.kcluster(clus, args.numclusters, dist=METRIC) if not dynam and merge_mirrored: (i, j) = mirror_clusters(data, labels) while j: for track in data.keys(): data[track][labels == j] = [ row[::-1] for row in data[track][labels == j] ] for k in range(len(regions)): if labels[k] == j: (chrom, start, end, gene, strand) = regions[k] if strand == "+": strand = "-" else: strand = "+" regions[k] = (chrom, start, end, gene, strand) n = len(set(labels)) labels[labels == j] = i for k in range(j + 1, n): labels[labels == k] = k - 1 (i, j) = mirror_clusters(data, labels) ind = labels.argsort() # Hierarchical clustering elif cluster_type == "h": print "Hierarchical clustering" tree = Pycluster.treecluster(clus, method="m", dist=METRIC) labels = tree.cut(args.numclusters) ind = sort_tree(tree, arange(len(regions))) else: ind = arange(len(regions)) labels = zeros(len(regions)) # Load data for visualization if -g option was used if dynam: data, regions, guard = load_data(featurefile, bins, extend_up, extend_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) f = open("{0}_clusters.bed".format(outfile), "w") for (chrom, start, end, gene, strand), cluster in zip( array(regions, dtype="object")[ind], array(labels)[ind]): if not gene: 
f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format( chrom, start, end, cluster + 1, strand)) else: f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format( chrom, start, end, gene, cluster + 1, strand)) f.close() # Save read counts readcounts = {} for i, track in enumerate(tracks): readcounts[track] = {} readcounts[track]['bins'] = [] for idx, row in enumerate(data[track]): bins = '' for b in row: if not bins: bins = '{0}'.format(b) else: bins = '{0};{1}'.format(bins, b) readcounts[track]['bins'].append(bins) input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w') input_fileBins.write('Regions\t') for i, track in enumerate(titles): input_fileBins.write('{0}\t'.format(track)) input_fileBins.write('\n') for i, track in enumerate(tracks): for idx in ind: input_fileBins.write('{0}:{1}-{2}\t'.format( regions[idx][0], regions[idx][1], regions[idx][2])) for i, track in enumerate(tracks): input_fileBins.write('{0}\t'.format( readcounts[track]['bins'][idx])) input_fileBins.write('\n') break input_fileBins.close() if not cluster_type == "k": labels = None scale = get_absolute_scale(args.scale, [data[track] for track in tracks]) heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors, scale, tscale, labels, fontsize)
def heatmap(args): datafiles = args.datafiles for x in args.datafiles: if not os.path.isfile(x): print "ERROR: Data file '{0}' does not exist".format(x) sys.exit(1) for x in args.datafiles: if '.bam' in x and not os.path.isfile("{0}.bai".format(x)): print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x) pysam.index(x) # Options Parser featurefile = args.featurefile datafiles = [x.strip() for x in args.datafiles] tracks = [os.path.basename(x) for x in datafiles] titles = [os.path.splitext(x)[0] for x in tracks] colors = parse_colors(args.colors) bgcolors = parse_colors(args.bgcolors) outfile = args.outfile extend_up = args.extend extend_down = args.extend fragmentsize = args.fragmentsize cluster_type = args.clustering[0].lower() merge_mirrored = args.merge_mirrored bins = (extend_up + extend_down) / args.binsize rmdup = args.rmdup rpkm = args.rpkm rmrepeats = args.rmrepeats ncpus = args.cpus distancefunction = args.distancefunction[0].lower() dynam = args.graphdynamics fontsize = args.textfontsize # Check for mutually exclusive parameters if dynam: if merge_mirrored: print "ERROR: -m and -g option CANNOT be used together" sys.exit(1) if distancefunction == 'e': print 'Dynamics can only be identified using Pearson correlation as metric.' print 'Assigning metric to Pearson correlation' distancefunction = 'p' # Warning about too much files if (len(tracks) > 4): print "Warning: Running fluff with too many files might make you system use enormous amount of memory!" 
# Method of clustering if (args.pick != None): pick = [i - 1 for i in split_ranges(args.pick)] else: pick = range(len(datafiles)) if not cluster_type in ["k", "h", "n"]: sys.stderr.write("Unknown clustering type!\n") sys.exit(1) # Number of clusters if cluster_type == "k" and not args.numclusters >= 2: sys.stderr.write("Please provide number of clusters!\n") sys.exit(1) # Distance function if not distancefunction in ["e", "p"]: sys.stderr.write("Unknown distance function!\n") sys.exit(1) else: if distancefunction == "e": METRIC = DEFAULT_METRIC print "Euclidean distance method" else: METRIC = "c" print "Pearson distance method" ## Get scale for each track tscale = [1.0 for track in datafiles] # Function to load heatmap data def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=[]): # Calculate the profile data data = {} regions = [] print "Loading data" try: # Load data in parallel import multiprocessing pool = multiprocessing.Pool(processes=ncpus) jobs = [] for datafile in datafiles: jobs.append(pool.apply_async(load_heatmap_data, args=( featurefile, datafile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard))) for job in jobs: track, regions, profile, guard = job.get() data[track] = profile except: sys.stderr.write("Python multiprocessing not installed, can't load data in parallel\n") for datafile in datafiles: track, regions, profile, guard = load_heatmap_data(featurefile, datafile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) data[track] = profile return data, regions, guard # -g : Option to try and get dynamics # Extend features 1kb up/down stream # Cluster them in one bin guard = [] if dynam: amount_bins = 1 extend_dyn_up = 1000 extend_dyn_down = 1000 # load the data once to get the features which extend below 0 guard = check_data(featurefile, guard, dynam, extend_dyn_up, extend_dyn_down) else: 
amount_bins = bins extend_dyn_up = extend_up extend_dyn_down = extend_down # Load data for clustering data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) # Normalize norm_data = normalize_data(data, DEFAULT_PERCENTILE) clus = hstack([norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)]) # Clustering if cluster_type == "k": print "K-means clustering" ## K-means clustering # PyCluster labels, error, nfound = Pycluster.kcluster(clus, args.numclusters, dist=METRIC) if not dynam and merge_mirrored: (i, j) = mirror_clusters(data, labels) while j: for track in data.keys(): data[track][labels == j] = [row[::-1] for row in data[track][labels == j]] for k in range(len(regions)): if labels[k] == j: (chrom, start, end, gene, strand) = regions[k] if strand == "+": strand = "-" else: strand = "+" regions[k] = (chrom, start, end, gene, strand) n = len(set(labels)) labels[labels == j] = i for k in range(j + 1, n): labels[labels == k] = k - 1 (i, j) = mirror_clusters(data, labels) ind = labels.argsort() # Hierarchical clustering elif cluster_type == "h": print "Hierarchical clustering" tree = Pycluster.treecluster(clus, method="m", dist=METRIC) labels = tree.cut(args.numclusters) ind = sort_tree(tree, arange(len(regions))) else: ind = arange(len(regions)) labels = zeros(len(regions)) # Load data for visualization if -g option was used if dynam: data, regions, guard = load_data(featurefile, bins, extend_up, extend_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard) f = open("{0}_clusters.bed".format(outfile), "w") for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]): if not gene: f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand)) else: f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand)) f.close() # Save read counts readcounts = {} for 
i, track in enumerate(tracks): readcounts[track] = {} readcounts[track]['bins'] = [] for idx, row in enumerate(data[track]): bins = '' for bin in row: if not bins: bins = '{0}'.format(bin) else: bins = '{0};{1}'.format(bins, bin) readcounts[track]['bins'].append(bins) input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w') input_fileBins.write('Regions\t'.format(track)) for i, track in enumerate(titles): input_fileBins.write('{0}\t'.format(track)) input_fileBins.write('\n') for i, track in enumerate(tracks): for idx in ind: input_fileBins.write('{0}:{1}-{2}\t'.format(regions[idx][0], regions[idx][1], regions[idx][2])) for i, track in enumerate(tracks): input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx])) input_fileBins.write('\n') break input_fileBins.close() if not cluster_type == "k": labels = None scale = get_absolute_scale(args.scale, [data[track] for track in tracks]) heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors, scale, tscale, labels, fontsize)
data = eventutils.trim_data_to_events(data, events, trimming_range) input_vecs = [] if treat_data_differentially: input_vecs = utils.make_prices_diffs_vecs(data) else: input_vecs = utils.make_prices_vecs(data) # Run clustering algorithm. if algorithm_type == ClusterAlg.KMEANS: labels, wcss, n = Pycluster.kcluster(input_vecs, number_of_clusters, dist = dist_measure, npass = number_of_iters, method = dist_method) elif algorithm_type == ClusterAlg.HIERARCHICAL: tree = Pycluster.treecluster(input_vecs, method = dist_method, dist = dist_method) labels = tree.cut(number_of_clusters) elif algorithm_type == ClusterAlg.SELFORGMAPS: labels, celldata = Pycluster.somcluster(input_vecs, nxgrid = xgrid, nygrid = ygrid, niter = number_of_iters) # If algorithm is self-organizing maps each item is assigned to # a particular 2D point, so we need to create groups from 2D points. # See implementation of making groups from labels for details. if algorithm_type == ClusterAlg.SELFORGMAPS: clusters = utils.make_groups_from_labels(labels, data, True) else: clusters = utils.make_groups_from_labels(labels, data) # Check with which type of key we have to deal with.
red1 = cv2.inRange(imgHSV,a([0,150,80]),a([3,255,255])) red2 = cv2.inRange(imgHSV,a([150,150,80]),a([180,255,255])) red = red1 | red2 red = clean(red) stickers += detect(red) for i in range(len(stickers)): stickers[i].id = i stickertime=clock()*1000 print "stickers detected in " + str(int(stickertime-starttime)) + " ms" #CLUSTERING D=[[distance(st1,st2) for st1 in stickers] for st2 in stickers] tree = Pycluster.treecluster(distancematrix=D) cluster_count = 1 while True: clusters = tree.cut(cluster_count) for i in range(len(stickers)): #Debug if (stickers[i].weight>1 and cluster_count<7): cv2.drawContours(eval("klastry"+str(cluster_count)), np.array([stickers[i].V]), 0, 255*(clusters[i]+1)/(cluster_count+1),-1) weights=[0 for i in range(cluster_count+1)] for i in range(len(stickers)): weights[clusters[i]] += stickers[i].size + stickers[i].weight maxcluster_weight = 0 maxcluster_id = 0 maximum = 0 for i in range(cluster_count): if weights[i]>maximum:
nclusters=noClust, transpose=0, method=mDict[method], dist=dDict[distance]) silScore = metrics.silhouette_score(np.array(rawData)[:, actHistDNaseList], clusterListActiveHistDNase, metric='euclidean') silhouetteList.append(silScore) # Hierarchical if (algorithm == "h"): # Method mDict = dict([("sl", "s"), ("cl", "m"), ("el", "c"), ("al", "a")]) # All tree = pc.treecluster(np.array(rawData), transpose=0, method=mDict[method], dist=dDict[distance]) clusterListAll = tree.cut(noClust) silScore = metrics.silhouette_score(rawData, clusterListAll, metric='euclidean') silhouetteList.append(silScore) # Single clusterListSingle = [] for i in range(0, len(labelList)): tree = pc.treecluster(np.array(rawData)[:, (rawVph * i):(rawVph * i) + rawVph], transpose=0, method=mDict[method], dist=dDict[distance]) cluterListTemp = tree.cut(noClust)