# NOTE: the imports below are a best-effort reconstruction; the fluff helper
# paths in particular are assumed from a typical fluff-style layout and may
# need adjusting to match the actual package.
import os
import sys
import multiprocessing

import matplotlib.pyplot as plt
import numpy as np
import pysam
import Pycluster
from matplotlib.font_manager import FontProperties
from numpy import arange, array, hstack, median, zeros
from scipy.stats import scoreatpercentile

import fluff.config as cfg
from fluff.color import parse_colors
from fluff.config import (DEFAULT_METRIC, DEFAULT_PERCENTILE, PAD, PADBOTTOM,
                          PADLEFT, PADRIGHT, PADTOP, PLOTHEIGHT, PLOTWIDTH)
from fluff.fluffio import (check_data, load_bed_clusters, load_cluster_data,
                           load_heatmap_data, load_read_counts)
from fluff.plot import (coverage_plot, create_grid_figure, heatmap_plot,
                        profile_screenshot)
from fluff.util import (cluster_profile, get_absolute_scale, mirror_clusters,
                        normalize_data, process_groups, sort_tree,
                        split_interval, split_ranges)


def profile(args):
    intervals = [x.strip() for x in args.intervals.split(",")]
    datafiles = [x.strip() for x in args.datafiles]
    annotation = args.annotation
    outfile = args.outfile
    colors = parse_colors(args.colors)

    # Create a BAM index for any data file that does not have one yet
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. "
                  "Creating an index file for {0}.".format(x))
            pysam.index(x)

    trackgroups = process_groups(args.trackgroups)
    if not trackgroups:
        trackgroups = [[x] for x in range(1, len(datafiles) + 1)]

    scalegroups = process_groups(args.scalegroups)
    scale = args.scale
    if scale == "auto":
        scale = True
    elif scale == "off":
        scale = False
    elif scale:
        try:
            scale = [int(x) for x in scale.split(",")]
        except ValueError:
            print("Error in scale argument")
            sys.exit(1)

    if trackgroups and scalegroups:
        if len(trackgroups) != sum([len(x) for x in scalegroups]):
            sys.stderr.write("Track groups and scales do not match!\n")
            sys.exit(1)

    # Group the tracks according to trackgroups
    tracks = []
    for group in trackgroups:
        tracks.append([datafiles[i - 1] for i in group])

    # Intervals
    intervals = [split_interval(x) for x in intervals]

    # Create the image
    profile_screenshot(outfile, intervals, tracks,
                       annotation=annotation,
                       scalegroups=scalegroups,
                       fontsize=args.textfontsize,
                       colors=colors,
                       bgmode=args.background,
                       fragmentsize=args.fragmentsize,
                       scale=scale,
                       rmdup=args.rmdup,
                       rmrepeats=args.rmrepeats,
                       reverse=args.reverse)
def profile(args):
    interval = args.interval
    datafiles = [x.strip() for x in args.datafiles]
    annotation = args.annotation
    outfile = args.outfile
    colors = parse_colors(args.colors)

    # Create a BAM index for any data file that does not have one yet
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. "
                  "Creating an index file for {0}.".format(x))
            pysam.index(x)

    trackgroups = process_groups(args.trackgroups)
    if not trackgroups:
        trackgroups = [[x] for x in range(1, len(datafiles) + 1)]

    scalegroups = process_groups(args.scalegroups)
    scale = args.scale
    if scale:
        try:
            scale = [float(x) for x in scale.split(",")]
        except Exception:
            print("Error in scale argument")
            sys.exit(1)

    if trackgroups and scalegroups:
        if len(trackgroups) != sum([len(x) for x in scalegroups]):
            sys.stderr.write("Track groups and scales do not match!\n")
            sys.exit(1)

    # Group the tracks according to trackgroups
    tracks = []
    for group in trackgroups:
        tracks.append([datafiles[i - 1] for i in group])

    # Create the image
    profile_screenshot(outfile, interval, tracks,
                       annotation=annotation,
                       scalegroups=scalegroups,
                       fontsize=args.textfontsize,
                       colors=colors,
                       bgmode=args.background,
                       fragmentsize=args.fragmentsize,
                       scale=scale,
                       show_scale=args.show_scale,
                       rmdup=args.rmdup,
                       rmrepeats=args.rmrepeats,
                       reverse=args.reverse,
                       adjscale=args.adjscale)
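# A minimal usage sketch for `profile` (hedged: the attribute names are taken
# from the accesses above; the file names and values are illustrative only).
#
#   from argparse import Namespace
#
#   args = Namespace(
#       interval="chr1:10000-20000", datafiles=["sample.bam"],
#       annotation=None, outfile="profile.png", colors="red,blue",
#       trackgroups=None, scalegroups=None, scale=None, textfontsize=8,
#       background="white", fragmentsize=200, show_scale=True, rmdup=True,
#       rmrepeats=True, reverse=False, adjscale=False)
#   profile(args)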
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    # Create a BAM index for any data file that does not have one yet
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. "
                  "Creating an index file for {0}.".format(x))
            pysam.index(x)

    # Options
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction.lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize
    colorbar = args.colorbar
    seed = args.seed

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: -m and -g option CANNOT be used together")
            sys.exit(1)
        if distancefunction == 'euclidean':
            print('Dynamics can only be identified using Pearson correlation as metric.')
            print('Assigning metric to Pearson correlation')
            distancefunction = 'pearson'

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write("You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = list(range(len(datafiles)))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["euclidean", "pearson"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        METRIC = distancefunction
        print("{} distance method".format(METRIC))

    # Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down,
                  rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(
                    load_heatmap_data,
                    args=(featurefile, datafile, amount_bins, extend_dyn_up,
                          extend_dyn_down, rmdup, rpkm, rmrepeats,
                          fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : option to try and identify dynamics
    # Extend features 1 kb up/downstream and cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # Load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)
    clus = hstack([norm_data[t] for i, t in enumerate(tracks)
                   if (not pick or i in pick)])

    ind, labels = cluster_profile(clus,
                                  cluster_type=cluster_type,
                                  numclusters=args.numclusters,
                                  dist=METRIC,
                                  random_state=seed)

    if cluster_type == "k":
        if not dynam and merge_mirrored:
            # Merge clusters that are mirror images of each other:
            # flip the profiles and strands, then renumber the labels
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in list(data.keys()):
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

    # Load data for visualization if the -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(
            array(regions, dtype="object")[ind], array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                chrom, start, end, gene, cluster + 1, strand))
    f.close()

    # Save read counts (bins per region, separated by semicolons)
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for idx, row in enumerate(data[track]):
            bins = ''
            for b in row:
                if not bins:
                    bins = '{0}'.format(b)
                else:
                    bins = '{0};{1}'.format(bins, b)
            readcounts[track]['bins'].append(bins)

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for i, track in enumerate(tracks):
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(
                regions[idx][0], regions[idx][1], regions[idx][2]))
            for i, track in enumerate(tracks):
                input_fileBins.write('{0}\t'.format(
                    readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')
        break
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize, colorbar)
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    # Create a BAM index for any data file that does not have one yet
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. "
                  "Creating an index file for {0}.".format(x))
            pysam.index(x)

    # Options
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: -m and -g option CANNOT be used together")
            sys.exit(1)
        if distancefunction == 'e':
            print('Dynamics can only be identified using Pearson correlation as metric.')
            print('Assigning metric to Pearson correlation')
            distancefunction = 'p'

    # Warn when many files are given
    if len(tracks) > 4:
        print("Warning: running fluff with many files may use an enormous amount of memory!")

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
    else:
        pick = list(range(len(datafiles)))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = DEFAULT_METRIC
            print("Euclidean distance method")
        else:
            # 'c' is Pycluster's Pearson correlation metric
            METRIC = "c"
            print("Pearson distance method")

    # Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down,
                  rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(
                    load_heatmap_data,
                    args=(featurefile, datafile, amount_bins, extend_dyn_up,
                          extend_dyn_down, rmdup, rpkm, rmrepeats,
                          fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception:
            sys.stderr.write("Couldn't load data in parallel, trying serial\n")
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : option to try and identify dynamics
    # Extend features 1 kb up/downstream and cluster them in one bin
    guard = []
    if dynam:
        amount_bins = 1
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        # Load the data once to get the features which extend below 0
        guard = check_data(featurefile, guard, dynam, extend_dyn_up,
                           extend_dyn_down)
    else:
        amount_bins = bins
        extend_dyn_up = extend_up
        extend_dyn_down = extend_down

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, DEFAULT_PERCENTILE)
    clus = hstack([norm_data[t] for i, t in enumerate(tracks)
                   if (not pick or i in pick)])

    # Clustering
    if cluster_type == "k":
        print("K-means clustering")
        # K-means clustering (Pycluster)
        labels, error, nfound = Pycluster.kcluster(clus, args.numclusters,
                                                   dist=METRIC)
        if not dynam and merge_mirrored:
            # Merge clusters that are mirror images of each other:
            # flip the profiles and strands, then renumber the labels
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)
        ind = labels.argsort()
    # Hierarchical clustering
    elif cluster_type == "h":
        print("Hierarchical clustering")
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))

    # Load data for visualization if the -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(
            array(regions, dtype="object")[ind], array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                chrom, start, end, gene, cluster + 1, strand))
    f.close()

    # Save read counts (bins per region, separated by semicolons)
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for idx, row in enumerate(data[track]):
            bins = ''
            for b in row:
                if not bins:
                    bins = '{0}'.format(b)
                else:
                    bins = '{0};{1}'.format(bins, b)
            readcounts[track]['bins'].append(bins)

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    for i, track in enumerate(tracks):
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(
                regions[idx][0], regions[idx][1], regions[idx][2]))
            for i, track in enumerate(tracks):
                input_fileBins.write('{0}\t'.format(
                    readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')
        break
    input_fileBins.close()

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize)
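# A minimal usage sketch for `heatmap` (hedged: attribute names mirror the
# accesses in the two variants above; values are illustrative only). Note
# that `clustering` is only inspected through its first letter, so "kmeans",
# "hierarchical" and "none" select 'k', 'h' and 'n' respectively.
#
#   from argparse import Namespace
#
#   args = Namespace(
#       datafiles=["h3k4me3.bam", "h3k27ac.bam"], featurefile="peaks.bed",
#       colors="red,blue", bgcolors="white", outfile="heatmap",
#       extend=5000, binsize=100, fragmentsize=200, clustering="kmeans",
#       numclusters=5, merge_mirrored=False, rmdup=True, rpkm=False,
#       rmrepeats=True, cpus=4, distancefunction="pearson",
#       graphdynamics=False, textfontsize=8, colorbar=True, seed=None,
#       pick=None, scale=None)
#   heatmap(args)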
def bandplot(args):
    if (0 > args.scalar) or (args.scalar > 100):
        print("ERROR: -P value has to be between 0 and 100")
        sys.exit(1)
    else:
        scalar = args.scalar

    if not args.datafiles and not args.readCount:
        print('You should provide data file(s) or the read counts file.')
        sys.exit(1)
    if args.datafiles and args.readCount:
        print('You should choose only ONE option. Either data file(s) or the read counts file.')
        sys.exit(1)

    clust_file = args.clust_file
    if args.datafiles:
        # Create a BAM index for any data file that does not have one yet
        for x in args.datafiles:
            if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
                print("Data file '{0}' does not have an index file. "
                      "Creating an index file for {0}.".format(x))
                pysam.index(x)
        datafiles = [x.strip() for x in args.datafiles]

    fragmentsize = args.fragmentsize
    colors = parse_colors(args.colors)
    scalegroups = process_groups(args.scalegroups)
    percs = [int(x) for x in args.percs.split(",")]
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    bins = args.bins
    summary = args.summary
    fontsize = args.textfontsize
    font = FontProperties(size=fontsize / 1.25,
                          family=["Nimbus Sans L", "Helvetica", "sans-serif"])

    # Calculate the profile data
    if args.datafiles:
        data = load_cluster_data(clust_file, datafiles, bins, rpkm, rmdup,
                                 rmrepeats, fragmentsize=fragmentsize)
        tracks = [os.path.basename(x) for x in datafiles]
        titles = [os.path.splitext(x)[0] for x in tracks]
    else:
        titles, data = load_read_counts(args.readCount)
        tracks = titles
        # Derive the number of bins from the first region of the first track
        for x in data:
            for i in data[x]:
                bins = len(data[x][i])
                break
            break

    # Get cluster information
    cluster_data = load_bed_clusters(clust_file)
    clusters = cluster_data.keys()

    # Init x-axis
    t = arange(bins)

    rows = len(tracks)
    cols = len(clusters)
    if summary:
        rows += 1
        cols += 1

    # Get a figure with a lot of subplots
    fig, axes = create_grid_figure(rows, cols,
                                   plotwidth=PLOTWIDTH, plotheight=PLOTHEIGHT,
                                   padleft=PADLEFT, padtop=PADTOP, pad=PAD,
                                   padright=PADRIGHT, padbottom=PADBOTTOM)

    track_max = []
    for track_num, track in enumerate(tracks):
        percentiles = [scoreatpercentile([data[track][x]
                                          for x in cluster_data[cluster]],
                                         scalar)
                       for cluster in clusters]
        track_max.append(max(percentiles))

    for track_num, track in enumerate(tracks):
        for i, cluster in enumerate(clusters):
            # Retrieve axes
            ax = axes[track_num][i]
            # Get the data
            vals = array([data[track][x] for x in cluster_data[cluster]])
            # Make the plot
            coverage_plot(ax, t, vals, colors[track_num % len(colors)], percs)
            # Get scale max
            maxscale = track_max[track_num]
            if scalegroups and len(scalegroups) > 0:
                for group in scalegroups:
                    if (track_num + 1) in group:
                        maxscale = max([track_max[j - 1] for j in group])
                        break
            # Set scale
            ax.set_ylim(0, maxscale)
            ax.set_xlim(0, bins - 1)
            # Cluster titles
            if track_num == 0:
                ax.set_title("%s\nn=%s" % (cluster,
                                           len(cluster_data[cluster])),
                             font_properties=font)
            # Track title and scale
            if i == 0:
                pos = axes[track_num][0].get_position().get_points()
                text_y = (pos[1][1] + pos[0][1]) / 2
                text_x = pos[0][0] - (PAD / fig.get_figwidth())
                plt.figtext(text_x, text_y, titles[track_num],
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="center", font_properties=font)
                plt.figtext(text_x, pos[1][1], "%.4g" % maxscale,
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="top", font_properties=font)
                plt.figtext(text_x, pos[0][1], 0,
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="bottom", font_properties=font)

    if summary:
        # Summary column: all clusters per track, drawn with increasing alpha
        for i, track in enumerate(tracks):
            ax = axes[i][cols - 1]
            l = len(clusters)
            min_alpha = 0.3
            max_alpha = 0.9
            if l > 1:
                step = (max_alpha - min_alpha) / (l - 1)
                alphas = arange(min_alpha, max_alpha + step, step)
            else:
                alphas = [max_alpha]
            for j, cluster in enumerate(clusters):
                vals = array([data[track][x] for x in cluster_data[cluster]])
                m = median(vals, axis=0)
                ax.plot(arange(len(m)), m, color=colors[i % len(colors)],
                        alpha=alphas[j])
            ax.set_ylim(0, track_max[i])
        # Summary row: all tracks per cluster
        for i, cluster in enumerate(clusters):
            ax = axes[rows - 1][i]
            max_max = 0
            for j, track in enumerate(tracks):
                vals = array([data[track][x] for x in cluster_data[cluster]])
                m = median(vals, axis=0)
                ax.plot(arange(len(m)), m, color=colors[j % len(colors)],
                        alpha=0.8)
                if track_max[j] > max_max:
                    max_max = track_max[j]
            ax.set_ylim(0, max_max)
        # Bottom-right corner stays empty
        ax = axes[rows - 1][cols - 1]
        ax.set_frame_on(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.axes.get_xaxis().set_visible(False)

    print("Saving figure")
    plt.savefig(args.outfile, dpi=600)
def bandplot(args):
    if (0 > args.scalar) or (args.scalar > 100):
        print("ERROR: -P value has to be between 0 and 100")
        sys.exit(1)
    else:
        scalar = args.scalar

    if not args.datafiles and not args.readCount:
        print('You should provide data file(s) or the read counts file.')
        sys.exit(1)
    if args.datafiles and args.readCount:
        print('You should choose only ONE option. Either data file(s) or the read counts file.')
        sys.exit(1)

    clust_file = args.clust_file
    if args.datafiles:
        # Create a BAM index for any data file that does not have one yet
        for x in args.datafiles:
            if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
                print("Data file '{0}' does not have an index file. "
                      "Creating an index file for {0}.".format(x))
                pysam.index(x)
        datafiles = [x.strip() for x in args.datafiles]

    fragmentsize = args.fragmentsize
    colors = parse_colors(args.colors)
    scalegroups = process_groups(args.scalegroups)
    percs = [int(x) for x in args.percs.split(",")]
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    bins = args.bins
    summary = args.summary
    fontsize = args.textfontsize
    font = FontProperties(size=fontsize / 1.25,
                          family=["Nimbus Sans L", "Helvetica", "sans-serif"])

    # Calculate the profile data
    if args.datafiles:
        data = load_cluster_data(clust_file, datafiles, bins, rpkm, rmdup,
                                 rmrepeats, fragmentsize=fragmentsize)
        tracks = [os.path.basename(x) for x in datafiles]
        titles = [os.path.splitext(x)[0] for x in tracks]
    else:
        titles, data = load_read_counts(args.readCount)
        tracks = titles
        # Derive the number of bins from the first region of the first track
        for x in data:
            for i in data[x]:
                bins = len(data[x][i])
                break
            break

    # Get cluster information
    cluster_data = load_bed_clusters(clust_file)
    clusters = cluster_data.keys()

    # Init x-axis
    t = np.arange(bins)

    rows = len(tracks)
    cols = len(clusters)
    if summary:
        rows += 1
        cols += 1

    # Get a figure with a lot of subplots
    fig, axes = create_grid_figure(rows, cols,
                                   plotwidth=cfg.PLOTWIDTH,
                                   plotheight=cfg.PLOTHEIGHT,
                                   padleft=cfg.PADLEFT, padtop=cfg.PADTOP,
                                   pad=cfg.PAD, padright=cfg.PADRIGHT,
                                   padbottom=cfg.PADBOTTOM)

    track_max = []
    for track_num, track in enumerate(tracks):
        percentiles = [scoreatpercentile([data[track][x]
                                          for x in cluster_data[cluster]],
                                         scalar)
                       for cluster in clusters]
        track_max.append(max(percentiles))

    for track_num, track in enumerate(tracks):
        for i, cluster in enumerate(clusters):
            # Retrieve axes
            ax = axes[track_num][i]
            # Get the data
            vals = np.array([data[track][x] for x in cluster_data[cluster]])
            # Make the plot
            coverage_plot(ax, t, vals, colors[track_num % len(colors)], percs)
            # Get scale max
            maxscale = track_max[track_num]
            if scalegroups and len(scalegroups) > 0:
                for group in scalegroups:
                    if (track_num + 1) in group:
                        maxscale = max([track_max[j - 1] for j in group])
                        break
            # Set scale
            ax.set_ylim(0, maxscale)
            ax.set_xlim(0, bins - 1)
            # Cluster titles
            if track_num == 0:
                ax.set_title("%s\nn=%s" % (cluster,
                                           len(cluster_data[cluster])),
                             font_properties=font)
            # Track title and scale
            if i == 0:
                pos = axes[track_num][0].get_position().get_points()
                text_y = (pos[1][1] + pos[0][1]) / 2
                text_x = pos[0][0] - (cfg.PAD / fig.get_figwidth())
                plt.figtext(text_x, text_y, titles[track_num],
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="center", font_properties=font)
                plt.figtext(text_x, pos[1][1], "%.4g" % maxscale,
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="top", font_properties=font)
                plt.figtext(text_x, pos[0][1], 0,
                            clip_on=False, horizontalalignment="right",
                            verticalalignment="bottom", font_properties=font)

    if summary:
        # Summary column: all clusters per track, drawn with increasing alpha
        for i, track in enumerate(tracks):
            ax = axes[i][cols - 1]
            l = len(clusters)
            min_alpha = 0.3
            max_alpha = 0.9
            if l > 1:
                step = (max_alpha - min_alpha) / (l - 1)
                alphas = np.arange(min_alpha, max_alpha + step, step)
            else:
                alphas = [max_alpha]
            for j, cluster in enumerate(clusters):
                vals = np.array([data[track][x]
                                 for x in cluster_data[cluster]])
                m = np.median(vals, axis=0)
                ax.plot(np.arange(len(m)), m, color=colors[i % len(colors)],
                        alpha=alphas[j])
            ax.set_ylim(0, track_max[i])
        # Summary row: all tracks per cluster
        for i, cluster in enumerate(clusters):
            ax = axes[rows - 1][i]
            max_max = 0
            for j, track in enumerate(tracks):
                vals = np.array([data[track][x]
                                 for x in cluster_data[cluster]])
                m = np.median(vals, axis=0)
                ax.plot(np.arange(len(m)), m, color=colors[j % len(colors)],
                        alpha=0.8)
                if track_max[j] > max_max:
                    max_max = track_max[j]
            ax.set_ylim(0, max_max)
        # Bottom-right corner stays empty
        ax = axes[rows - 1][cols - 1]
        ax.set_frame_on(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.axes.get_xaxis().set_visible(False)

    print("Saving figure")
    plt.savefig(args.outfile, dpi=600)
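# A minimal usage sketch for `bandplot` (hedged: attribute names mirror the
# accesses above; values are illustrative only). Exactly one of `datafiles`
# or `readCount` should be set; the other must be None/empty.
#
#   from argparse import Namespace
#
#   args = Namespace(
#       clust_file="heatmap_clusters.bed", datafiles=["sample.bam"],
#       readCount=None, scalar=98, fragmentsize=200, colors="red,blue",
#       scalegroups=None, percs="50,90", rmdup=True, rpkm=False,
#       rmrepeats=True, bins=100, summary=False, textfontsize=8,
#       outfile="bandplot.png")
#   bandplot(args)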