def test_mirror_single_track(heatmap_data_single_track):
    from fluff.util import mirror_clusters

    data, labels = heatmap_data_single_track
    assert data["track1"][5][0] == 10
    (i, j) = mirror_clusters(data, labels)
    assert i == 0
    assert j == 1
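
# The fixture below is a hypothetical reconstruction (the real one lives in the
# test suite's conftest.py): a single track whose cluster-1 rows are the reversed
# cluster-0 rows, so mirror_clusters() should pair clusters 0 and 1. The names
# and values here are assumptions, not the project's actual fixture data.
import numpy as np
import pytest


@pytest.fixture
def heatmap_data_single_track():
    profile = np.array([
        [1, 2, 3, 4, 5],    # cluster 0
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [5, 4, 3, 2, 1],    # cluster 1: mirror image of cluster 0
        [5, 4, 3, 2, 1],
        [10, 4, 3, 2, 1],   # row 5 starts with 10, as the test asserts
    ])
    labels = np.array([0, 0, 0, 1, 1, 1])
    return {"track1": profile}, labels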
import multiprocessing
import os
import sys

import pysam
from numpy import array, hstack

# The helper functions used below (parse_colors, split_ranges, cluster_profile,
# mirror_clusters, normalize_data, get_absolute_scale, check_data,
# load_heatmap_data, heatmap_plot) and the cfg module are provided elsewhere
# in the fluff package.


def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. "
                  "Creating an index file for {0}.".format(x))
            pysam.index(x)

    # Parse the options
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction.lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize
    colorbar = args.colorbar
    seed = args.seed

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: the -m and -g options cannot be used together")
            sys.exit(1)
        if distancefunction == 'euclidean':
            print('Dynamics can only be identified using Pearson correlation as metric.')
            print('Assigning metric to Pearson correlation')
            distancefunction = 'pearson'

    # Which tracks to use for clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write("You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = list(range(len(datafiles)))

    # Method of clustering
    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide the number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["euclidean", "pearson"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    METRIC = distancefunction
    print("{} distance method".format(METRIC))

    # Scale for each track
    tscale = [1.0 for track in datafiles]

    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down,
                  rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard=None):
        """Calculate the binned profile for every track, in parallel if possible."""
        if guard is None:
            guard = []
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(
                    load_heatmap_data,
                    args=(featurefile, datafile, amount_bins, extend_dyn_up,
                          extend_dyn_down, rmdup, rpkm, rmrepeats,
                          fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            # Fall back to a serial load if multiprocessing fails
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g: option to try and identify dynamics.
    # Extend features 1 kb up/downstream and cluster them in one bin.
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # Load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)
    clus = hstack([norm_data[t] for i, t in enumerate(tracks)
                   if (not pick or i in pick)])

    ind, labels = cluster_profile(clus,
                                  cluster_type=cluster_type,
                                  numclusters=args.numclusters,
                                  dist=METRIC,
                                  random_state=seed)

    if cluster_type == "k":
        if not dynam and merge_mirrored:
            # Merge pairs of clusters whose mean profiles mirror each other:
            # flip the rows of cluster j, swap the strand of its regions,
            # relabel j as i, then compact the remaining cluster labels.
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in list(data.keys()):
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        strand = "-" if strand == "+" else "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

    # Reload the data for visualization if the -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    # Write the clusters to a BED file
    with open("{0}_clusters.bed".format(outfile), "w") as f:
        for (chrom, start, end, gene, strand), cluster in zip(
                array(regions, dtype="object")[ind], array(labels)[ind]):
            if not gene:
                f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                    chrom, start, end, cluster + 1, strand))
            else:
                f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                    chrom, start, end, gene, cluster + 1, strand))

    # Save read counts: one semicolon-separated string of bin values per region
    readcounts = {}
    for track in tracks:
        readcounts[track] = {'bins': []}
        for row in data[track]:
            readcounts[track]['bins'].append(';'.join('{0}'.format(b) for b in row))

    with open('{0}_readCounts.txt'.format(outfile), 'w') as input_fileBins:
        input_fileBins.write('Regions\t')
        for title in titles:
            input_fileBins.write('{0}\t'.format(title))
        input_fileBins.write('\n')
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(
                regions[idx][0], regions[idx][1], regions[idx][2]))
            for track in tracks:
                input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')

    if cluster_type != "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize, colorbar)
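
# Hypothetical invocation sketch: fluff normally builds `args` with argparse;
# the attribute names below simply mirror the fields heatmap() reads, and the
# file paths and option values are placeholders, not recommended defaults.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        datafiles=["sample.bam"], featurefile="peaks.bed", outfile="out",
        colors="red,blue", bgcolors="white",
        extend=5000, binsize=100, fragmentsize=200,
        clustering="kmeans", numclusters=5, merge_mirrored=False,
        rmdup=True, rpkm=False, rmrepeats=True, cpus=4,
        distancefunction="pearson", graphdynamics=False,
        textfontsize=8, colorbar=True, seed=None, pick=None, scale="90%",
    )
    heatmap(args)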
def test_mirror_multiple_track(heatmap_data_multiple_tracks):
    from fluff.util import mirror_clusters

    data, labels = heatmap_data_multiple_tracks
    (i, j) = mirror_clusters(data, labels, 0.05)
    assert i == 1
    assert j == 2
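
# As above, a hypothetical sketch of the multi-track fixture: three clusters
# per track, with clusters 1 and 2 mirroring each other, so that
# mirror_clusters(data, labels, 0.05) should report the pair (1, 2).
import numpy as np
import pytest


@pytest.fixture
def heatmap_data_multiple_tracks():
    up = np.array([[1, 2, 3, 4, 5]] * 3)     # ascending profile (cluster 1)
    down = np.array([[5, 4, 3, 2, 1]] * 3)   # its mirror image (cluster 2)
    flat = np.array([[3, 3, 3, 3, 3]] * 3)   # no mirror partner (cluster 0)
    labels = np.array([0] * 3 + [1] * 3 + [2] * 3)
    data = {
        "track1": np.vstack([flat, up, down]),
        "track2": np.vstack([flat, up, down]),
    }
    return data, labels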