Example #1
def profile(args):
    intervals = [x.strip() for x in args.intervals.split(",")]
    datafiles = [x.strip() for x in args.datafiles]
    annotation = args.annotation
    outfile = args.outfile
    colors = parse_colors(args.colors)

    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
            pysam.index(x)

    trackgroups = process_groups(args.trackgroups)
    if not trackgroups:
        trackgroups = [[x] for x in range(1, len(datafiles) + 1)]

    scalegroups = process_groups(args.scalegroups)
    scale = args.scale
    if scale == "auto":
        scale = True
    elif scale == "off":
        scale = False
    elif scale:
        try:
            scale = [int(x) for x in scale.split(",")]
        except ValueError:
            print "Error in scale argument"
            sys.exit(1)

    if trackgroups and scalegroups:
        if len(trackgroups) != sum([len(x) for x in scalegroups]):
            sys.stderr.write("Track groups and scales do not match!\n")
            sys.exit(1)

    # Group the tracks according to track_groups
    tracks = []
    for group in trackgroups:
        tracks.append([datafiles[i - 1] for i in group])

    # Intervals
    intervals = [split_interval(x) for x in intervals]

    # Create the image
    profile_screenshot(outfile, intervals, tracks,
                       annotation=annotation,
                       scalegroups=scalegroups,
                       fontsize=args.textfontsize,
                       colors=colors,
                       bgmode=args.background,
                       fragmentsize=args.fragmentsize,
                       scale=scale,
                       rmdup=args.rmdup,
                       rmrepeats=args.rmrepeats,
                       reverse=args.reverse
                       )
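The scale handling above accepts three forms. A minimal standalone sketch of just that logic (parse_scale is a hypothetical helper name, not shown in the package code above):

import sys

def parse_scale(scale):
    # Mirrors the branches above: "auto" -> True, "off" -> False,
    # and a comma-separated list such as "100,200" -> [100, 200].
    # A falsy value (e.g. None) is passed through unchanged.
    if scale == "auto":
        return True
    if scale == "off":
        return False
    if not scale:
        return scale
    try:
        return [int(x) for x in scale.split(",")]
    except ValueError:
        sys.stderr.write("Error in scale argument\n")
        sys.exit(1)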
Example #2
def profile(args):
    interval = args.interval
    datafiles = [x.strip() for x in args.datafiles]
    annotation = args.annotation
    outfile = args.outfile
    colors = parse_colors(args.colors)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print(
                "Data file '{0}' does not have an index file. Creating an index file for {0}."
                .format(x))
            pysam.index(x)

    trackgroups = process_groups(args.trackgroups)
    if not trackgroups:
        trackgroups = [[x] for x in range(1, len(datafiles) + 1)]

    scalegroups = process_groups(args.scalegroups)
    scale = args.scale
    if scale:
        try:
            scale = [float(x) for x in scale.split(",")]
        except Exception:
            print("Error in scale argument")
            sys.exit(1)

    if trackgroups and scalegroups:
        if len(trackgroups) != sum([len(x) for x in scalegroups]):
            sys.stderr.write("Track groups and scales do not match!\n")
            sys.exit(1)

    # Group the tracks according to track_groups
    tracks = []
    for group in trackgroups:
        tracks.append([datafiles[i - 1] for i in group])

    # Create the image
    profile_screenshot(outfile,
                       interval,
                       tracks,
                       annotation=annotation,
                       scalegroups=scalegroups,
                       fontsize=args.textfontsize,
                       colors=colors,
                       bgmode=args.background,
                       fragmentsize=args.fragmentsize,
                       scale=scale,
                       show_scale=args.show_scale,
                       rmdup=args.rmdup,
                       rmrepeats=args.rmrepeats,
                       reverse=args.reverse,
                       adjscale=args.adjscale)
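profile() appears to be driven by the package's command-line parser; for ad-hoc testing, an argparse.Namespace carrying the attributes the function reads works just as well. The attribute names below come straight from the function body; every value is an illustrative placeholder, not a documented default:

import argparse

args = argparse.Namespace(
    interval="chr1:1000000-1100000",   # hypothetical region
    datafiles=["sample1.bam", "sample2.bam"],
    annotation=None,
    outfile="profile.png",
    colors="red,blue",                 # assumed input format for parse_colors()
    trackgroups="",
    scalegroups="",
    scale="",
    textfontsize=10,
    background="white",                # assumed bgmode value
    fragmentsize=200,
    show_scale=True,
    rmdup=False,
    rmrepeats=False,
    reverse=False,
    adjscale=False,
)
profile(args)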
Example #3
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print(
                "Data file '{0}' does not have an index file. Creating an index file for {0}."
                .format(x))
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction.lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize
    colorbar = args.colorbar
    seed = args.seed

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: -m and -g option CANNOT be used together")
            sys.exit(1)
        if distancefunction == 'e':
            print(
                'Dynamics can only be identified using Pearson correlation as metric.'
            )
            print('Assigning metric to Pearson correlation')
            distancefunction = 'p'

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write(
                "You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = list(range(len(datafiles)))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and args.numclusters < 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["euclidean", "pearson"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        METRIC = distancefunction
        print("{} distance method".format(METRIC))
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile,
                  amount_bins,
                  extend_dyn_up,
                  extend_dyn_down,
                  rmdup,
                  rpkm,
                  rmrepeats,
                  fragmentsize,
                  dynam,
                  guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(
                    pool.apply_async(load_heatmap_data,
                                     args=(featurefile, datafile, amount_bins,
                                           extend_dyn_up, extend_dyn_down,
                                           rmdup, rpkm, rmrepeats,
                                           fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)

    clus = hstack([
        norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)
    ])

    ind, labels = cluster_profile(clus,
                                  cluster_type=cluster_type,
                                  numclusters=args.numclusters,
                                  dist=METRIC,
                                  random_state=seed)

    if cluster_type == "k":
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in list(data.keys()):
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(
            array(regions, dtype="object")[ind],
            array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            readcounts[track]['bins'].append(";".join("{0}".format(b) for b in row))

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    # One row per region; columns follow the track order in the header
    for idx in ind:
        input_fileBins.write('{0}:{1}-{2}\t'.format(
            regions[idx][0], regions[idx][1], regions[idx][2]))
        for track in tracks:
            input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
        input_fileBins.write('\n')
    input_fileBins.close()

    if cluster_type != "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize, colorbar)
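The relabeling inside the merge_mirrored loop is the subtle part: cluster j is folded into cluster i, then every label above j is shifted down so the label range stays contiguous. A self-contained sketch of just that bookkeeping, with made-up labels:

import numpy as np

def merge_cluster_labels(labels, i, j):
    # Fold cluster j into cluster i, then close the gap left by j
    # (the same steps as the while-loop above).
    labels = labels.copy()
    n = len(set(labels.tolist()))
    labels[labels == j] = i
    for k in range(j + 1, n):
        labels[labels == k] = k - 1
    return labels

print(merge_cluster_labels(np.array([0, 1, 2, 3, 2, 1]), 0, 2))
# -> [0 1 0 2 0 1]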
Example #4
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print "ERROR: Data file '{0}' does not exist".format(x)
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) / args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction[0].lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print "ERROR: -m and -g option CANNOT be used together"
            sys.exit(1)
        if distancefunction == 'e':
            print 'Dynamics can only be identified using Pearson correlation as metric.'
            print 'Assigning metric to Pearson correlation'
            distancefunction = 'p'

    # Warn about too many files
    if len(tracks) > 4:
        print "Warning: Running fluff with too many files might make your system use an enormous amount of memory!"
    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
    else:
        pick = range(len(datafiles))
    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and args.numclusters < 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["e", "p"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        if distancefunction == "e":
            METRIC = DEFAULT_METRIC
            print "Euclidean distance method"
        else:
            METRIC = "c"
            print "Pearson distance method"
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam,
                  guard=None):
        # Avoid a shared mutable default argument
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print "Loading data"
        try:
            # Load data in parallel
            import multiprocessing
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(load_heatmap_data,
                                             args=(featurefile, datafile, amount_bins,
                                                   extend_dyn_up, extend_dyn_down,
                                                   rmdup, rpkm, rmrepeats,
                                                   fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel ({0}), trying serial\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(featurefile, datafile, amount_bins, extend_dyn_up,
                                                                   extend_dyn_down, rmdup, rpkm, rmrepeats,
                                                                   fragmentsize, dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    if dynam:
        amount_bins = 1
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, guard, dynam, extend_dyn_up, extend_dyn_down)
    else:
        amount_bins = bins
        extend_dyn_up = extend_up
        extend_dyn_down = extend_down

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)
    # Normalize
    norm_data = normalize_data(data, DEFAULT_PERCENTILE)

    clus = hstack([norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)])

    # Clustering
    if cluster_type == "k":
        print "K-means clustering"
        ## K-means clustering
        # PyCluster
        labels, error, nfound = Pycluster.kcluster(clus, args.numclusters, dist=METRIC)
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in data.keys():
                    data[track][labels == j] = [row[::-1] for row in data[track][labels == j]]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

        ind = labels.argsort()

    # Hierarchical clustering
    elif cluster_type == "h":
        print "Hierarchical clustering"
        tree = Pycluster.treecluster(clus, method="m", dist=METRIC)
        labels = tree.cut(args.numclusters)
        ind = sort_tree(tree, arange(len(regions)))
    else:
        ind = arange(len(regions))
        labels = zeros(len(regions))


    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up, extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            readcounts[track]['bins'].append(";".join("{0}".format(b) for b in row))

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    # One row per region; columns follow the track order in the header
    for idx in ind:
        input_fileBins.write('{0}:{1}-{2}\t'.format(regions[idx][0], regions[idx][1], regions[idx][2]))
        for track in tracks:
            input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
        input_fileBins.write('\n')
    input_fileBins.close()

    if cluster_type != "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors, scale, tscale, labels, fontsize)
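load_data above uses a common pattern: try a multiprocessing.Pool first, fall back to a plain loop if anything goes wrong. A generic sketch of that pattern (the names are illustrative, not part of the package):

import sys
import multiprocessing

def map_with_fallback(func, arg_tuples, ncpus):
    # Run func over a list of argument tuples in parallel; if the pool
    # fails for any reason, retry serially so the run still completes.
    # (func must be picklable, i.e. defined at module top level.)
    try:
        pool = multiprocessing.Pool(processes=ncpus)
        jobs = [pool.apply_async(func, args) for args in arg_tuples]
        results = [job.get() for job in jobs]
        pool.close()
        return results
    except Exception as e:
        sys.stderr.write("Parallel run failed ({0}), trying serial\n".format(e))
        return [func(*args) for args in arg_tuples]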
Example #5
def bandplot(args):
    if not 0 <= args.scalar <= 100:
        print "ERROR: -P value has to be between 0 and 100"
        sys.exit(1)
    scalar = args.scalar

    if not args.datafiles and not args.readCount:
        print 'You should provide data file(s) or the read counts file.'
        sys.exit(1)
    if args.datafiles and args.readCount:
        print 'You should choose only ONE option: either data file(s) or the read counts file.'
        sys.exit(1)


    clust_file = args.clust_file

    if args.datafiles:
        for x in args.datafiles:
            if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
                print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
                pysam.index(x)
        datafiles = [x.strip() for x in args.datafiles]

    fragmentsize = args.fragmentsize
    colors = parse_colors(args.colors)
    scalegroups = process_groups(args.scalegroups)
    percs = [int(x) for x in args.percs.split(",")]
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    bins = args.bins
    summary = args.summary
    fontsize = args.textfontsize
    font = FontProperties(size=fontsize / 1.25, family=["Nimbus Sans L", "Helvetica", "sans-serif"])
    # Calculate the profile data
    if args.datafiles:
        data = load_cluster_data(clust_file, datafiles, bins, rpkm, rmdup, rmrepeats, fragmentsize=fragmentsize)
        tracks = [os.path.basename(x) for x in datafiles]
        titles = [os.path.splitext(x)[0] for x in tracks]
    else:
        titles, data = load_read_counts(args.readCount)
        tracks = titles
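        # Infer the bin count from the first region of the first track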
        for x in data:
            for i in data[x]:
                bins = len(data[x][i])
                break
            break

    # Get cluster information
    cluster_data = load_bed_clusters(clust_file)
    clusters = cluster_data.keys()
    #Init x-axis
    t = arange(bins)
    rows = len(tracks)
    cols = len(clusters)
    if summary:
        rows += 1
        cols += 1
    # Get a figure with a lot of subplots
    fig, axes = create_grid_figure(rows, cols, plotwidth=PLOTWIDTH, plotheight=PLOTHEIGHT,
                                   padleft=PADLEFT, padtop=PADTOP, pad=PAD,
                                   padright=PADRIGHT, padbottom=PADBOTTOM)
    track_max = []
    for track_num, track in enumerate(tracks):
        percentiles = [scoreatpercentile([data[track][x] for x in cluster_data[cluster]], scalar) for cluster in clusters]
        track_max.append(max(percentiles))
    for track_num, track in enumerate(tracks):
        for i, cluster in enumerate(clusters):
            # Retrieve axes
            ax = axes[track_num][i]
            # Get the data
            vals = array([data[track][x] for x in cluster_data[cluster]])
            # Make the plot
            coverage_plot(ax, t, vals, colors[track_num % len(colors)], percs)
            # Get scale max
            maxscale = track_max[track_num]
            if scalegroups and len(scalegroups) > 0:
                for group in scalegroups:
                    if (track_num + 1) in group:
                        maxscale = max([track_max[j - 1] for j in group])
                        break
            # Set scale
            ax.set_ylim(0, maxscale)
            ax.set_xlim(0, bins - 1)
            # Cluster titles
            if track_num == 0:
                ax.set_title("%s\nn=%s" % (cluster, len(cluster_data[cluster])), font_properties=font)
            # Track title and scale
            if i == 0:
                pos = axes[track_num][0].get_position().get_points()
                text_y = (pos[1][1] + pos[0][1]) / 2
                text_x = pos[0][0] - (PAD / fig.get_figwidth())
                plt.figtext(text_x, text_y, titles[track_num], clip_on=False, horizontalalignment="right", verticalalignment="center", font_properties=font)
                plt.figtext(text_x,  pos[1][1], "%.4g" % maxscale, clip_on=False, horizontalalignment="right", verticalalignment="top", font_properties=font)
                plt.figtext(text_x,  pos[0][1], 0, clip_on=False, horizontalalignment="right", verticalalignment="bottom", font_properties=font)
    if summary:
        for i, track in enumerate(tracks):
            ax = axes[i][cols - 1]
            nclust = len(clusters)
            min_alpha = 0.3
            max_alpha = 0.9
            if nclust > 1:
                step = (max_alpha - min_alpha) / (nclust - 1)
                alphas = arange(min_alpha, max_alpha + step, step)
            else:
                alphas = [max_alpha]
            for j, cluster in enumerate(clusters):
                vals = array([data[track][x] for x in cluster_data[cluster]])
                m = median(vals, axis=0)
                ax.plot(arange(len(m)), m, color=colors[i % len(colors)], alpha=alphas[j])
            ax.set_ylim(0, track_max[i])
        for i, cluster in enumerate(clusters):
            ax = axes[rows - 1][i]
            max_max = 0
            for j, track in enumerate(tracks):
                vals = array([data[track][x] for x in cluster_data[cluster]])
                m = median(vals, axis=0)
                ax.plot(arange(len(m)), m, color=colors[j % len(colors)], alpha=0.8)
                if track_max[j] > max_max:
                    max_max = track_max[j]
            ax.set_ylim(0, max_max)
        ax = axes[rows - 1][cols - 1]
        ax.set_frame_on(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.axes.get_xaxis().set_visible(False)
    print "Saving figure"
    plt.savefig(args.outfile, dpi=600)
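The summary panel builds its alpha ramp with arange and a computed step, which can yield one value too many with float steps. A linspace-based sketch of the same idea (cluster_alphas is a hypothetical helper):

import numpy as np

def cluster_alphas(n, min_alpha=0.3, max_alpha=0.9):
    # n evenly spaced line transparencies from min_alpha to max_alpha;
    # linspace avoids the floating-point edge cases of np.arange.
    if n > 1:
        return np.linspace(min_alpha, max_alpha, n)
    return np.array([max_alpha])

print(cluster_alphas(4))  # [0.3 0.5 0.7 0.9]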
Example #6
for opt in [options.clust_file, options.datafiles, options.outfile]:
    if not opt:
        parser.print_help()
        sys.exit()
clust_file = options.clust_file
for x in options.datafiles.split(","):
  if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
    print "Data file '{0}' does not have an index file".format(x)
    print "Creating an index file for {0}".format(x)
    pysam.index(x)
    print "Done!"
datafiles = [x.strip() for x in options.datafiles.split(",")]
fragmentsize = options.fragmentsize
tracks = [os.path.basename(x) for x in datafiles]
titles = [os.path.splitext(x)[0] for x in tracks]
colors = parse_colors(options.colors)
scalegroups = process_groups(options.scalegroups)
percs = [int(x) for x in options.percs.split(",")]
rmdup = options.rmdup
rpkm = options.rpkm
rmrepeats = options.rmrepeats
bins = options.bins
summary = options.summary
# Calculate the profile data
data = load_cluster_data(clust_file, datafiles, bins, rpkm, rmdup, rmrepeats, fragmentsize=fragmentsize)
# Get cluster information
cluster_data = load_bed_clusters(clust_file)
clusters = cluster_data.keys()
#Init x-axis
t = arange(bins)
rows = len(tracks)
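The BAM index check recurs in every snippet here. A consolidated sketch (ensure_bam_index is a hypothetical helper; note that endswith() is stricter than the substring test '.bam' in x used above):

import os
import pysam

def ensure_bam_index(path):
    # Create a .bai index next to the BAM file if it is missing,
    # matching the "{0}.bai" naming convention used above.
    if path.endswith(".bam") and not os.path.isfile("{0}.bai".format(path)):
        pysam.index(path)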
Example #7
0
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in args.datafiles:
        if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
            print("Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x))
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction.lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize
    colorbar = args.colorbar
    seed = args.seed

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: -m and -g option CANNOT be used together")
            sys.exit(1)
        if distancefunction == 'e':
            print('Dynamics can only be identified using Pearson correlation as metric.')
            print('Assigning metric to Pearson correlation')
            distancefunction = 'p'

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write("You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = list(range(len(datafiles)))


    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and args.numclusters < 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["euclidean", "pearson"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        METRIC = distancefunction
        print("{} distance method".format(METRIC))
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize, dynam,
                  guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(pool.apply_async(load_heatmap_data,
                                             args=(featurefile, datafile, amount_bins,
                                                   extend_dyn_up, extend_dyn_down,
                                                   rmdup, rpkm, rmrepeats,
                                                   fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(featurefile, datafile, amount_bins, extend_dyn_up,
                                                                   extend_dyn_down, rmdup, rpkm, rmrepeats,
                                                                   fragmentsize, dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up, extend_dyn_down,
                                     rmdup, rpkm, rmrepeats, fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)

    clus = hstack([norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)])

    ind, labels = cluster_profile(clus,
                                  cluster_type=cluster_type,
                                  numclusters=args.numclusters,
                                  dist=METRIC,
                                  random_state=seed)

    if cluster_type == "k":
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in list(data.keys()):
                    data[track][labels == j] = [row[::-1] for row in data[track][labels == j]]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up, extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    f = open("{0}_clusters.bed".format(outfile), "w")
    for (chrom, start, end, gene, strand), cluster in zip(array(regions, dtype="object")[ind], array(labels)[ind]):
        if not gene:
            f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(chrom, start, end, cluster + 1, strand))
        else:
            f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(chrom, start, end, gene, cluster + 1, strand))
    f.close()
    # Save read counts
    readcounts = {}
    for i, track in enumerate(tracks):
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            readcounts[track]['bins'].append(";".join("{0}".format(b) for b in row))

    input_fileBins = open('{0}_readCounts.txt'.format(outfile), 'w')
    input_fileBins.write('Regions\t')
    for i, track in enumerate(titles):
        input_fileBins.write('{0}\t'.format(track))
    input_fileBins.write('\n')
    # One row per region; columns follow the track order in the header
    for idx in ind:
        input_fileBins.write('{0}:{1}-{2}\t'.format(regions[idx][0], regions[idx][1], regions[idx][2]))
        for track in tracks:
            input_fileBins.write('{0}\t'.format(readcounts[track]['bins'][idx]))
        input_fileBins.write('\n')
    input_fileBins.close()

    if cluster_type != "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors, scale, tscale, labels, fontsize, colorbar)
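For reference, the {outfile}_readCounts.txt file written above has a 'Regions' header row followed by one tab-separated row per region, with semicolon-joined per-bin counts for each track. A minimal independent reader sketch, based only on the writer code above (the package's own load_read_counts, used by the bandplot examples, is the real counterpart):

def read_counts_table(path):
    # Returns the track titles and a dict mapping 'chrom:start-end'
    # to a list (one entry per track) of per-bin counts.
    with open(path) as f:
        header = f.readline().rstrip("\n").split("\t")
        titles = [t for t in header[1:] if t]  # drop 'Regions' and the trailing tab
        table = {}
        for line in f:
            fields = [v for v in line.rstrip("\n").split("\t") if v]
            region, counts = fields[0], fields[1:]
            table[region] = [[float(x) for x in c.split(";")] for c in counts]
    return titles, table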
Example #8
def bandplot(args):
    if not 0 <= args.scalar <= 100:
        print "ERROR: -P value has to be between 0 and 100"
        sys.exit(1)
    scalar = args.scalar

    if not args.datafiles and not args.readCount:
        print 'You should provide data file(s) or the read counts file.'
        sys.exit(1)
    if args.datafiles and args.readCount:
        print 'You should choose only ONE option: either data file(s) or the read counts file.'
        sys.exit(1)


    clust_file = args.clust_file

    if args.datafiles:
        for x in args.datafiles:
            if '.bam' in x and not os.path.isfile("{0}.bai".format(x)):
                print "Data file '{0}' does not have an index file. Creating an index file for {0}.".format(x)
                pysam.index(x)
        datafiles = [x.strip() for x in args.datafiles]

    fragmentsize = args.fragmentsize
    colors = parse_colors(args.colors)
    scalegroups = process_groups(args.scalegroups)
    percs = [int(x) for x in args.percs.split(",")]
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    bins = args.bins
    summary = args.summary
    fontsize = args.textfontsize
    font = FontProperties(size=fontsize / 1.25, family=["Nimbus Sans L", "Helvetica", "sans-serif"])
    # Calculate the profile data
    if args.datafiles:
        data = load_cluster_data(clust_file, datafiles, bins, rpkm, rmdup, rmrepeats, fragmentsize=fragmentsize)
        tracks = [os.path.basename(x) for x in datafiles]
        titles = [os.path.splitext(x)[0] for x in tracks]
    else:
        titles, data = load_read_counts(args.readCount)
        tracks = titles
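        # Infer the bin count from the first region of the first track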
        for x in data:
            for i in data[x]:
                bins = len(data[x][i])
                break
            break

    # Get cluster information
    cluster_data = load_bed_clusters(clust_file)
    clusters = cluster_data.keys()
    #Init x-axis
    t = np.arange(bins)
    rows = len(tracks)
    cols = len(clusters)
    if summary:
        rows += 1
        cols += 1
    # Get a figure with a lot of subplots
    fig, axes = create_grid_figure(rows, cols,
                                   plotwidth=cfg.PLOTWIDTH,
                                   plotheight=cfg.PLOTHEIGHT,
                                   padleft=cfg.PADLEFT,
                                   padtop=cfg.PADTOP,
                                   pad=cfg.PAD,
                                   padright=cfg.PADRIGHT,
                                   padbottom=cfg.PADBOTTOM)
    track_max = []
    for track_num, track in enumerate(tracks):
        percentiles = [scoreatpercentile([data[track][x] for x in cluster_data[cluster]], scalar) for cluster in clusters]
        track_max.append(max(percentiles))
    for track_num, track in enumerate(tracks):
        for i, cluster in enumerate(clusters):
            # Retrieve axes
            ax = axes[track_num][i]
            # Get the data
            vals = np.array([data[track][x] for x in cluster_data[cluster]])
            # Make the plot
            coverage_plot(ax, t, vals, colors[track_num % len(colors)], percs)
            # Get scale max
            maxscale = track_max[track_num]
            if scalegroups and len(scalegroups) > 0:
                for group in scalegroups:
                    if (track_num + 1) in group:
                        maxscale = max([track_max[j - 1] for j in group])
                        break
            # Set scale
            ax.set_ylim(0, maxscale)
            ax.set_xlim(0, bins - 1)
            # Cluster titles
            if track_num == 0:
                ax.set_title("%s\nn=%s" % (cluster, len(cluster_data[cluster])), font_properties=font)
            # Track title and scale
            if i == 0:
                pos = axes[track_num][0].get_position().get_points()
                text_y = (pos[1][1] + pos[0][1]) / 2
                text_x = pos[0][0] - (cfg.PAD / fig.get_figwidth())
                plt.figtext(text_x, text_y, titles[track_num], clip_on=False, horizontalalignment="right", verticalalignment="center", font_properties=font)
                plt.figtext(text_x,  pos[1][1], "%.4g" % maxscale, clip_on=False, horizontalalignment="right", verticalalignment="top", font_properties=font)
                plt.figtext(text_x,  pos[0][1], 0, clip_on=False, horizontalalignment="right", verticalalignment="bottom", font_properties=font)
    if summary:
        for i, track in enumerate(tracks):
            ax = axes[i][cols - 1]
            nclust = len(clusters)
            min_alpha = 0.3
            max_alpha = 0.9
            if nclust > 1:
                step = (max_alpha - min_alpha) / (nclust - 1)
                alphas = np.arange(min_alpha, max_alpha + step, step)
            else:
                alphas = [max_alpha]
            for j, cluster in enumerate(clusters):
                vals = np.array([data[track][x] for x in cluster_data[cluster]])
                m = np.median(vals, axis=0)
                ax.plot(np.arange(len(m)), m, color=colors[i % len(colors)], alpha=alphas[j])
            ax.set_ylim(0, track_max[i])
        for i, cluster in enumerate(clusters):
            ax = axes[rows - 1][i]
            max_max = 0
            for j, track in enumerate(tracks):
                vals = np.array([data[track][x] for x in cluster_data[cluster]])
                m = np.median(vals, axis=0)
                ax.plot(np.arange(len(m)), m, color=colors[j % len(colors)], alpha=0.8)
                if track_max[j] > max_max:
                    max_max = track_max[j]
            ax.set_ylim(0, max_max)
        ax = axes[rows - 1][cols - 1]
        ax.set_frame_on(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.axes.get_xaxis().set_visible(False)
    print "Saving figure"
    plt.savefig(args.outfile, dpi=600)