Example #1
def test_mirror_single_track(heatmap_data_single_track):
    from fluff.util import mirror_clusters
    data, labels = heatmap_data_single_track
    assert data["track1"][5][0] == 10
    (i, j) = mirror_clusters(data, labels)
    assert i == 0
    assert j == 1
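
The test relies on a `heatmap_data_single_track` fixture, presumably defined in the project's test suite, which is not shown here. Purely to illustrate the kind of input `mirror_clusters` works on, a minimal sketch of such a fixture follows; the values are an assumption, chosen so that the second cluster's rows are the element-wise reverse of the first's and `data["track1"][5][0] == 10` holds.

import numpy as np
import pytest

# Hypothetical fixture sketch (not the project's actual conftest.py):
# one track whose second cluster is the mirror image of the first.
@pytest.fixture
def heatmap_data_single_track():
    forward = [0, 1, 2, 5, 10]   # profile rising towards the 3' end
    reverse = forward[::-1]      # mirrored profile, starts with 10
    profiles = np.array([forward] * 5 + [reverse] * 5)
    labels = np.array([0] * 5 + [1] * 5)
    return {"track1": profiles}, labels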
Example #2
def heatmap(args):
    datafiles = args.datafiles
    for x in args.datafiles:
        if not os.path.isfile(x):
            print("ERROR: Data file '{0}' does not exist".format(x))
            sys.exit(1)
    for x in args.datafiles:
        if x.endswith('.bam') and not os.path.isfile("{0}.bai".format(x)):
            print(
                "Data file '{0}' does not have an index file. Creating an index file for {0}."
                .format(x))
            pysam.index(x)

    # Options Parser
    featurefile = args.featurefile
    datafiles = [x.strip() for x in args.datafiles]
    tracks = [os.path.basename(x) for x in datafiles]
    titles = [os.path.splitext(x)[0] for x in tracks]
    colors = parse_colors(args.colors)
    bgcolors = parse_colors(args.bgcolors)
    outfile = args.outfile
    extend_up = args.extend
    extend_down = args.extend
    fragmentsize = args.fragmentsize
    cluster_type = args.clustering[0].lower()
    merge_mirrored = args.merge_mirrored
    bins = (extend_up + extend_down) // args.binsize  # integer number of bins
    rmdup = args.rmdup
    rpkm = args.rpkm
    rmrepeats = args.rmrepeats
    ncpus = args.cpus
    distancefunction = args.distancefunction.lower()
    dynam = args.graphdynamics
    fontsize = args.textfontsize
    colorbar = args.colorbar
    seed = args.seed

    # Check for mutually exclusive parameters
    if dynam:
        if merge_mirrored:
            print("ERROR: -m and -g option CANNOT be used together")
            sys.exit(1)
        if distancefunction == 'euclidean':
            print(
                'Dynamics can only be identified using Pearson correlation as metric.'
            )
            print('Assigning metric to Pearson correlation')
            distancefunction = 'pearson'

    # Method of clustering
    if args.pick is not None:
        pick = [i - 1 for i in split_ranges(args.pick)]
        if not all(i <= len(tracks) - 1 for i in pick):
            sys.stderr.write(
                "You picked a non-existent file for clustering.\n")
            sys.exit(1)
    else:
        pick = list(range(len(datafiles)))

    if cluster_type not in ["k", "h", "n"]:
        sys.stderr.write("Unknown clustering type!\n")
        sys.exit(1)
    # Number of clusters
    if cluster_type == "k" and not args.numclusters >= 2:
        sys.stderr.write("Please provide number of clusters!\n")
        sys.exit(1)
    # Distance function
    if distancefunction not in ["euclidean", "pearson"]:
        sys.stderr.write("Unknown distance function!\n")
        sys.exit(1)
    else:
        METRIC = distancefunction
        print("{} distance method".format(METRIC))
    ## Get scale for each track
    tscale = [1.0 for track in datafiles]

    # Function to load heatmap data
    def load_data(featurefile,
                  amount_bins,
                  extend_dyn_up,
                  extend_dyn_down,
                  rmdup,
                  rpkm,
                  rmrepeats,
                  fragmentsize,
                  dynam,
                  guard=None):
        if guard is None:
            guard = []
        # Calculate the profile data
        data = {}
        regions = []
        print("Loading data")
        try:
            # Load data in parallel
            pool = multiprocessing.Pool(processes=ncpus)
            jobs = []
            for datafile in datafiles:
                jobs.append(
                    pool.apply_async(load_heatmap_data,
                                     args=(featurefile, datafile, amount_bins,
                                           extend_dyn_up, extend_dyn_down,
                                           rmdup, rpkm, rmrepeats,
                                           fragmentsize, dynam, guard)))
            for job in jobs:
                track, regions, profile, guard = job.get()
                data[track] = profile
        except Exception as e:
            sys.stderr.write("Error loading data in parallel, trying serial\n")
            sys.stderr.write("Error: {}\n".format(e))
            for datafile in datafiles:
                track, regions, profile, guard = load_heatmap_data(
                    featurefile, datafile, amount_bins, extend_dyn_up,
                    extend_dyn_down, rmdup, rpkm, rmrepeats, fragmentsize,
                    dynam, guard)
                data[track] = profile
        return data, regions, guard

    # -g : Option to try and get dynamics
    # Extend features 1kb up/down stream
    # Cluster them in one bin
    guard = []
    amount_bins = bins
    extend_dyn_up = extend_up
    extend_dyn_down = extend_down
    if dynam:
        # load the data once to get the features which extend below 0
        guard = check_data(featurefile, extend_dyn_up, extend_dyn_down)
        extend_dyn_up = 1000
        extend_dyn_down = 1000
        amount_bins = 1

    # Load data for clustering
    data, regions, guard = load_data(featurefile, amount_bins, extend_dyn_up,
                                     extend_dyn_down, rmdup, rpkm, rmrepeats,
                                     fragmentsize, dynam, guard)

    # Normalize
    norm_data = normalize_data(data, cfg.DEFAULT_PERCENTILE)

    clus = hstack([
        norm_data[t] for i, t in enumerate(tracks) if (not pick or i in pick)
    ])

    ind, labels = cluster_profile(clus,
                                  cluster_type=cluster_type,
                                  numclusters=args.numclusters,
                                  dist=METRIC,
                                  random_state=seed)

    if cluster_type == "k":
        if not dynam and merge_mirrored:
            (i, j) = mirror_clusters(data, labels)
            while j:
                for track in list(data.keys()):
                    data[track][labels == j] = [
                        row[::-1] for row in data[track][labels == j]
                    ]
                for k in range(len(regions)):
                    if labels[k] == j:
                        (chrom, start, end, gene, strand) = regions[k]
                        if strand == "+":
                            strand = "-"
                        else:
                            strand = "+"
                        regions[k] = (chrom, start, end, gene, strand)
                n = len(set(labels))
                labels[labels == j] = i
                for k in range(j + 1, n):
                    labels[labels == k] = k - 1
                (i, j) = mirror_clusters(data, labels)

    # Load data for visualization if -g option was used
    if dynam:
        data, regions, guard = load_data(featurefile, bins, extend_up,
                                         extend_down, rmdup, rpkm, rmrepeats,
                                         fragmentsize, dynam, guard)

    with open("{0}_clusters.bed".format(outfile), "w") as f:
        for (chrom, start, end, gene, strand), cluster in zip(
                array(regions, dtype="object")[ind],
                array(labels)[ind]):
            if not gene:
                f.write("{0}\t{1}\t{2}\t.\t{3}\t{4}\n".format(
                    chrom, start, end, cluster + 1, strand))
            else:
                f.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                    chrom, start, end, gene, cluster + 1, strand))
    # Save read counts
    readcounts = {}
    for track in tracks:
        readcounts[track] = {}
        readcounts[track]['bins'] = []
        for row in data[track]:
            # one semicolon-separated string of per-bin counts per region
            readcounts[track]['bins'].append(';'.join(str(b) for b in row))

    with open('{0}_readCounts.txt'.format(outfile), 'w') as input_fileBins:
        input_fileBins.write('Regions\t')
        for title in titles:
            input_fileBins.write('{0}\t'.format(title))
        input_fileBins.write('\n')
        for idx in ind:
            input_fileBins.write('{0}:{1}-{2}\t'.format(
                regions[idx][0], regions[idx][1], regions[idx][2]))
            for track in tracks:
                input_fileBins.write('{0}\t'.format(
                    readcounts[track]['bins'][idx]))
            input_fileBins.write('\n')

    if not cluster_type == "k":
        labels = None

    scale = get_absolute_scale(args.scale, [data[track] for track in tracks])
    heatmap_plot(data, ind[::-1], outfile, tracks, titles, colors, bgcolors,
                 scale, tscale, labels, fontsize, colorbar)
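
`heatmap` reads all of its settings from the `args` object produced by the command-line parser. As a rough illustration of the attributes it expects, the function could be driven from a plain `argparse.Namespace`; the attribute names below mirror the `args.*` accesses above, while the file names and values are placeholders, not taken from the original project.

import argparse

# Hypothetical invocation sketch; values are made-up placeholders.
args = argparse.Namespace(
    datafiles=["sample1.bam", "sample2.bam"],  # placeholder BAM files
    featurefile="regions.bed",                 # placeholder BED file
    colors="red,blue",
    bgcolors="white,white",
    outfile="heatmap_out",
    extend=5000,           # bp added up- and downstream of each feature
    binsize=100,
    fragmentsize=200,
    clustering="kmeans",   # only the first letter ('k') is used
    numclusters=5,
    merge_mirrored=False,
    rmdup=True,
    rpkm=False,
    rmrepeats=True,
    cpus=4,
    distancefunction="euclidean",
    graphdynamics=False,
    textfontsize=8,
    colorbar=True,
    seed=None,
    pick=None,
    scale="",              # placeholder; accepted values depend on the CLI
)
# heatmap(args)  # would run the full pipeline given real input files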
Example #3
def test_mirror_multiple_track(heatmap_data_multiple_tracks):
    from fluff.util import mirror_clusters
    data, labels = heatmap_data_multiple_tracks
    (i, j) = mirror_clusters(data, labels, 0.05)
    assert i == 1
    assert j == 2
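
As with the single-track test, `heatmap_data_multiple_tracks` is presumably a pytest fixture from the project's test suite and is not shown here. A hypothetical sketch of such a fixture, with values chosen so that clusters 1 and 2 mirror each other while cluster 0 does not, could look like the following (the third argument, 0.05, appears to be a cutoff for deciding whether two clusters count as mirrored).

import numpy as np
import pytest

# Hypothetical fixture sketch (not the project's actual conftest.py):
# two tracks, three clusters; clusters 1 and 2 are mirror images.
@pytest.fixture
def heatmap_data_multiple_tracks():
    other = [7, 2, 9, 1, 4]      # cluster 0: no mirror partner
    forward = [0, 1, 2, 5, 10]   # cluster 1
    reverse = forward[::-1]      # cluster 2, mirror of cluster 1
    profiles = np.array([other] * 5 + [forward] * 5 + [reverse] * 5)
    labels = np.array([0] * 5 + [1] * 5 + [2] * 5)
    data = {"track1": profiles, "track2": profiles.copy()}
    return data, labels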