def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute  the scores

    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non-zero bins found.\n"
             "If using --region, please check that this "
             "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # prepend a header line with the column labels
        # to the generated raw-counts file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
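
A minimal sketch (not part of the original script) of how the .npz archive written above can be read back; "scores.npz" is a hypothetical path:

import numpy as np

with np.load("scores.npz") as npz:       # hypothetical output file
    matrix = npz["matrix"]               # shape: (n_bins, n_bigwig_files)
    labels = list(npz["labels"])         # one label per bigWig file
print(matrix.shape, labels)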

        """
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute  the scores

    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this "
             "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # append to the generated file the
        # labels
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # import ipdb;ipdb.set_trace()
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
        """
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute  the scores

    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    # Preload deepBlue files, which then need to be deleted
    deepBlueFiles = []
    for idx, fname in enumerate(args.bwfiles):
        if db.isDeepBlue(fname):
            deepBlueFiles.append([fname, idx])
    if len(deepBlueFiles) > 0:
        sys.stderr.write(
            "Preloading the following deepBlue files: {}\n".format(",".join(
                [x[0] for x in deepBlueFiles])))
        if 'BED' in args:
            regs = db.makeRegions(args.BED, args)
        else:
            foo = db.deepBlue(deepBlueFiles[0][0],
                              url=args.deepBlueURL,
                              userKey=args.userKey)
            regs = db.makeTiles(foo, args)
            del foo
        for x in deepBlueFiles:
            x.extend([args, regs])
        if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999)
        else:
            res = list(map(db.preloadWrapper, deepBlueFiles))

        # substitute the file names with the temp files
        for (ftuple, r) in zip(deepBlueFiles, res):
            args.bwfiles[ftuple[1]] = r
        deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles]
        del regs

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non-zero bins found.\n"
             "If using --region, please check that this "
             "region is covered by reads.\n")

    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # prepend a header line with the column labels
        # to the generated raw-counts file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        f = open(args.outRawCounts, "r+")
        content = f.read()
        f.seek(0, 0)
        f.write(header + content)
        """
        if bed_regions:
            bed_regions.seek(0)
            reg_list = bed_regions.readlines()
            args.outRawCounts.write("#'chr'\t'start'\t'end'\t")
            args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(np.repeat('%s', num_reads_per_bin.shape[1])) + "\n"
            for idx, row in enumerate(num_reads_per_bin):
                args.outRawCounts.write("{}\t{}\t{}\t".format(*reg_list[idx].strip().split("\t")[0:3]))
                args.outRawCounts.write(fmt % tuple(row))

        else:
            args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(np.repeat('{}', num_reads_per_bin.shape[1])) + "\n"
            for row in num_reads_per_bin:
                args.outRawCounts.write(fmt.format(*tuple(row)))
        """
        f.close()

    # Clean up temporary bigWig files, if applicable
    if not args.deepBlueKeepTemp:
        for k, v in deepBlueFiles:
            os.remove(args.bwfiles[v])
    else:
        for k, v in deepBlueFiles:
            print("{} is stored in {}".format(k, args.bwfiles[v]))
Example #4
def get_labels_and_correlation(
        bw_files,
        # chrs_to_skip,
        bin_size=10000,
        method='pearson',
        fileset_name='result',
        blacklist=None,
        labels=bw_labels,        # default label list defined elsewhere in the module
        output_dir=BASE_DIR):    # default output directory defined elsewhere
    my_listUnnested = []
    assert method in ('pearson', 'spearman'), 'Invalid correlation method'
    # Autogenerate labels from filename if not provided
    if not labels:
        labels = [
            filename.split('/')[-1].split('.')[0] for filename in bw_files
        ]
    # Generate a name for the unique combination
    test_name = fileset_name + '_' + method
    if blacklist:
        blacklist_title = 'Blacklisted '
        test_name += '_blacklisted'
    else:
        blacklist_title = ''
    image_name = test_name + '.png'
    # Bin the bigWig data in bin_size (default 10 kb) increments
    num_reads_per_bin = score_bw.getScorePerBin(
        bw_files,
        bin_size,
        # chrsToSkip=chrs_to_skip,
        blackListFileName=blacklist)
    # Write the per-bin scores to a compressed .npz file
    filename = output_dir + test_name + '.npz'
    with open(filename, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=labels)
    # Compute the correlations
    corr = Correlation(filename, method, labels=labels)
    np_array = corr.compute_correlation()
    print("ALERT CORR")
    print(np_array)

    listNested = np_array.tolist()

    def removeNestings(listNest):
        # Recursively flatten the nested list into my_listUnnested
        for i in listNest:
            if isinstance(i, list):
                removeNestings(i)
            else:
                my_listUnnested.append(i)

    removeNestings(listNested)

    print("FINAL CORRELATION VALUES")
    print(my_listUnnested)

    with open("corrScores.txt", "w") as f:
        f.write(str(my_listUnnested))

    plot_title = '{}{} Correlation of {}'.format(blacklist_title,
                                                 method.capitalize(),
                                                 fileset_name)
    # Create a png file of correlation heatmap
    image_path = output_dir + image_name
    corr.plot_correlation(image_path, plot_title=plot_title)

    my_labels_list = labels
    return image_path, my_labels_list, my_listUnnested
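
For reference, the recursive removeNestings walk above is equivalent to a one-liner with numpy (np_array being the correlation matrix computed above):

flat_scores = np_array.flatten().tolist()  # same values, same row-major order
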
Example #5
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute  the scores

    """
    args = process_args(args)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non-zero bins found.\n"
             "If using --region, please check that this "
             "region is covered by reads.\n")

    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # prepend a header line with the column labels
        # to the generated raw-counts file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        f = open(args.outRawCounts, "r+")
        content = f.read()
        f.seek(0, 0)
        f.write(header + content)
        """
        if bed_regions:
            bed_regions.seek(0)
            reg_list = bed_regions.readlines()
            args.outRawCounts.write("#'chr'\t'start'\t'end'\t")
            args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(np.repeat('%s', num_reads_per_bin.shape[1])) + "\n"
            for idx, row in enumerate(num_reads_per_bin):
                args.outRawCounts.write("{}\t{}\t{}\t".format(*reg_list[idx].strip().split("\t")[0:3]))
                args.outRawCounts.write(fmt % tuple(row))

        else:
            args.outRawCounts.write("'" + "'\t'".join(args.labels) + "'\n")
            fmt = "\t".join(np.repeat('{}', num_reads_per_bin.shape[1])) + "\n"
            for row in num_reads_per_bin:
                args.outRawCounts.write(fmt.format(*tuple(row)))
        """
        f.close()
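
For illustration, with hypothetical labels the header built above expands as follows (a sketch, not from the source):

labels = ['treat', 'ctrl']                                   # hypothetical
header = "#'chr'\t'start'\t'end'\t" + "'" + "'\t'".join(labels) + "'\n"
# header == "#'chr'\t'start'\t'end'\t'treat'\t'ctrl'\n"
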
Example #7
def main(args=None):
    """
    1. get read counts at different positions either
    all of same length or from genomic regions from the BED file

    2. compute  correlation

    """
    args = process_args(args)

    if len(args.bwfiles) < 2:
        print("Please input at least two bigWig (.bw) files to compare")
        exit(1)

    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    # args.bwfiles holds open file handles; collect their paths and close them
    bwFiles = []
    for fname in args.bwfiles:
        f = fname.name
        fname.close()
        if f:
            bwFiles.append(f)

    if len(bwFiles) == 0:
        print("No valid bigWig files")
        exit(1)

    num_reads_per_bin = score_bw.getScorePerBin(
        bwFiles,
        args.binSize,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non-zero bins found.\n"
             "If using --region, please check that this "
             "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # prepend a header line with the column labels
        # to the generated raw-counts file
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
        """
Example #8
def get_labels_and_correlation(
        bw_files,
        # chrs_to_skip,
        bin_size=10000,
        method='pearson',
        fileset_name='result',
        blacklist=None,
        labels=bw_labels,  # default label list defined elsewhere in the module
        output_dir='/Users/baditya02/Downloads/treatment-data/graphs/test/'
):
    my_listUnnested = []
    my_labels_list = []
    assert method in ('pearson', 'spearman'), 'Invalid correlation method'
    # Autogenerate labels from filename if not provided
    if not labels:
        labels = [filename.split('/')[-1].split('.')[0] for filename in bw_files]
    # Generate a name for the unique combination
    test_name = fileset_name + '_' + method
    if blacklist:
        blacklist_title = 'Blacklisted '
        test_name += '_blacklisted'
    else:
        blacklist_title = ''
    image_name = test_name + '.png'
    # Bin the bigWig data in bin_size (default 10 kb) increments
    num_reads_per_bin = score_bw.getScorePerBin(
        bw_files,
        bin_size,
        # chrsToSkip=chrs_to_skip,
        blackListFileName=blacklist
    )
    # Write to npz file
    filename = output_dir + test_name + '.npz'
    with open(filename, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=labels)
    # Compute the correlations
    corr = Correlation(filename, method, labels=labels)
    np_array = corr.compute_correlation()
    listNested = np_array.tolist()

    def removeNestings(listNest):
        # Recursively flatten the nested list into my_listUnnested
        for i in listNest:
            if isinstance(i, list):
                removeNestings(i)
            else:
                my_listUnnested.append(i)

    removeNestings(listNested)

    plot_title = '{}{} Correlation of {}'.format(
        blacklist_title,
        method.capitalize(),
        fileset_name
    )
    # Create a png file of correlation heatmap
    image_path = output_dir + image_name
    corr.plot_correlation(image_path, plot_title=plot_title)

    my_labels_list = labels
    return image_path, my_labels_list, my_listUnnested
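
Hypothetical usage of get_labels_and_correlation (the file names and output directory are assumptions, not taken from the source):

image_path, labels, corr_values = get_labels_and_correlation(
    ['sample1.bw', 'sample2.bw'],   # hypothetical bigWig paths
    bin_size=10000,
    method='spearman',
    fileset_name='pilot',
    labels=None,                    # autogenerate labels from file names
    output_dir='/tmp/graphs/')      # must end with '/'
print(labels, corr_values)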