def main(args=None):
    """
    1. get read counts at different positions either
       all of same length or from genomic regions from the BED file
    2. compute the scores

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; parsed by ``process_args`` (which
        presumably falls back to ``sys.argv`` — TODO confirm).
    """
    args = process_args(args)

    # Regions may come from a BED file; otherwise genome-wide bins are used.
    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # FIX: sys.exit() instead of the site-module exit() helper, which
        # is not guaranteed to exist (e.g. under "python -S").
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # Prepend the label header to the raw-counts file that
        # getScorePerBin already wrote.  args.outRawCounts appears to be an
        # open file object here (it has a .name attribute) — TODO confirm.
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # FIX: removed leftover ipdb debug comment and a dangling,
        # unmatched triple-quote at the end of the function.
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
def main(args=None):
    """
    1. get read counts at different positions either
       all of same length or from genomic regions from the BED file
    2. compute the scores

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; parsed by ``process_args``.
    """
    args = process_args(args)

    # Regions may come from a BED file; otherwise genome-wide bins are used.
    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    # Preload deepBlue files, which are materialized as local temporary
    # bigWig files and need to be deleted afterwards (see cleanup below).
    deepBlueFiles = []
    for idx, fname in enumerate(args.bwfiles):
        if db.isDeepBlue(fname):
            deepBlueFiles.append([fname, idx])
    if len(deepBlueFiles) > 0:
        sys.stderr.write(
            "Preloading the following deepBlue files: {}\n".format(",".join(
                [x[0] for x in deepBlueFiles])))
        if 'BED' in args:
            regs = db.makeRegions(args.BED, args)
        else:
            foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey)
            regs = db.makeTiles(foo, args)
            del foo
        for x in deepBlueFiles:
            x.extend([args, regs])
        if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            try:
                res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999)
            finally:
                # FIX: the pool was never closed, leaving worker processes
                # alive for the remainder of the run.
                pool.close()
                pool.join()
        else:
            res = list(map(db.preloadWrapper, deepBlueFiles))

        # substitute the remote file names with the local temp files
        for (ftuple, r) in zip(deepBlueFiles, res):
            args.bwfiles[ftuple[1]] = r
        deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles]
        del regs

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # FIX: sys.exit() instead of the site-module exit() helper.
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    # FIX: context managers so the files are closed even if a write fails.
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # Prepend the label header to the raw-counts file that
        # getScorePerBin already wrote.
        # FIX: removed a large block of dead code that was kept alive as a
        # bare triple-quoted string literal.
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    # Clean up temporary bigWig files, if applicable
    if not args.deepBlueKeepTemp:
        for k, v in deepBlueFiles:
            os.remove(args.bwfiles[v])
    else:
        for k, v in deepBlueFiles:
            print("{} is stored in {}".format(k, args.bwfiles[v]))
def get_labels_and_correlation(
        bw_files,
        # chrs_to_skip,
        bin_size=10000,
        method='pearson',
        fileset_name='result',
        blacklist=None,
        labels=bw_labels,
        output_dir=BASE_DIR):
    """Bin bigWig files, compute their pairwise correlation and plot it.

    Parameters
    ----------
    bw_files : list of str
        Paths to the bigWig files to compare.
    bin_size : int
        Genomic bin size used to summarize the signal.
    method : str
        Correlation method, either 'pearson' or 'spearman'.
    fileset_name : str
        Base name used for the .npz matrix and .png heatmap outputs.
    blacklist : str or None
        Optional BED file of regions to exclude.
    labels : list of str or None
        Column labels; derived from the file names when falsy.
    output_dir : str
        Directory (with trailing separator) where outputs are written.

    Returns
    -------
    tuple
        (image_path, labels, flattened correlation values).

    Raises
    ------
    ValueError
        If ``method`` is not a supported correlation method.
    """
    my_listUnnested = []
    # FIX: raise instead of assert — assert is stripped under "python -O",
    # which would silently allow an invalid method through.
    if method not in ('pearson', 'spearman'):
        raise ValueError('Invalid correlation method: {!r}'.format(method))

    # Autogenerate labels from filename if not provided
    if not labels:
        labels = [
            filename.split('/')[-1].split('.')[0] for filename in bw_files
        ]

    # Generate a name for the unique combination
    test_name = fileset_name + '_' + method
    if blacklist:
        blacklist_title = 'Blacklisted'
        test_name += '_blacklisted'
    else:
        blacklist_title = ''
    image_name = test_name + '.png'

    # Bin the bigwig data (bin_size defaults to 10 kb)
    num_reads_per_bin = score_bw.getScorePerBin(
        bw_files,
        bin_size,
        # chrsToSkip=chrs_to_skip,
        blackListFileName=blacklist)

    # Write the binned matrix to an npz file.
    # FIX: removed leftover debug output (os.system('pwd'),
    # os.system('ls -lat') and a series of tracing print() calls).
    filename = output_dir + test_name + '.npz'
    with open(filename, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=labels)

    # Compute the correlations
    corr = Correlation(filename, method, labels=labels)
    np_array = corr.compute_correlation()

    def removeNestings(listNest):
        # Depth-first, left-to-right flatten of the (nested) list form of
        # the correlation matrix into my_listUnnested.
        for i in listNest:
            if type(i) == list:
                removeNestings(i)
            else:
                my_listUnnested.append(i)

    removeNestings(np_array.tolist())

    with open("corrScores.txt", "w") as f:
        f.write(str(my_listUnnested))

    plot_title = '{}{} Correlation of {}'.format(blacklist_title,
                                                 method.capitalize(),
                                                 fileset_name)
    # Create a png file of correlation heatmap
    image_path = output_dir + image_name
    corr.plot_correlation(image_path, plot_title=plot_title)

    my_labels_list = labels
    return image_path, my_labels_list, my_listUnnested
def main(args=None):
    """
    1. get read counts at different positions either
       all of same length or from genomic regions from the BED file
    2. compute the scores

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; parsed by ``process_args``.
    """
    args = process_args(args)

    # Regions may come from a BED file; otherwise genome-wide bins are used.
    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # FIX: sys.exit() instead of the site-module exit() helper.
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    # FIX: context managers so the files are closed even if a write fails.
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # Prepend the label header to the raw-counts file that
        # getScorePerBin already wrote.
        # FIX: removed a large block of dead code that was kept alive as a
        # bare triple-quoted string literal.
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
def main(args=None):
    """
    1. get read counts at different positions either
       all of same length or from genomic regions from the BED file
    2. compute the scores

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; parsed by ``process_args``.
    """
    args = process_args(args)

    # Regions may come from a BED file; otherwise genome-wide bins are used.
    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    if len(args.bwfiles) == 1 and not args.outRawCounts:
        sys.stderr.write("You've input a single bigWig file and not specified "
                         "--outRawCounts. The resulting output will NOT be "
                         "useful with any deepTools program!\n")

    # Preload deepBlue files, which are materialized as local temporary
    # bigWig files and need to be deleted afterwards (see cleanup below).
    deepBlueFiles = []
    for idx, fname in enumerate(args.bwfiles):
        if db.isDeepBlue(fname):
            deepBlueFiles.append([fname, idx])
    if len(deepBlueFiles) > 0:
        sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles])))
        if 'BED' in args:
            regs = db.makeRegions(args.BED, args)
        else:
            foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey)
            regs = db.makeTiles(foo, args)
            del foo
        for x in deepBlueFiles:
            x.extend([args, regs])
        if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1:
            pool = multiprocessing.Pool(args.numberOfProcessors)
            try:
                res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999)
            finally:
                # FIX: the pool was never closed, leaving worker processes
                # alive for the remainder of the run.
                pool.close()
                pool.join()
        else:
            res = list(map(db.preloadWrapper, deepBlueFiles))

        # substitute the remote file names with the local temp files
        for (ftuple, r) in zip(deepBlueFiles, res):
            args.bwfiles[ftuple[1]] = r
        deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles]
        del regs

    num_reads_per_bin = score_bw.getScorePerBin(
        args.bwfiles,
        args.binSize,
        blackListFileName=args.blackListFileName,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts,
        allArgs=args)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        # FIX: sys.exit() instead of the site-module exit() helper.
        sys.exit("ERROR: too few non zero bins found.\n"
                 "If using --region please check that this "
                 "region is covered by reads.\n")

    # FIX: context managers so the files are closed even if a write fails.
    with open(args.outFileName, "wb") as f:
        np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels)

    if args.outRawCounts:
        # Prepend the label header to the raw-counts file that
        # getScorePerBin already wrote.
        # FIX: removed a large block of dead code that was kept alive as a
        # bare triple-quoted string literal.
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        with open(args.outRawCounts, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)

    # Clean up temporary bigWig files, if applicable
    if not args.deepBlueKeepTemp:
        for k, v in deepBlueFiles:
            os.remove(args.bwfiles[v])
    else:
        for k, v in deepBlueFiles:
            print("{} is stored in {}".format(k, args.bwfiles[v]))
def main(args=None):
    """
    1. get read counts at different positions either
       all of same length or from genomic regions from the BED file
    2. compute correlation

    Parameters
    ----------
    args : list of str, optional
        Command-line arguments; parsed by ``process_args``.
    """
    args = process_args(args)

    if len(args.bwfiles) < 2:
        # FIX: Python 2 print *statements* are a SyntaxError on Python 3;
        # converted to print() calls (output stays on stdout, as before).
        print("Please input at least two bigWig (.bw) files to compare")
        exit(1)

    # Regions may come from a BED file; otherwise genome-wide bins are used.
    if 'BED' in args:
        bed_regions = args.BED
    else:
        bed_regions = None

    # args.bwfiles entries appear to be open file objects (argparse
    # FileType) — close them and keep only their paths. TODO confirm.
    bwFiles = []
    for fname in args.bwfiles:
        f = fname.name
        fname.close()
        if f:
            bwFiles.append(f)
    if len(bwFiles) == 0:
        print("No valid bigwig files")
        exit(1)

    num_reads_per_bin = score_bw.getScorePerBin(
        bwFiles,
        args.binSize,
        numberOfProcessors=args.numberOfProcessors,
        stepSize=args.binSize + args.distanceBetweenBins,
        verbose=args.verbose,
        region=args.region,
        bedFile=bed_regions,
        chrsToSkip=args.chromosomesToSkip,
        out_file_for_raw_data=args.outRawCounts)

    sys.stderr.write("Number of bins "
                     "found: {}\n".format(num_reads_per_bin.shape[0]))

    if num_reads_per_bin.shape[0] < 2:
        exit("ERROR: too few non zero bins found.\n"
             "If using --region please check that this "
             "region is covered by reads.\n")

    np.savez_compressed(args.outFileName,
                        matrix=num_reads_per_bin,
                        labels=args.labels)

    if args.outRawCounts:
        # Prepend the label header to the raw-counts file that
        # getScorePerBin already wrote.  args.outRawCounts appears to be an
        # open file object here (it has a .name attribute) — TODO confirm.
        header = "#'chr'\t'start'\t'end'\t"
        header += "'" + "'\t'".join(args.labels) + "'\n"
        # FIX: removed leftover ipdb debug comment and a dangling,
        # unmatched triple-quote at the end of the function.
        with open(args.outRawCounts.name, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header + content)
def get_labels_and_correlation(
        bw_files,
        # chrs_to_skip,
        bin_size=10000,
        method='pearson',
        fileset_name='result',
        blacklist=None,
        labels=bw_labels,
        output_dir='/Users/baditya02/Downloads/treatment-data/graphs/test/'
        ):
    """Summarize bigWig files into bins, correlate them and plot a heatmap.

    NOTE(review): the default output_dir is a machine-specific absolute
    path — callers on other machines must pass their own directory.

    Returns a tuple of (heatmap image path, labels used, flattened list of
    correlation values).
    """
    if method not in ('pearson', 'spearman'):
        raise AssertionError('Invalid correlation method')

    # Fall back to labels derived from the file names.
    if not labels:
        labels = [path.rsplit('/', 1)[-1].partition('.')[0]
                  for path in bw_files]

    # Name for this unique file-set / method combination.
    test_name = '{}_{}'.format(fileset_name, method)
    blacklist_title = 'Blacklisted ' if blacklist else ''
    if blacklist:
        test_name += '_blacklisted'
    image_name = test_name + '.png'

    # Summarize the bigWig signal into fixed-size bins.
    binned_scores = score_bw.getScorePerBin(
        bw_files,
        bin_size,
        # chrsToSkip=chrs_to_skip,
        blackListFileName=blacklist
    )

    # Persist the binned matrix so Correlation can read it back.
    matrix_path = output_dir + test_name + '.npz'
    with open(matrix_path, "wb") as handle:
        np.savez_compressed(handle, matrix=binned_scores, labels=labels)

    corr = Correlation(matrix_path, method, labels=labels)
    corr_matrix = corr.compute_correlation()

    def _walk(node):
        # Depth-first, left-to-right traversal of nested lists.
        for element in node:
            if isinstance(element, list):
                for leaf in _walk(element):
                    yield leaf
            else:
                yield element

    my_listUnnested = list(_walk(corr_matrix.tolist()))

    plot_title = '{}{} Correlation of {}'.format(
        blacklist_title, method.capitalize(), fileset_name
    )

    # Render the correlation heatmap to a png.
    image_path = output_dir + image_name
    corr.plot_correlation(
        image_path,
        plot_title=plot_title
    )

    my_labels_list = labels
    return image_path, my_labels_list, my_listUnnested