def discretize_genome(wsize, stepsize, genome):
    """Tile every chromosome of *genome* into fixed-width windows.

    Windows of width ``wsize`` are laid down every ``stepsize`` bases,
    starting at position 0, and collected into a BedFile.  Each entry is
    named ``<chrm>_<start>_<end>`` and carries a score of 0.

    Args:
        wsize: window width in bases.
        stepsize: distance between successive window starts.
        genome: iterable of chromosome entries supporting ``chrm_name()``
            and ``len()``.

    Returns:
        bed.BedFile holding one entry per window.
    """
    windows = bed.BedFile()
    for chrm in genome:
        chrm_name = chrm.chrm_name()
        chrm_len = len(chrm)
        start = 0
        # same bound as range(0, chrm_len - wsize, stepsize): the last
        # window start is strictly less than chrm_len - wsize
        while start < chrm_len - wsize:
            end = start + wsize
            entry_name = "%s_%i_%i" % (chrm_name, start, end)
            windows.add_entry(
                bed.BedEntry([chrm_name, start, end, entry_name, 0]))
            start += stepsize
    return windows
def query_main(args):
    """Summarize signal from one or more bigwig-derived arrays over regions.

    Reads the query regions from ``args.regions``, loads every input file in
    ``args.infiles``, applies the per-region summary chosen by
    ``args.summary_func``, and dispatches to the overall summarizer chosen by
    ``args.summarize``.

    Raises:
        KeyError: if ``args.summary_func`` or ``args.summarize`` is not a
            recognized option.
    """
    import bed_utils
    # BUG FIX: gzip was previously imported only when args.gzip was set, but
    # the bare name `gzip` is always passed to the overall summarizer below,
    # which raised NameError whenever the flag was off.  Import it always;
    # NOTE(review): downstream presumably decides on its own whether to use
    # it — confirm against query_summarize_* implementations.
    import gzip

    inbed = bed_utils.BedFile()
    inbed.from_bed_file(args.regions)
    res = args.res
    all_bws, open_fhandles = read_multiple_bws(args.infiles, res=res)

    # map each sample name to its source file; fall back to the file names
    # themselves when no explicit sample names were given
    if args.samp_names:
        samp_to_fname = {
            samp_name: fname
            for fname, samp_name in zip(args.infiles, args.samp_names)
        }
        samp_names = args.samp_names
    else:
        samp_to_fname = {fname: fname for fname in args.infiles}
        samp_names = args.infiles

    summary_funcs = {
        'mean': np.nanmean,
        'median': np.nanmedian,
        'max': np.nanmax,
        'min': np.nanmin,
        'RPP': relative_polymerase_progression,
        'TR': lambda array: traveling_ratio(array, args.res, 50, 1000),
        'summit_loc': lambda array: summit_loc(array, args.res, 50,
                                               args.upstream)
    }
    try:
        summary_func = summary_funcs[args.summary_func]
    except KeyError:
        # BUG FIX: the original constructed the KeyError but never raised it,
        # so control fell through to a NameError on summary_func later
        raise KeyError("%s is not a valid option for --summary_func"
                       % (args.summary_func)) from None

    overall_funcs = {
        'identity': query_summarize_identity,
        'single': lambda x, y, z, a, b, c: query_summarize_single(
            x, y, z, a, b, summary_func, args.frac_na, c)
    }
    try:
        overall_func = overall_funcs[args.summarize]
    except KeyError:
        # BUG FIX: same unraised-exception defect as above
        raise KeyError("%s is not a valid option for --summarize"
                       % (args.summarize)) from None

    # close every opened handle even if summarization fails
    try:
        overall_func(all_bws, samp_names, samp_to_fname, inbed, res, gzip)
    finally:
        for fhandle in open_fhandles:
            fhandle.close()
def fixed_scale(arrays, fixed_regions=None, res=1, summary_func=np.nanmean):
    """Normalize every per-chromosome array by one fixed scale value.

    The scale value is ``summary_func`` applied to the finite signal pulled
    from the regions listed in the bed file *fixed_regions*; every array in
    *arrays* is then normalized in place between 0 and that value.

    Args:
        arrays: dict mapping chromosome name -> 1D numpy array of signal.
        fixed_regions: path to a bed file of regions used to derive the
            scale value.
        res: resolution of the arrays (bases per array element).
        summary_func: reduction applied to the pooled finite values.

    Returns:
        The same *arrays* dict, with each array normalized.
    """
    import bed_utils

    regions = bed_utils.BedFile()
    regions.from_bed_file(fixed_regions)

    # pool signal values from every fixed region
    pooled = []
    for reg in regions:
        lo = reg["start"] // res
        hi = reg["end"] // res
        pooled.extend(arrays[reg["chrm"]][lo:hi])
    pooled = np.array(pooled)

    # drop NaN/inf before summarizing
    scale_val = summary_func(pooled[np.isfinite(pooled)])
    for key in arrays:
        arrays[key] = arraytools.normalize_1D(arrays[key], 0, scale_val)
    return arrays
def convert_file_to_intervals(fname, function_factory):
    """Parse a feature file and convert its entries to an Intervals object.

    Supported extensions are ``.bed`` / ``.narrowPeak`` (parsed as bed) and
    ``.gff``.  The conversion callable is looked up in *function_factory*
    by canonical extension (".bed" or ".gff").

    Args:
        fname: path to the input feature file.
        function_factory: mapping from extension to conversion callable.

    Returns:
        A populated Intervals object.

    Raises:
        ValueError: if the file extension is not supported.
    """
    result = Intervals()
    if fname.endswith((".bed", ".narrowPeak")):
        parsed = bed_utils.BedFile()
        parsed.from_bed_file(fname)
        result.from_other_ftype(parsed, function_factory[".bed"])
    elif fname.endswith(".gff"):
        parsed = gfftools.GffData()
        parsed.parse_gff_file(fname)
        result.from_other_ftype(parsed, function_factory[".gff"])
    else:
        raise ValueError("%s filetype not supported yet" % fname)
    return result
def get_values_per_region(arrays, query_regions, res=1):
    """Compute the mean signal for each region in a bed file.

    For every region, the signal slice is taken from the matching
    chromosome array, non-finite values are dropped, and the remainder is
    averaged.  Regions whose average is itself non-finite (e.g. no finite
    data at all) are filtered out of the result.

    Args:
        arrays: dict mapping chromosome name -> 1D numpy array of signal.
        query_regions: path to a bed file of regions to summarize.
        res: resolution of the arrays (bases per array element).

    Returns:
        Tuple ``(names, averages)`` of aligned numpy arrays, restricted to
        regions with a finite average.
    """
    import bed_utils

    regions = bed_utils.BedFile()
    regions.from_bed_file(query_regions)

    names = []
    averages = []
    for reg in regions:
        vals = arrays[reg["chrm"]][reg["start"] // res:reg["end"] // res]
        averages.append(np.nanmean(vals[np.isfinite(vals)]))
        names.append(reg["name"])

    averages = np.array(averages)
    names = np.array(names)
    keep = np.isfinite(averages)
    return (names[keep], averages[keep])
def dense_sampling_main(args):
    """Densely tile the genome and label windows by overlap with features.

    Tiles every chromosome into windows of ``args.wsize`` every
    ``args.stepsize`` bases, labels each window positive when it overlaps
    the input bed features by at least ``args.perc_overlap``, and writes a
    FIRE score file, an output fasta of window sequences, and (optionally)
    the positive/negative bed files.

    Bug fixes vs. original:
    - no longer calls ``parser.parse_args()`` internally, which silently
      discarded the *args* the caller already parsed and passed in;
    - the output fasta is opened for writing; it was opened read-only, so
      ``out_fasta.write(outf)`` would fail.
    """
    # seed the RNG for reproducibility
    np.random.seed(args.seed)

    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file of features
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # discretize the genome by size of window and step of window
    outbed = discretize_genome(args.wsize, args.stepsize, genome)

    # convert input bed to an interval collection per chromosome
    intervals = {chrm.chrm_name(): it.Intervals() for chrm in genome}
    for feature in inbed:
        intervals[feature["chrm"]].add_interval(
            it.Interval(feature["start"], feature["end"]))

    # figure out which windows overlap the features enough to be positive
    logging.warning("determining which intervals overlap")
    positive_bed = bed.BedFile()
    negative_bed = bed.BedFile()
    for i, window in enumerate(outbed):
        if i % 10000 == 0:
            logging.warning("Checking interval %s" % i)
        window_interval = it.Interval(window["start"], window["end"])
        perc_overlap = intervals[window["chrm"]].check_percent_overlap(
            window_interval)
        if perc_overlap >= args.perc_overlap:
            positive_bed.add_entry(window)
        else:
            negative_bed.add_entry(window)

    # make fire file and matching fasta; positives get the default score,
    # negatives the rand score
    fire = FIREfile()
    out_fasta = fa.FastaFile()
    for this_bed, score in ((positive_bed, args.default_score),
                            (negative_bed, args.rand_score)):
        for feature in this_bed:
            this_name = feature["name"]
            fire.add_entry(this_name, score)
            out_fasta.add_entry(
                fa.FastaEntry(
                    ">" + this_name,
                    genome.pull_entry(feature["chrm"]).pull_seq(
                        feature["start"], feature["end"])))

    # write files
    if args.true_bed:
        positive_bed.write_bed_file(args.outpre + "_true.bed")
    if args.rand_bed:
        negative_bed.write_bed_file(args.outpre + "_rand.bed")
    if not args.no_fasta:
        # BUG FIX: must open for writing; the original used the default
        # read mode, so out_fasta.write(outf) could not succeed
        with open(args.outpre + ".fa", mode="w") as outf:
            out_fasta.write(outf)
    fire.write(args.outpre + "_fire.txt")
# parse arguments args = parser.parse_args() if args.dense: dense_sampling_main(args) sys.exit() # figure out random seed np.random.seed(args.seed) # read in genome genome = fa.FastaFile() logging.warning("reading in full genome") with open(args.fasta) as inf: genome.read_whole_file(inf) # read in bed file inbed = bed.BedFile() logging.warning("reading in bed") inbed.from_bed_file(args.bedfile) # check how much of the genome the regions cover genome_length = {} total_length = 0 for chrm in genome: genome_length[chrm] = len(chrm) total_length += len(chrm) feature_length = 0 for feature in inbed: this_chrm = genome.pull_entry(feature["chrm"]) this_start, this_end, this_rc = determine_start_end( feature, this_chrm, args) feature_length += this_end - this_start
# ---- combine fasta files into one genome and record contig sizes -------
# NOTE(review): all_fastas, fasta_files, args, and genome_name are defined
# outside the visible chunk; this block is a fragment of a larger script.
for fafile, fafilename in zip(all_fastas, fasta_files):
    with open(fafilename, mode="r") as inf:
        fafile.read_whole_file(inf)
# merge every entry from every input fasta into a single FastaFile
final_fasta = fa.FastaFile()
for fafile in all_fastas:
    for entry in fafile:
        final_fasta.add_entry(entry)
# record the length of each contig by name
lengths = {}
for entry in final_fasta:
    chrm_name = entry.chrm_name()
    lengths[chrm_name] = len(entry)
# Replace masked regions with Ns
if args.masked_regions:
    masked_regions = bed.BedFile()
    masked_regions.from_bed_file(args.masked_regions)
    for entry in masked_regions:
        # in-place substitution keeps contig length unchanged, so the
        # lengths recorded above remain valid after masking
        total_region_length = entry["end"] - entry["start"]
        this_chrm = final_fasta.pull_entry(entry["chrm"])
        this_chrm.mutate(entry["start"], entry["end"],
                         "N" * total_region_length)
# Figure out total size of each chromosome
with open(genome_name + "_contig_sizes.tsv", mode="w") as outf:
    for chrm, length in lengths.items():
        outf.write("%s\t%s\n" % (chrm, length))
# figure out total mappable size of the genome
with open(genome_name + "_mappable_size.txt", mode="w") as outf:
    # accumulator for masked (N) bases; the rest of this computation lies
    # past the visible chunk
    N_size = 0