예제 #1
0
def discretize_genome(wsize, stepsize, genome):
    """Tile every chromosome of *genome* into fixed-size windows.

    Windows of length ``wsize`` are placed every ``stepsize`` bases along
    each chromosome and collected into a BedFile. Each entry is named
    "<chrm>_<start>_<end>" and given a score of 0.
    """
    windows = bed.BedFile()
    for chrm in genome:
        this_chrm = chrm.chrm_name()
        # stop before a window would run off the end of the chromosome
        last_possible = len(chrm) - wsize
        for win_start in range(0, last_possible, stepsize):
            win_end = win_start + wsize
            entry_name = "%s_%i_%i" % (this_chrm, win_start, win_end)
            windows.add_entry(
                bed.BedEntry([this_chrm, win_start, win_end, entry_name, 0]))
    return windows
예제 #2
0
def query_main(args):
    """Entry point for the query subcommand.

    Reads regions from a bed file, pulls signal from one or more bigwig
    files, summarizes each region with the requested summary function,
    and hands everything to the requested overall summarize strategy.

    Args:
        args: parsed argparse namespace; uses .regions, .res, .infiles,
            .gzip, .samp_names, .summary_func, .upstream, .frac_na and
            .summarize.

    Raises:
        KeyError: if --summary_func or --summarize is not a known option.
    """
    import bed_utils
    # NOTE(fix): gzip is passed to overall_func below regardless of the
    # --gzip flag, so it must always be bound; the original imported it
    # only when args.gzip was set, raising NameError otherwise.
    import gzip

    inbed = bed_utils.BedFile()
    inbed.from_bed_file(args.regions)
    res = args.res
    all_bws, open_fhandles = read_multiple_bws(args.infiles, res=res)

    # map display names to file names; default to the file names themselves
    if args.samp_names:
        samp_to_fname = {
            samp_name: fname
            for fname, samp_name in zip(args.infiles, args.samp_names)
        }
        samp_names = args.samp_names
    else:
        samp_to_fname = {fname: fname for fname in args.infiles}
        samp_names = args.infiles

    # per-region summary statistics, selected by --summary_func
    summary_funcs = {
        'mean': np.nanmean,
        'median': np.nanmedian,
        'max': np.nanmax,
        'min': np.nanmin,
        'RPP': relative_polymerase_progression,
        'TR': lambda array: traveling_ratio(array, args.res, 50, 1000),
        'summit_loc':
        lambda array: summit_loc(array, args.res, 50, args.upstream)
    }
    try:
        summary_func = summary_funcs[args.summary_func]
    except KeyError:
        # NOTE(fix): the original constructed this KeyError without
        # raising it, silently falling through with summary_func unset.
        raise KeyError("%s is not a valid option for --summary_func" %
                       (args.summary_func)) from None

    # overall strategies, selected by --summarize
    overall_funcs = {
        'identity':
        query_summarize_identity,
        'single':
        lambda x, y, z, a, b, c: query_summarize_single(
            x, y, z, a, b, summary_func, args.frac_na, c)
    }
    try:
        overall_func = overall_funcs[args.summarize]
    except KeyError:
        # NOTE(fix): same missing-raise bug as above.
        raise KeyError("%s is not a valid option for --summarize" %
                       (args.summarize)) from None

    overall_func(all_bws, samp_names, samp_to_fname, inbed, res, gzip)

    # close every bigwig handle opened by read_multiple_bws
    for fhandle in open_fhandles:
        fhandle.close()
예제 #3
0
def fixed_scale(arrays, fixed_regions=None, res=1, summary_func=np.nanmean):
    """Normalize each chromosome array by a scale factor computed over a
    fixed set of regions.

    The scale value is ``summary_func`` applied to the finite signal
    gathered from ``fixed_regions``; every array in *arrays* is then
    normalized in place against that value and the dict is returned.
    """
    import bed_utils

    regions = bed_utils.BedFile()
    regions.from_bed_file(fixed_regions)

    # pool all signal values covered by the fixed regions
    pooled = []
    for reg in regions:
        lo = reg["start"] // res
        hi = reg["end"] // res
        pooled.extend(arrays[reg["chrm"]][lo:hi])
    pooled = np.array(pooled)
    # ignore NaN/inf when computing the scale factor
    scale_val = summary_func(pooled[np.isfinite(pooled)])

    for chrm_name in arrays.keys():
        arrays[chrm_name] = arraytools.normalize_1D(arrays[chrm_name], 0,
                                                    scale_val)
    return arrays
예제 #4
0
def convert_file_to_intervals(fname, function_factory):
    """Parse *fname* into an Intervals object.

    Supports .bed/.narrowPeak files (parsed as bed) and .gff files; the
    appropriate converter is looked up in *function_factory* by
    extension.

    Raises:
        ValueError: for any unsupported file extension.
    """
    intervals = Intervals()
    if fname.endswith((".bed", ".narrowPeak")):
        parsed = bed_utils.BedFile()
        parsed.from_bed_file(fname)
        intervals.from_other_ftype(parsed, function_factory[".bed"])
    elif fname.endswith(".gff"):
        parsed = gfftools.GffData()
        parsed.parse_gff_file(fname)
        intervals.from_other_ftype(parsed, function_factory[".gff"])
    else:
        raise ValueError("%s filetype not supported yet" % fname)
    return intervals
예제 #5
0
def get_values_per_region(arrays, query_regions, res=1):
    """Average the signal over each region in a bed file.

    Returns a tuple ``(names, averages)`` of numpy arrays, keeping only
    the regions whose average is finite.
    """
    import bed_utils

    regions = bed_utils.BedFile()
    regions.from_bed_file(query_regions)

    names = []
    averages = []
    for reg in regions:
        vals = arrays[reg["chrm"]][reg["start"] // res:reg["end"] // res]
        # drop NaN/inf before averaging so one bad bin doesn't poison it
        averages.append(np.nanmean(vals[np.isfinite(vals)]))
        names.append(reg["name"])

    averages = np.array(averages)
    names = np.array(names)
    keep = np.isfinite(averages)
    return (names[keep], averages[keep])
예제 #6
0
def dense_sampling_main(args):
    """Densely tile the genome into windows and label each window as
    positive or negative by overlap with a bed file of features.

    Writes, depending on flags, a "_true.bed" and "_rand.bed" of the
    labeled windows, a fasta of all window sequences, and a FIRE score
    file.

    Args:
        args: parsed argparse namespace (.seed, .fasta, .bedfile,
            .wsize, .stepsize, .perc_overlap, .default_score,
            .rand_score, .true_bed, .rand_bed, .no_fasta, .outpre).
    """
    # NOTE(fix): the original called parser.parse_args() here, silently
    # discarding the namespace the caller passed in (and depending on a
    # global `parser`); use the provided args directly.

    # figure out random seed
    np.random.seed(args.seed)

    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # discretize the genome by size of window and step of window
    outbed = discretize_genome(args.wsize, args.stepsize, genome)

    # convert input bed to an interval collection per chromosome
    intervals = {chrm.chrm_name(): it.Intervals() for chrm in genome}
    for feature in inbed:
        intervals[feature["chrm"]].add_interval(
            it.Interval(feature["start"], feature["end"]))

    # split windows into positives/negatives by fractional overlap
    logging.warning("determining which intervals overlap")
    positive_bed = bed.BedFile()
    negative_bed = bed.BedFile()
    for i, window in enumerate(outbed):
        if i % 10000 == 0:
            logging.warning("Checking interval %s" % i)
        window_interval = it.Interval(window["start"], window["end"])
        perc_overlap = intervals[window["chrm"]].check_percent_overlap(
            window_interval)
        if perc_overlap >= args.perc_overlap:
            positive_bed.add_entry(window)
        else:
            negative_bed.add_entry(window)

    # accumulate FIRE scores and window sequences
    fire = FIREfile()
    out_fasta = fa.FastaFile()

    def _collect(features, score):
        # Record each feature's score in the FIRE file and its sequence
        # in the output fasta (shared by the positive/negative sets).
        for feature in features:
            this_name = feature["name"]
            fire.add_entry(this_name, score)
            out_fasta.add_entry(
                fa.FastaEntry(
                    ">" + this_name,
                    genome.pull_entry(feature["chrm"]).pull_seq(
                        feature["start"], feature["end"])))

    _collect(positive_bed, args.default_score)
    _collect(negative_bed, args.rand_score)

    # write files
    if args.true_bed:
        positive_bed.write_bed_file(args.outpre + "_true.bed")
    if args.rand_bed:
        negative_bed.write_bed_file(args.outpre + "_rand.bed")

    if not args.no_fasta:
        # NOTE(fix): the original opened this file in the default read
        # mode, so out_fasta.write(outf) would fail; open for writing.
        with open(args.outpre + ".fa", mode="w") as outf:
            out_fasta.write(outf)

    fire.write(args.outpre + "_fire.txt")
예제 #7
0
    # parse arguments
    args = parser.parse_args()
    # dense mode is handled entirely by dense_sampling_main; exit after
    if args.dense:
        dense_sampling_main(args)
        sys.exit()
    # figure out random seed
    np.random.seed(args.seed)
    # read in genome
    genome = fa.FastaFile()
    logging.warning("reading in full genome")
    with open(args.fasta) as inf:
        genome.read_whole_file(inf)

    # read in bed file
    inbed = bed.BedFile()
    logging.warning("reading in bed")
    inbed.from_bed_file(args.bedfile)

    # check how much of the genome the regions cover
    # NOTE(review): genome_length is keyed by the objects yielded when
    # iterating genome, not by chromosome-name strings — confirm that is
    # what downstream consumers expect.
    genome_length = {}
    total_length = 0
    for chrm in genome:
        genome_length[chrm] = len(chrm)
        total_length += len(chrm)
    # total up the span of every feature, after determine_start_end has
    # adjusted each feature's coordinates (exact adjustment is defined
    # elsewhere in the file)
    feature_length = 0
    for feature in inbed:
        this_chrm = genome.pull_entry(feature["chrm"])
        this_start, this_end, this_rc = determine_start_end(
            feature, this_chrm, args)
        feature_length += this_end - this_start
예제 #8
0
    # read each fasta file's contents into its paired FastaFile object
    for fafile, fafilename in zip(all_fastas, fasta_files):
        with open(fafilename, mode="r") as inf:
            fafile.read_whole_file(inf)

    # merge every entry from all input fastas into one combined genome
    final_fasta = fa.FastaFile()
    for fafile in all_fastas:
        for entry in fafile:
            final_fasta.add_entry(entry)
    # record the length of each contig by name
    lengths = {}
    for entry in final_fasta:
        chrm_name = entry.chrm_name()
        lengths[chrm_name] = len(entry)

    # Replace masked regions with Ns
    if args.masked_regions:
        masked_regions = bed.BedFile()
        masked_regions.from_bed_file(args.masked_regions)
        for entry in masked_regions:
            total_region_length = entry["end"] - entry["start"]
            this_chrm = final_fasta.pull_entry(entry["chrm"])
            # overwrite the region in place with a run of Ns
            this_chrm.mutate(entry["start"], entry["end"],
                             "N" * total_region_length)

    # Figure out total size of each chromosome
    with open(genome_name + "_contig_sizes.tsv", mode="w") as outf:
        for chrm, length in lengths.items():
            outf.write("%s\t%s\n" % (chrm, length))

    # figure out total mappable size of the genome
    # (this computation continues past the end of this chunk)
    with open(genome_name + "_mappable_size.txt", mode="w") as outf:
        N_size = 0