def processCellType(args):
    os.makedirs(args.outDir, exist_ok=True)
    write_params(args, os.path.join(args.outDir, "params.txt"))

    # Make candidate regions, extending from peak summits unless --ignoreSummits is set
    if not args.ignoreSummits:
        make_candidate_regions_from_summits(
            macs_peaks=args.narrowPeak,
            accessibility_file=args.bam,
            genome_sizes=args.chrom_sizes,
            regions_includelist=args.regions_includelist,
            regions_blocklist=args.regions_blocklist,
            n_enhancers=args.nStrongestPeaks,
            peak_extend=args.peakExtendFromSummit,
            outdir=args.outDir)
    else:
        make_candidate_regions_from_peaks(
            macs_peaks=args.narrowPeak,
            accessibility_file=args.bam,
            genome_sizes=args.chrom_sizes,
            regions_includelist=args.regions_includelist,
            regions_blocklist=args.regions_blocklist,
            n_enhancers=args.nStrongestPeaks,
            peak_extend=args.peakExtendFromSummit,
            minPeakWidth=args.minPeakWidth,
            outdir=args.outDir)
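
A minimal driver sketch for the example above, assuming an argparse-based CLI whose flag names mirror the attributes accessed on args (the real parser is not shown in this listing; the numeric defaults are borrowed from the makeCandidateRegions signature in Example #2):

# Hypothetical argparse wiring for processCellType; flag names and defaults
# are assumptions, not confirmed against the source CLI.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Make candidate regions for one cell type")
    parser.add_argument("--narrowPeak", required=True, help="MACS2 narrowPeak file")
    parser.add_argument("--bam", required=True, help="DNase-seq or ATAC-seq BAM file")
    parser.add_argument("--chrom_sizes", required=True, help="Chromosome sizes file")
    parser.add_argument("--outDir", required=True, help="Output directory")
    parser.add_argument("--regions_includelist", default=None, help="BED of regions to force-include")
    parser.add_argument("--regions_blocklist", default=None, help="BED of regions to exclude")
    parser.add_argument("--nStrongestPeaks", type=int, default=175000)
    parser.add_argument("--peakExtendFromSummit", type=int, default=250)
    parser.add_argument("--minPeakWidth", type=int, default=500)
    parser.add_argument("--ignoreSummits", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    processCellType(parse_args())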
Example #2
def makeCandidateRegions(
    narrowPeak: str,
    input_bam: str,
    output_dir: str,
    chrom_sizes: str,
    tmpdir: str,
    regions_blacklist: str = None,
    regions_whitelist: str = None,
    peakExtendFromSummit: int = 250,
    nStrongestPeaks: int = 175000,
    ignoreSummits: bool = False,
    minPeakWidth: int = 500,
):
    """
    Inputs:
    narrowPeak: narrowPeak file output by macs2. Must include summits (--call-summits)
    input_bam: DNAase-Seq or atac-Seq input_bam file
    chrom_sizes: File listing chromosome size annotations
    output_dir: output folder where results will be stored; this is created if it doesn't exist
    nStrongestPeaks: Number of peaks to use for defining candidate regions
    peakExtendFromSummit: Number of base pairs to extend each preak from its summit (or from both ends of region if using --ignoreSummits)
    ignoreSummits: Compute peaks using the full peak regions, rather than extending from summit.
    minPeakWidth: Candidate regions whose width is below this threshold are expanded to this width. Only used with --ignoreSummits
    regions_whitelist: Bed file of regions to forcibly include in candidate enhancers. Overrides regions_blacklist
    regions_blacklist: Bed file of regions to forcibly exclude from candidate enhancers
    """
    # Create the output and temporary directories.
    # If the output directory is on S3, create a local directory from its basename to serve as the workdir.
    makedirs(output_dir)
    makedirs(tmpdir)

    write_params(
        {
            "narrowPeak": narrowPeak,
            "input_bam": input_bam,
            "output_dir": output_dir,
            "tmpdir": tmpdir,
            "chrom_sizes": chrom_sizes,
            "regions_blacklist": regions_blacklist,
            "regions_whitelist": regions_whitelist,
            "peakExtendFromSummit": peakExtendFromSummit,
            "nStrongestPeaks": nStrongestPeaks,
            "ignoreSummits": ignoreSummits,
            "minPeakWidth": minPeakWidth,
        },
        output_dir,
        tmpdir,
        "parameters.txt",
    )
    # 1. Count DNase/ATAC reads in the MACS2 peak regions
    raw_counts_outfile = join(
        output_dir,
        basename(narrowPeak) + "." + basename(input_bam) + ".Counts.bed")

    run_count_reads_out = run_count_reads(
        target=input_bam,
        output=raw_counts_outfile,
        output_dir=output_dir,
        tmpdir=tmpdir,
        bed_file=narrowPeak,
        chrom_sizes=chrom_sizes,
        use_fast_count=True,
    )

    # 2. Make candidate regions, extending from summits unless ignoreSummits is set
    if not ignoreSummits:
        return make_candidate_regions_from_summits(
            count_file=run_count_reads_out["path"],
            macs_peaks=narrowPeak,
            chrom_sizes=chrom_sizes,
            regions_whitelist=regions_whitelist,
            regions_blacklist=regions_blacklist,
            n_enhancers=nStrongestPeaks,
            peak_extend=peakExtendFromSummit,
            output_dir=output_dir,
            tmpdir=tmpdir,
        )
    else:
        return make_candidate_regions_from_peaks(
            count_file=run_count_reads_out["path"],
            macs_peaks=narrowPeak,
            chrom_sizes=chrom_sizes,
            regions_whitelist=regions_whitelist,
            regions_blacklist=regions_blacklist,
            n_enhancers=nStrongestPeaks,
            peak_extend=peakExtendFromSummit,
            minPeakWidth=minPeakWidth,
            output_dir=output_dir,
            tmpdir=tmpdir,
        )
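
A hedged usage sketch for makeCandidateRegions; the file paths below are illustrative placeholders, and the keyword values simply restate the defaults from the signature above:

# Example invocation (paths are hypothetical placeholders)
result = makeCandidateRegions(
    narrowPeak="sample.macs2_peaks.narrowPeak",
    input_bam="sample.atac.bam",
    output_dir="results/candidate_regions",
    chrom_sizes="hg38.chrom.sizes",
    tmpdir="tmp/candidate_regions",
    peakExtendFromSummit=250,
    nStrongestPeaks=175000,
)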
Example #3
def main():
    args = parseargs()
    os.makedirs(args.outDir, exist_ok=True)

    # Write params file
    write_params(args, os.path.join(args.outDir, "params.txt"))

    # Parse cell types
    cell_types = args.celltypes.split(",")

    # Sentinel used to preserve NaNs (missing data) through the outer join below
    special_value = np.inf

    hic_list = [
        process_chr(
            cell_type,
            args.chromosome,
            args.basedir,
            args.resolution,
            args.ref_scale,
            args.ref_gamma,
            special_value,
        )
        for cell_type in cell_types
    ]
    hic_list = [x for x in hic_list if x is not None]
    hic_list = [df.set_index(["bin1", "bin2"]) for df in hic_list]

    # Make the average: merge all Hi-C matrices with an outer join.
    # NaN vs 0 must be distinguished here: in the KR-normalized matrices, NaN
    # means missing data and should stay missing, while (bin1, bin2) pairs absent
    # from a cell type's dataframe should count as 0. After an outer join, both
    # show up as NaN in the merged dataframe.
    # Hack: convert NaNs in the cell-type-specific dataframes to a sentinel value,
    # then restore them to NaN after merging. (A standalone sketch of this trick
    # follows this example.)
    # TODO: This is very memory intensive!

    all_hic = pd.concat(hic_list, axis=1, join="outer", copy=False)
    hic_list = None  # Clear from memory

    all_hic.fillna(value=0, inplace=True)  # rows absent from a cell type -> 0
    all_hic.replace(to_replace=special_value, value=np.nan, inplace=True)  # restore true NaNs

    # Compute the average over the KR-normalized contact columns
    cols_for_avg = [c for c in all_hic.columns if "hic_kr" in c]
    avg_hic = all_hic[cols_for_avg].mean(axis=1)
    # Number of cell types with a non-missing value in each row
    num_good = len(cols_for_avg) - all_hic[cols_for_avg].isna().sum(axis=1)

    # Drop the per-cell-type columns and attach the average
    all_hic.drop(cols_for_avg, inplace=True, axis=1)
    all_hic.reset_index(level=all_hic.index.names, inplace=True)
    all_hic["avg_hic"] = avg_hic.values
    # Mask entries supported by fewer cell types than required
    all_hic.loc[num_good.values < args.min_cell_types_required, "avg_hic"] = np.nan

    # Set up the final matrix: convert bin indices back to genomic coordinates
    all_hic["bin1"] = all_hic["bin1"] * args.resolution
    all_hic["bin2"] = all_hic["bin2"] * args.resolution
    # Keep positive or missing entries; drop exact zeros (unclear why they occur)
    all_hic = all_hic.loc[(all_hic["avg_hic"] > 0) | all_hic["avg_hic"].isna()]

    os.makedirs(os.path.join(args.outDir, args.chromosome), exist_ok=True)
    all_hic.to_csv(
        os.path.join(args.outDir, args.chromosome, args.chromosome + ".avg.gz"),
        sep="\t",
        header=False,
        index=False,
        compression="gzip",
        na_rep="nan",  # na_rep must be a string; np.nan previously wrote the literal "nan"
    )
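
A standalone sketch of the NaN-vs-0 sentinel trick referenced in the comments of Example #3, on toy dataframes (column names are illustrative; the real code presumably applies the sentinel inside process_chr, which receives special_value as an argument):

# Minimal demonstration of the sentinel trick: NaN in the inputs means
# "measured but missing" and must survive the outer join, while (bin1, bin2)
# pairs absent from a cell type should become 0 after the join.
import numpy as np
import pandas as pd

special_value = np.inf  # sentinel standing in for "measured but missing"

a = pd.DataFrame(
    {"bin1": [0, 1], "bin2": [1, 2], "hic_kr_A": [0.5, np.nan]}
).set_index(["bin1", "bin2"])
b = pd.DataFrame(
    {"bin1": [1, 2], "bin2": [2, 3], "hic_kr_B": [0.7, 0.9]}
).set_index(["bin1", "bin2"])

# Protect genuine NaNs with the sentinel before merging
merged = pd.concat([df.fillna(special_value) for df in (a, b)], axis=1, join="outer")

merged.fillna(value=0, inplace=True)                 # absent rows -> 0
merged.replace(special_value, np.nan, inplace=True)  # restore true missing values
print(merged)
# hic_kr_A at (1, 2) ends up NaN (measured but missing),
# while hic_kr_A at (2, 3) ends up 0 (absent from that cell type).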