Example #1
def count_bam(
    bamfile: str,
    bed_file: str,
    output: str,
    tmpdir: str,
    output_dir: str,
    chrom_sizes: str,
    use_fast_count: bool = True,
    verbose: bool = True,
):
    local_bamfile = join(tmpdir, basename(bamfile))
    local_bed_file = join(tmpdir, basename(bed_file))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])
    local_output = join(tmpdir, basename(output))
    if use_fast_count:
        temp_output = local_output + ".temp_sort_order"
        return script(
            f"""
        #!/bin/bash
        awk 'FNR==NR {{x2[$1] = $0; next}} $1 in x2 {{print x2[$1]}}' {local_chrom_sizes} <(samtools view -H {local_bamfile} | grep SQ | cut -f 2 | cut -c 4- )  > {temp_output};
        bedtools sort -faidx {temp_output} -i {local_bed_file} | bedtools coverage -g {temp_output} -counts  -a stdin -b {local_bamfile} | awk '{{print $1"\t"$2"\t"$3"\t"$NF}}'  | bedtools sort -faidx {local_chrom_sizes} -i stdin > {local_output}; rm {temp_output}
        """,
            inputs=[
                File(bamfile).stage(File(local_bamfile)),
                File(chrom_sizes).stage(File(local_chrom_sizes)),
                File(bed_file).stage(File(local_bed_file)),
            ],
            outputs={
                "file":
                File(join(output_dir,
                          basename(output))).stage(File(local_output)),
                "path":
                join(output_dir, basename(output)),
            },
        )
    else:
        return script(
            f"""
        #!/bin/bash
        bedtools bamtobed -i {local_bamfile} | cut -f 1-3 | bedtools intersect -wa -a stdin -b {local_chrom_sizes_bed} | bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools coverage -g {local_chrom_sizes} -counts -sorted -a {local_bed_file} -b stdin | awk '{{print $1"\t"$2"\t"$3"\t"$NF}}' > {local_output}
        """,
            inputs=[
                File(bamfile).stage(File(local_bamfile)),
                File(chrom_sizes).stage(File(local_chrom_sizes)),
                File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)),
                File(bed_file).stage(File(local_bed_file)),
            ],
            outputs={
                "file":
                File(join(output_dir,
                          basename(output))).stage(File(local_output)),
                "path":
                join(output_dir, basename(output)),
            },
        )
Example #2
def count_tagalign_total(tagalign, tmpdir):
    local_tagalign = join(tmpdir, basename(tagalign))
    return script(
        f"""
    zcat {local_tagalign} | grep -E 'chr[1-9]|chr1[0-9]|chr2[0-2]|chrX|chrY' | wc -l
    """,
        inputs=[File(tagalign).stage(local_tagalign)],
    )
Example #3
def count_bam_mapped(bam_file, tmpdir):
    # Counts number of reads in a BAM file WITHOUT iterating.  Requires that the BAM is indexed
    if bam_file.startswith("s3"):
        local_bam_file = join(tmpdir, basename(bam_file))
        download_file(bam_file, tmpdir, overwrite_ok=True)
    else:
        local_bam_file = bam_file
    return script(f"""
    samtools index {local_bam_file}
    samtools idxstats {local_bam_file} | grep -v '*' |cut -f3 | paste -sd+ | bc 
    """)
Example #4
def download_raw(
    juicebox: str,
    hic_file: str,
    chromosome: str,
    output_dir: str,
    resolution: int,
    tmpdir: str,
):
    return script(
        f"""
    #!/bin/bash 
    {juicebox} dump observed NONE {hic_file} {chromosome} {chromosome} BP {resolution} {tmpdir}/chr{chromosome}.RAWobserved
    gzip -f {tmpdir}/chr{chromosome}.RAWobserved
    """,
        outputs=[Dir(output_dir).stage(Dir(tmpdir))],
    )
Example #5
def download_observed_matrix(
    juicebox: str,
    hic_file: str,
    chromosome: str,
    output_dir: str,
    resolution: int,
    tmpdir: str,
):
    return script(
        f"""
    #!/bin/bash 
    {juicebox} dump observed KR {hic_file} {chromosome} {chromosome} BP {resolution} {tmpdir}/chr{chromosome}.KRobserved
    gzip -f {tmpdir}/chr{chromosome}.KRobserved
    {juicebox} dump norm KR {hic_file} {chromosome} BP {resolution} {tmpdir}/chr{chromosome}.KRnorm
    gzip -f {tmpdir}/chr{chromosome}.KRnorm
    """,
        outputs=[Dir(output_dir).stage(Dir(tmpdir))],
    )
Example #6
def make_tss_region_file(genes, output_dir, tmpdir, sizes, tss_slop=500):
    # Given a gene file, define 1kb regions around the tss of each gene
    sizes_pr = df_to_pyranges(
        pd.read_csv(sizes + ".bed", sep="\t", header=None).rename(columns={
            0: "chr",
            1: "start",
            2: "end"
        }))
    tss1kb = genes.loc[:, ["chr", "start", "end", "name", "score", "strand"]]
    tss1kb["start"] = genes["tss"]
    tss1kb["end"] = genes["tss"]
    tss1kb = df_to_pyranges(tss1kb).slack(tss_slop)
    tss1kb = pr.gf.genome_bounds(tss1kb, sizes_pr).df[[
        "Chromosome", "Start", "End", "name", "score", "strand"
    ]]
    tss1kb.columns = ["chr", "start", "end", "name", "score", "strand"]
    tss1kb = tss1kb.sort_values(["chr", "start", "end"])
    tss1kb_file = os.path.join(tmpdir, "GeneList.TSS1kb.bed")
    tss1kb.to_csv(tss1kb_file, header=False, index=False, sep="\t")

    local_chrom_sizes = join(tmpdir, basename(sizes))
    tss1kb_out_file = join(output_dir, "GeneList.TSS1kb.bed")

    return script(
        f"""
        bedtools sort -faidx {local_chrom_sizes} -i {tss1kb_file} > {tss1kb_file}.sorted; mv {tss1kb_file}.sorted {tss1kb_file}
        """,
        inputs=[
            File(join(output_dir,
                      basename(sizes))).stage(File(local_chrom_sizes))
        ],
        outputs={
            "file": File(tss1kb_out_file).stage(tss1kb_file),
            "path": tss1kb_out_file,
            "df": tss1kb,
        },
    )
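For reference, the genes table this helper expects carries the six columns selected above plus a tss column; a minimal, made-up example of that input (gene names and coordinates are illustrative, not real annotations):

import pandas as pd

genes = pd.DataFrame({
    "chr":    ["chr1", "chr1"],
    "start":  [11000, 29000],
    "end":    [14000, 31000],
    "name":   ["geneA", "geneB"],   # hypothetical gene names
    "score":  [0, 0],
    "strand": ["+", "-"],
    "tss":    [11000, 31000],       # position the +/- tss_slop window is centered on
})
# make_tss_region_file(genes, output_dir, tmpdir, sizes, tss_slop=500)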
Example #7
def count_tagalign(tagalign: str, bed_file: str, output: str, chrom_sizes: str,
                   tmpdir: str):
    return script(f"""
    #!/bin/bash
    tabix -B {tagalign} {bed_file} | cut -f1-3 | bedtools coverage -counts -b stdin -a {bed_file} | awk '{{print $1"\t"$2"\t"$3"\t"$NF}}' > {output}""")
Example #8
def make_candidate_regions_from_peaks(
    count_file: str,
    macs_peaks: str,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    n_enhancers: int = 175000,
    regions_whitelist: str = None,
    regions_blacklist: str = None,
    peak_extend: int = 250,
    minPeakWidth: int = 500,
):
    makedirs(output_dir)
    makedirs(tmpdir)
    outfile = join(output_dir, basename(macs_peaks) + ".candidateRegions.bed")
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])

    # get tmpdir local files
    local_macs_peaks = join(tmpdir, basename(macs_peaks))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    local_regions_whitelist = join(tmpdir, basename(regions_whitelist))
    local_regions_blacklist = join(tmpdir, basename(regions_blacklist))
    local_outfile = join(tmpdir, basename(outfile))
    local_count_file = join(tmpdir, basename(count_file))

    ## Generate enhancer regions from MACS narrowPeak - do not use summits
    if regions_whitelist:
        whitelist_command = ("(bedtools instersect -a " +
                             local_regions_whitelist + " -b " +
                             local_chrom_sizes_bed +
                             " -wa | cut -f1-3 && cat ) | ")
    else:
        whitelist_command = ""

    if regions_blacklist:
        blacklist_command = ("bedtools intersect -v -wa -a stdin -b " +
                             local_regions_blacklist + " | ")
    else:
        blacklist_command = ""

    # 2. Take top N regions, extend peaks (min size 500), merge, remove blacklist, add whitelist, sort and merge
    # use -sorted in intersect command? Not worth it, both files are small
    return script(
        f"""
    #!/bin/bash
    bedtools sort -i {local_count_file} -faidx {local_chrom_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} | \
    bedtools intersect -b stdin -a {local_macs_peaks} -wa | \
    bedtools slop -i stdin -b {peak_extend} -g {local_chrom_sizes} | \
    awk '{{ l=$3-$2; if (l < {minPeakWidth}) {{ $2 = $2 - int(({minPeakWidth}-l)/2); $3 = $3 + int(({minPeakWidth}-l)/2) }} print $1"\t"$2"\t"$3}}' | \
    bedtools sort -i stdin -faidx {local_chrom_sizes} | \
    bedtools merge -i stdin | \
    {blacklist_command} \
    cut -f 1-3 | {whitelist_command} \
    bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools merge -i stdin > {local_outfile}
    """,
        inputs=[
            File(count_file).stage(File(local_count_file)),
            File(chrom_sizes).stage(File(local_chrom_sizes)),
            File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)),
            File(macs_peaks).stage(File(local_macs_peaks)),
            File(regions_whitelist).stage(File(local_regions_whitelist)),
            File(regions_blacklist).stage(File(local_regions_blacklist)),
        ],
        outputs={
            "candidate_enhancer_regions_file":
            File(outfile).stage(local_outfile),
            "candidate_enhancer_regions_path": outfile,
        },
    )
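The awk step in the pipeline above widens any peak shorter than minPeakWidth symmetrically; the same arithmetic as a small Python sketch (illustrative only, the integer division mirrors awk's int() truncation):

def pad_to_min_width(start: int, end: int, min_peak_width: int = 500):
    # Pad both sides so the interval reaches roughly min_peak_width.
    length = end - start
    if length < min_peak_width:
        pad = (min_peak_width - length) // 2   # int((minPeakWidth - l) / 2) in the awk
        start, end = start - pad, end + pad
    return start, end

# pad_to_min_width(1000, 1300) -> (900, 1400)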
Example #9
def make_candidate_regions_from_summits(
    count_file: str,
    macs_peaks: str,
    chrom_sizes: str,
    output_dir: str,
    tmpdir: str,
    regions_whitelist: str = None,
    regions_blacklist: str = None,
    n_enhancers: int = 175000,
    peak_extend: int = 250,
):
    ## Generate enhancer regions from MACS summits
    # 1. Count reads in dhs peaks
    # 2. Take top N regions, get summits, extend summits, merge
    makedirs(output_dir)
    makedirs(tmpdir)
    outfile = join(output_dir, basename(macs_peaks) + ".candidateRegions.bed")
    chrom_sizes_bed = ".".join([chrom_sizes, "bed"])
    # get tmpdir local files
    local_macs_peaks = join(tmpdir, basename(macs_peaks))
    local_chrom_sizes = join(tmpdir, basename(chrom_sizes))
    local_chrom_sizes_bed = ".".join([local_chrom_sizes, "bed"])
    local_regions_whitelist = join(tmpdir, basename(regions_whitelist))
    local_regions_blacklist = join(tmpdir, basename(regions_blacklist))
    local_outfile = join(tmpdir, basename(outfile))
    local_count_file = join(tmpdir, basename(count_file))
    if regions_whitelist:
        whitelist_command = ("( bedtools intersect -a " +
                             local_regions_whitelist + " -b " +
                             local_chrom_sizes_bed +
                             " -wa | cut -f1-3 && cat ) | ")
    else:
        whitelist_command = ""

    if regions_blacklist:
        blacklist_command = ("bedtools intersect -v -wa -a stdin -b " +
                             local_regions_blacklist + " | ")
    else:
        blacklist_command = ""

    # 2. Take top N regions, get summits, extend summits, merge, remove blacklist, add whitelist, sort and merge
    # use -sorted in intersect command? Not worth it, both files are small
    return script(
        f"""
    #!/bin/bash
    bedtools sort -i {local_count_file} -faidx {local_chrom_sizes} | bedtools merge -i stdin -c 4 -o max | sort -nr -k 4 | head -n {n_enhancers} | \
    bedtools intersect -b stdin -a {local_macs_peaks} -wa | \
    awk '{{print $1"\t"$2 + $10"\t"$2 + $10}}' | \
    bedtools slop -i stdin -b {peak_extend} -g {local_chrom_sizes} | \
    bedtools sort -i stdin -faidx {local_chrom_sizes} | \
    bedtools merge -i stdin |  \
    {blacklist_command} \
    cut -f 1-3 | {whitelist_command}  \
    bedtools sort -i stdin -faidx {local_chrom_sizes} | bedtools merge -i stdin > {local_outfile}
    """,
        inputs=[
            File(count_file).stage(File(local_count_file)),
            File(chrom_sizes).stage(File(local_chrom_sizes)),
            File(chrom_sizes_bed).stage(File(local_chrom_sizes_bed)),
            File(macs_peaks).stage(File(local_macs_peaks)),
            File(regions_whitelist).stage(File(local_regions_whitelist)),
            File(regions_blacklist).stage(File(local_regions_blacklist)),
        ],
        outputs={
            "candidate_enhancer_regions_file":
            File(outfile).stage(local_outfile),
            "candidate_enhancer_regions_path": outfile,
        },
    )
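The awk step above collapses each narrowPeak record to its summit: column 10 of the ENCODE narrowPeak format is the summit offset from the peak start, so start plus offset gives the absolute summit position, which bedtools slop then extends by peak_extend on each side. A small sketch of that calculation (the example line is made up):

def summit_position(fields):
    # fields: one tab-split narrowPeak line; fields[1] is the peak start,
    # fields[9] the 0-based summit offset from that start.
    start = int(fields[1])
    summit = start + int(fields[9])
    return fields[0], summit, summit   # zero-length interval, later slopped by peak_extend

# summit_position("chr1 10050 10500 peak_1 250 . 12.3 8.1 6.2 135".split())
# -> ('chr1', 10185, 10185)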