Ejemplo n.º 1
def gatk_realigner(align_bam, ref_file, config, dbsnp=None, region=None,
                   out_file=None, deep_coverage=False):
    """Realign a BAM file around indels using GATK, returning sorted BAM.

    The input BAM and the reference are indexed first. When ``region`` is
    given, the BAM is subset to that region with ``subset_bam_by_region`` and
    the subset is indexed before realignment.

    Returns the realigned BAM when the (possibly subset) input has aligned
    reads. Otherwise returns ``out_file`` after copying the input BAM to it,
    or the input BAM itself when no ``out_file`` was requested.
    """
    runner = broad.runner_from_config(config)
    bam.index(align_bam, config)
    runner.run_fn("picard_index_ref", ref_file)
    if region:
        align_bam = subset_bam_by_region(align_bam, region, out_file)
        bam.index(align_bam, config)
    if has_aligned_reads(align_bam, region):
        variant_regions = config["algorithm"].get("variant_regions", None)
        realign_target_file = gatk_realigner_targets(runner, align_bam,
                                                     ref_file, dbsnp, region,
                                                     out_file, deep_coverage,
                                                     variant_regions)
        realign_bam = gatk_indel_realignment(runner, align_bam, ref_file,
                                             realign_target_file, region,
                                             out_file, deep_coverage)
        # A separate Picard fixmate pass ("picard_fixmate") is no longer
        # required in recent GATK (> Feb 2011) -- now done on the fly.
        return realign_bam
    elif out_file:
        # Nothing to realign: still honor the requested output location.
        shutil.copy(align_bam, out_file)
        return out_file
    else:
        return align_bam
Ejemplo n.º 3
def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file,
                       data):
    """Retrieve summary of coverage in a region.

    Requires positive non-zero mapping quality at a position, matching GATK's
    CallableLoci defaults.

    Pipes ``sambamba view`` into ``bedtools genomecov`` and writes the result
    to ``<base_file without extension>-genomecov.bed``. If ``utils.file_exists``
    still reports no output afterwards, a fallback file is written with one
    zero-coverage line per feature from ``get_ref_bedtool`` for ``region``.

    ``depth`` is accepted for interface compatibility but is not used by the
    command built here.

    Returns the path to the coverage BED file.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(ref_file, data["config"])
            sambamba = config_utils.get_program("sambamba", data["config"])
            bedtools = config_utils.get_program("bedtools", data["config"])
            cmd = (
                "{sambamba} view -F 'mapping_quality > 0' -L {region_file} -f bam -l 1 {in_bam} | "
                "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                "> {tx_out_file}")
            # The {placeholders} in cmd are filled from the local variables
            # of the same name, so those names must not be changed.
            do.run(cmd.format(**locals()),
                   "bedtools genomecov: %s" % (str(region)), data)
    # Empty output file, no coverage for the whole contig
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" %
                                     (feat.chrom, feat.start, feat.end, 0))
    return out_file
Ejemplo n.º 4
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event

    Returns the path to ``<in_file without extension>-annotated.bed``, or
    ``in_file`` unchanged when no exon BED file is available or ``in_file``
    does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if not gene_file or not utils.file_exists(in_file):
        return in_file
    out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        # next() works on both Python 2 and 3, unlike the .next() method.
        input_rec = next(iter(pybedtools.BedTool(in_file)))
        num_fields = len(input_rec.fields)
        # keep everything after standard chrom/start/end, 1-based
        extra_fields = list(range(4, num_fields + 1))
        # keep the new gene annotation; the awk filter below rewrites this
        # column, so it has to be among the columns handed to bedtools merge
        gene_index = num_fields + 4
        extra_fields.append(gene_index)
        columns = ",".join([str(x) for x in extra_fields])
        max_column = max(extra_fields) + 1
        ops = ",".join(["distinct"] * len(extra_fields))
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            # swap over gene name to '.' if beyond maximum distance
            # cut removes the last distance column which can cause issues
            # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
            distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                               (max_distance, gene_index))
            sort_cmd = bedutils.get_sort_cmd()
            cmd = ("{sort_cmd} -k1,1 -k2,2n {in_file} | "
                   "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                   "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                   "{distance_filter} | cut -f 1-{max_column} | "
                   "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}")
            # The {placeholders} in cmd are filled from same-named locals.
            do.run(cmd.format(**locals()), "Annotate BED file with gene info")
    return out_file
Ejemplo n.º 6
def ref_file_from_bam(bam_file, data):
    """Subset a fasta input file to only a fraction of input contigs.

    The contigs kept are those reported by ``idxstats`` for ``bam_file``,
    skipping the ``*`` entry. The subset is written with ``seqtk subseq`` to
    ``<work_dir>/inputs/ref/<genome_build>-subset.fa`` and then indexed.

    Returns a dictionary ``{"base": <path to subset fasta>}``.
    """
    ref_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "inputs", "ref"))
    new_ref = os.path.join(ref_dir, "%s-subset.fa" % dd.get_genome_build(data))
    if not utils.file_exists(new_ref):
        with file_transaction(data, new_ref) as tx_out_file:
            # List of contig names handed to seqtk, one per line.
            contig_file = "%s-contigs.txt" % utils.splitext_plus(new_ref)[0]
            with open(contig_file, "w") as out_handle:
                for stat in idxstats(bam_file, data):
                    if stat.contig != "*":
                        out_handle.write("%s\n" % stat.contig)
            cmd = "seqtk subseq -l 100 %s %s > %s" % (dd.get_ref_file(data), contig_file, tx_out_file)
            do.run(cmd, "Subset %s to BAM file contigs" % dd.get_genome_build(data))
    # Index on every call, including when the subset fasta already existed.
    ref.fasta_idx(new_ref, data["config"])
    runner = broad.runner_from_path("picard", data["config"])
    runner.run_fn("picard_index_ref", new_ref)
    return {"base": new_ref}
Ejemplo n.º 7
def get_padded_bed_file(out_dir, bed_file, padding, data):
    """Write a padded copy of ``bed_file`` into ``out_dir`` with bedtools slop.

    The output is named ``<bed_file basename without extension>-padded.bed``
    and is only regenerated when ``utils.file_uptodate`` reports it is not
    current relative to ``bed_file``. ``padding`` is passed to
    ``bedtools slop -b``.

    Returns the path to the padded BED file.
    """
    bed_stem = utils.splitext_plus(os.path.basename(bed_file))[0]
    out_file = os.path.join(out_dir, "%s-padded.bed" % bed_stem)
    if not utils.file_uptodate(out_file, bed_file):
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            slop_cmd = "bedtools slop -i {bed_file} -g {fai_file} -b {padding} > {tx_out_file}"
            do.run(slop_cmd.format(bed_file=bed_file, fai_file=fai_file,
                                   padding=padding, tx_out_file=tx_out_file),
                   "Pad BED file", data)
    return out_file
Ejemplo n.º 8
def get_padded_bed_file(out_dir, bed_file, padding, data):
    """Write a padded, merged copy of ``bed_file`` into ``out_dir``.

    Intervals are padded with ``bedtools slop -b <padding>`` and the result is
    passed through ``bedtools merge``. The output is named
    ``<bed_file basename without extension>-padded.bed`` and is only
    regenerated when ``utils.file_uptodate`` reports it is not current
    relative to ``bed_file``.

    Returns the path to the padded BED file.
    """
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    out_file = os.path.join(out_dir, "%s-padded.bed" % (utils.splitext_plus(os.path.basename(bed_file))[0]))
    if utils.file_uptodate(out_file, bed_file):
        return out_file
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    with file_transaction(data, out_file) as tx_out_file:
        # Use the configured bedtools executable for both pipeline stages so
        # slop and merge cannot end up running different installations.
        cmd = "{bedtools} slop -i {bed_file} -g {fai_file} -b {padding} | {bedtools} merge -i - > {tx_out_file}"
        do.run(cmd.format(**locals()), "Pad BED file", data)
    return out_file
Ejemplo n.º 9
def _subset_bed_by_region(in_file, out_file, regions, ref_file, do_merge=True):
    """Write the parts of ``in_file`` that intersect ``regions`` to ``out_file``.

    in_file -- input BED file
    out_file -- destination BED file
    regions -- iterable of (chrom, start, end) tuples
    ref_file -- reference fasta; when given, its index is passed to the sort
    do_merge -- merge the remaining intervals before saving
    """
    region_lines = ["%s\t%s\t%s" % (c, s, e) for c, s, e in regions]
    orig_bed = pybedtools.BedTool(in_file)
    # The regions are passed as BED text rather than a file name, so
    # pybedtools must be told to parse the string itself.
    region_bed = pybedtools.BedTool("\n".join(region_lines) + "\n",
                                    from_string=True)
    sort_kwargs = {"faidx": ref.fasta_idx(ref_file)} if ref_file else {}
    subset = orig_bed.intersect(region_bed, nonamecheck=True).saveas().sort(**sort_kwargs).saveas().\
        filter(lambda x: len(x) > 1)
    if do_merge:
        subset.saveas().merge().saveas(out_file)
    else:
        subset.saveas(out_file)
Ejemplo n.º 10
def get_padded_bed_file(bed_file, padding, data, bedprep_dir=None):
    """Write a padded copy of ``bed_file`` with bedtools slop.

    The output goes to ``bedprep_dir``; when that is not given, a ``bedprep``
    directory under ``data["dirs"]["work"]`` is used. The file is named
    ``<bed_file basename without extension>-padded.bed`` and is only
    regenerated when ``utils.file_uptodate`` reports it is not current
    relative to ``bed_file``. ``padding`` is passed to ``bedtools slop -b``.

    Returns the path to the padded BED file.
    """
    bedprep_dir = bedprep_dir or utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "bedprep"))
    bed_stem = utils.splitext_plus(os.path.basename(bed_file))[0]
    out_file = os.path.join(bedprep_dir, "%s-padded.bed" % bed_stem)
    if not utils.file_uptodate(out_file, bed_file):
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            slop_cmd = "bedtools slop -i {bed_file} -g {fai_file} -b {padding} > {tx_out_file}"
            do.run(slop_cmd.format(bed_file=bed_file, fai_file=fai_file,
                                   padding=padding, tx_out_file=tx_out_file),
                   "Pad BED file", data)
    return out_file
Ejemplo n.º 13
def _prep_callable_bed(in_file, work_dir, stats, data):
    """Sort and merge callable BED regions to prevent SV double counting.

    Sorts ``in_file`` with gsort against the reference index, merges it with
    ``bedtools merge -d <stats["merge_size"]>`` and bgzips the result to
    ``<work_dir>/<in_file basename without extension>-merge.bed.gz``.

    Returns the result of ``vcfutils.bgzip_and_index`` on that file.
    """
    out_file = os.path.join(work_dir, "%s-merge.bed.gz" % utils.splitext_plus(os.path.basename(in_file))[0])
    gsort = config_utils.get_program("gsort", data)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            cmd = ("{gsort} {in_file} {fai_file} | bedtools merge -i - -d {stats[merge_size]} | "
                   "bgzip -c > {tx_out_file}")
            # The {placeholders} in cmd are filled from same-named locals.
            do.run(cmd.format(**locals()), "Prepare SV callable BED regions")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Ejemplo n.º 14
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    One output file is produced per line of the reference fasta index, named
    ``<in_file stem>-<chrom>.vcf`` inside ``out_dir``. ``out_dir`` defaults to
    a ``split`` directory next to ``in_file``.

    Returns the list of per-chromosome output files, in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    out_files = []
    with open(ref.fasta_idx(ref_file, config)) as in_handle:
        for line in in_handle:
            # The first two columns of the index are contig name and length.
            chrom, size = line.split()[:2]
            out_file = os.path.join(out_dir,
                                    os.path.basename(replace_suffix(append_stem(in_file, "-%s" % chrom), ".vcf")))
            subset_vcf(in_file, (chrom, 0, size), out_file, config)
            # Record each file so the caller actually receives the split outputs.
            out_files.append(out_file)
    return out_files
Ejemplo n.º 15
def subset_by_genes(in_file, data, out_dir, pad):
    """Subset BED file of regions to only those within pad of the final output.

    Exon regions from ``regions.get_sv_bed`` are padded by ``pad`` and merged
    into a target region file, and ``in_file`` is intersected against it.

    Returns ``<out_dir>/<in_file basename without extension>-geneonly.bed``,
    or ``in_file`` unchanged when no exon BED file is available or ``in_file``
    does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    fai_file = ref.fasta_idx(dd.get_ref_file(data))
    if not gene_file or not utils.file_exists(in_file):
        return in_file
    else:
        out_file = os.path.join(out_dir, "%s-geneonly.bed" % utils.splitext_plus(os.path.basename(in_file))[0])
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                want_region_file = "%s-targetregions%s" % utils.splitext_plus(out_file)
                pybedtools.BedTool(gene_file).slop(g=fai_file, b=pad).merge().saveas(want_region_file)
                # Write the subset itself; without this step nothing is ever
                # saved to the transactional output.
                # NOTE(review): intersect-then-sort is inferred from the
                # docstring -- confirm against the intended behavior.
                pybedtools.BedTool(in_file).intersect(b=want_region_file).sort().saveas(tx_out_file)
        return out_file
Ejemplo n.º 16
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    work_dir -- optional directory for the annotated file; defaults to the
                directory of ``in_file``

    Returns the path to the ``-annotated.bed`` file, or ``in_file`` unchanged
    when no exon BED file is available or ``in_file`` does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if not gene_file or not utils.file_exists(in_file):
        return in_file
    out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        fai_file = ref.fasta_idx(dd.get_ref_file(data))
        with file_transaction(data, out_file) as tx_out_file:
            _add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file, data, max_distance)
    return out_file
Ejemplo n.º 18
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Split a VCF file into separate files by chromosome.

    One output file is produced per line of the reference fasta index, named
    ``<in_file stem>-<chrom>.vcf`` inside ``out_dir``. ``out_dir`` defaults to
    a ``split`` directory next to ``in_file``.

    Returns the list of per-chromosome output files, in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    out_files = []
    with open(ref.fasta_idx(ref_file, config)) as in_handle:
        for line in in_handle:
            # The first two columns of the index are contig name and length.
            chrom, size = line.split()[:2]
            out_file = os.path.join(
                out_dir,
                os.path.basename(
                    replace_suffix(append_stem(in_file, "-%s" % chrom),
                                   ".vcf")))
            subset_vcf(in_file, (chrom, 0, size), out_file, config)
            # Record each file so the caller actually receives the split outputs.
            out_files.append(out_file)
    return out_files
Ejemplo n.º 19
def add_genes(in_file, data, max_distance=10000):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event

    Returns the path to ``<in_file without extension>-annotated.bed``, or
    ``in_file`` unchanged when no exon BED file is available or ``in_file``
    does not exist.
    """
    gene_file = regions.get_sv_bed(data, "exons",
                                   out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if not utils.file_uptodate(out_file, in_file):
            # next() works on both Python 2 and 3, unlike the .next() method.
            input_rec = next(iter(pybedtools.BedTool(in_file)))
            # keep everything after standard chrom/start/end, 1-based
            extra_fields = list(range(4, len(input_rec.fields) + 1))
            # keep the new gene annotation; the awk filter below rewrites this
            # column, so it has to be among the columns handed to bedtools merge
            gene_index = len(input_rec.fields) + 4
            extra_fields.append(gene_index)
            columns = ",".join([str(x) for x in extra_fields])
            max_column = max(extra_fields) + 1
            ops = ",".join(["distinct"] * len(extra_fields))
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                # swap over gene name to '.' if beyond maximum distance
                # cut removes the last distance column which can cause issues
                # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
                distance_filter = (
                    r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'"""
                    % (max_distance, gene_index))
                sort_cmd = bedutils.get_sort_cmd()
                cmd = (
                    "{sort_cmd} -k1,1 -k2,2n {in_file} | "
                    "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
                    "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {gene_file}) | "
                    "{distance_filter} | cut -f 1-{max_column} | "
                    "bedtools merge -i - -c {columns} -o {ops} -delim ',' > {tx_out_file}"
                )
                # The {placeholders} in cmd are filled from same-named locals.
                do.run(cmd.format(**locals()),
                       "Annotate BED file with gene info")
        return out_file
    else:
        return in_file
