def variants(data):
    if not "vrn_file" in  data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_cg-depth-parse.tsv")
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(**locals()), "GC bias for %s" % in_vcf)

        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                with open(out_tx, 'w') as out_handle:
                    out_handle.write("CG\tdepth\tsample\n")
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(**locals()), "query for %s" % in_vcf)
                logger.debug('parsing coverage: %s' % sample)
        # return df
        return data
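The snippet above shows the idempotent-output idiom these examples share: skip finished work with file_exists, write into a temporary path supplied by file_transaction, and let the context manager move the file into place only on success. A minimal self-contained sketch of that idiom, using a hypothetical simple_file_transaction helper for illustration rather than bcbio's implementation:

import os
import shutil
import tempfile
from contextlib import contextmanager

@contextmanager
def simple_file_transaction(final_path):
    # Write to a temporary file; rename into place only if the block succeeds,
    # so an interrupted run never leaves a truncated output behind.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(final_path) or ".")
    os.close(fd)
    try:
        yield tmp_path
        shutil.move(tmp_path, final_path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

if not os.path.exists("out.txt"):
    with simple_file_transaction("out.txt") as tx:
        with open(tx, "w") as handle:
            handle.write("done\n")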
Example #2
def align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
          names=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config_utils.get_program("bwa", config), align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file
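A hypothetical invocation of this wrapper, for orientation; the paths and the minimal data dictionary are illustrative assumptions, not bcbio's actual run layout:

data = {"config": {"resources": {}, "algorithm": {}}}  # stand-in config
sam_file = align("sample_1.fastq", "sample_2.fastq", "/refs/GRCh37.fa",
                 "sample1", "align", data)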
Example #3
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
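For orientation, a sketch of how this helper might be called after alignment; the BAM path and work directory are hypothetical:

dedup_bam, sr_bam, disc_bam = _extract_split_and_discordants(
    "sample-sort.bam", "sv-work", data)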
Example #4
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform a BWA alignment, generating a SAM file.
    """
    assert not data.get("align_split"), "Do not handle split alignments with non-piped bwa"
    config = data["config"]
    sai1_file = os.path.join(align_dir, "%s_1.sai" % names["lane"])
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % names["lane"])
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % names["lane"])
    if not utils.file_exists(sam_file):
        if not utils.file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not utils.file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        rg_info = novoalign.get_rg_info(names)
        sam_cl = [config_utils.get_program("bwa", config), align_type, "-r", "'%s'" % rg_info,
                  ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            cmd = "{cl} > {out_file}".format(cl=" ".join(sam_cl), out_file=tx_sam_file)
            do.run(cmd, "bwa {align_type}".format(**locals()), None)
    return sam_file
Example #5
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #6
File: bwa.py Project: vals/bcbb
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    """Perform a BWA alignment, generating a SAM file.
    """
    sai1_file = os.path.join(align_dir, "%s_1.sai" % out_base)
    sai2_file = (os.path.join(align_dir, "%s_2.sai" % out_base)
                 if pair_file else None)
    sam_file = os.path.join(align_dir, "%s.sam" % out_base)
    if not file_exists(sam_file):
        if not file_exists(sai1_file):
            with file_transaction(sai1_file) as tx_sai1_file:
                _run_bwa_align(fastq_file, ref_file, tx_sai1_file, config)
        if sai2_file and not file_exists(sai2_file):
            with file_transaction(sai2_file) as tx_sai2_file:
                _run_bwa_align(pair_file, ref_file, tx_sai2_file, config)
        align_type = "sampe" if sai2_file else "samse"
        sam_cl = [config["program"]["bwa"], align_type, ref_file, sai1_file]
        if sai2_file:
            sam_cl.append(sai2_file)
        sam_cl.append(fastq_file)
        if sai2_file:
            sam_cl.append(pair_file)
        with file_transaction(sam_file) as tx_sam_file:
            with open(tx_sam_file, "w") as out_handle:
                logger.info(" ".join(sam_cl))
                subprocess.check_call(sam_cl, stdout=out_handle)
    return sam_file
Example #7
def _segment_normalized_gatk(cnr_file, work_dir, paired):
    """Segmentation of normalized inputs using GATK4, converting into standard input formats.
    """
    work_dir = utils.safe_makedir(os.path.join(work_dir, "gatk-cnv"))
    seg_file = gatkcnv.model_segments(cnr_file, work_dir, paired)["seg"]
    std_seg_file = seg_file.replace(".cr.seg", ".seg")
    if not utils.file_uptodate(std_seg_file, seg_file):
        with file_transaction(std_seg_file) as tx_out_file:
            df = pd.read_csv(seg_file, sep="\t", comment="@", header=0,
                             names=["chrom", "loc.start", "loc.end", "num.mark", "seg.mean"])
            df.insert(0, "ID", [dd.get_sample_name(paired.tumor_data)] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    std_cnr_file = os.path.join(work_dir, "%s.cnr" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_uptodate(std_cnr_file, cnr_file):
        with file_transaction(std_cnr_file) as tx_out_file:
            logdf = pd.read_csv(cnr_file, sep="\t", comment="@", header=0,
                                names=["chrom", "start", "end", "log2"])
            covdf = pd.read_csv(tz.get_in(["depth", "bins", "antitarget"], paired.tumor_data),
                                sep="\t", header=None,
                                names=["chrom", "start", "end", "orig.name", "depth", "gene"])
            df = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
            del df["orig.name"]
            df = df[["chrom", "start", "end", "gene", "log2", "depth"]]
            df.insert(6, "weight", [1.0] * len(df))
            df.to_csv(tx_out_file, sep="\t", header=True, index=False)
    return std_cnr_file, std_seg_file
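The merge-and-reorder step in the middle of this function is easiest to see on toy data; a runnable sketch with in-memory frames (values are illustrative):

import pandas as pd

logdf = pd.DataFrame({"chrom": ["1"], "start": [100], "end": [200], "log2": [0.5]})
covdf = pd.DataFrame({"chrom": ["1"], "start": [100], "end": [200],
                      "orig.name": ["bin1"], "depth": [42.0], "gene": ["TP53"]})
df = pd.merge(logdf, covdf, on=["chrom", "start", "end"])
del df["orig.name"]
df = df[["chrom", "start", "end", "gene", "log2", "depth"]]
df.insert(6, "weight", [1.0] * len(df))  # constant weight column appended last
print(df)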
Example #8
def gtf_to_fasta(gtf, ref_fasta, cds=False, out_file=None):
    """
    convert a GTF to FASTA format if cds=True, use the start/stop codons
    to output only the CDS
    """
    if out_file and file_exists(out_file):
        return out_file

    if not out_file:
        out_file = tempfile.NamedTemporaryFile(delete=False, suffix=".fa").name

    tmp_file = out_file + ".tmp"
    if cds:
        cmd = "gffread -g {ref_fasta} -x {tx_tmp_file} {gtf}"
    else:
        cmd = "gffread -g {ref_fasta} -w {tx_tmp_file} {gtf}"
    message = "Converting %s to FASTA format." % gtf
    with file_transaction(tmp_file) as tx_tmp_file:
        do.run(cmd.format(**locals()), message)

    with file_transaction(out_file) as tx_out_file:
        with open(tmp_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith(">"):
                    line = line.split()[0] + "\n"
                out_handle.write(line)
    return out_file
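A hedged usage sketch; the GTF and genome FASTA paths are hypothetical:

cds_fasta = gtf_to_fasta("annotation.gtf", "genome.fa", cds=True,
                         out_file="transcripts-cds.fa")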
Example #9
def _add_variantcalls_to_output(out, data):
    """Call ploidy and convert into VCF and BED representations.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    calls = {}
    for outformat in ["bed", "vcf"]:
        out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat)
        calls[outformat] = out_file
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                       outformat, "--sample-id", dd.get_sample_name(data),
                       "--ploidy", str(dd.get_ploidy(data)),
                       "-o", tx_out_file, call_file]
                if gender and gender.lower() == "male":
                    cmd += ["--male-reference"]
                do.run(cmd, "CNVkit export %s" % outformat)
    out["call_file"] = call_file
    out["vrn_bed"] = annotate.add_genes(calls["bed"], data)
    effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff")
    out["vrn_file"] = effects_vcf or calls["vcf"]
    return out
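A sketch of the input/output contract, assuming a hypothetical CNVkit segment file; the function fills in the call, BED and VCF entries:

out = {"cns": "sample.cns"}
out = _add_variantcalls_to_output(out, data)
# out now carries "call_file", "vrn_bed" and "vrn_file"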
Example #10
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files, log_file, config):
    """Trimming with cutadapt, using version installed with bcbio-nextgen.

    Uses the system executable to find the version next to our Anaconda Python.
    TODO: Could we use cutadapt as a library to avoid this?
    """
    if all([file_exists(x) for x in out_files]):
        return out_files
    cmd = _cutadapt_trim_cmd(fastq_files, quality_format, adapters, out_files)
    if len(fastq_files) == 1:
        of = [out_files[0], log_file]
        message = "Trimming %s in single end mode with cutadapt." % (fastq_files[0])
        with file_transaction(config, of) as of_tx:
            of1_tx, log_tx = of_tx
            do.run(cmd.format(**locals()), message)
    else:
        of = out_files + [log_file]
        with file_transaction(config, of) as tx_out_files:
            of1_tx, of2_tx, log_tx = tx_out_files
            tmp_fq1 = append_stem(of1_tx, ".tmp")
            tmp_fq2 = append_stem(of2_tx, ".tmp")
            singles_file = of1_tx + ".single"
            message = "Trimming %s and %s in paired end mode with cutadapt." % (fastq_files[0],
                                                                                fastq_files[1])
            do.run(cmd.format(**locals()), message)
    return out_files
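A hypothetical paired-end invocation; the quality format, adapter sequence and file names are illustrative only:

out_files = _cutadapt_trim(["s_1.fq", "s_2.fq"], "standard",
                           ["AGATCGGAAGAGC"], ["s_1.trim.fq", "s_2.trim.fq"],
                           "cutadapt.log", config)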
Example #11
def _add_bed_to_output(out, data):
    """Call ploidy and convert into BED representation.
    """
    call_file = "%s-call%s" % os.path.splitext(out["cns"])
    gender = dd.get_gender(data)
    if not utils.file_exists(call_file):
        with file_transaction(data, call_file) as tx_call_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call",
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_call_file, out["cns"]]
            if gender:
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            do.run(cmd, "CNVkit call ploidy")
    out_file = "%s.bed" % os.path.splitext(call_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export",
                   "bed", "--sample-id", dd.get_sample_name(data),
                   "--ploidy", str(dd.get_ploidy(data)),
                   "-o", tx_out_file, call_file]
            if gender and gender.lower() == "male":
                cmd += ["--male-reference"]
            do.run(cmd, "CNVkit export BED")
    out["call_file"] = call_file
    out["vrn_file"] = annotate.add_genes(out_file, data)
    return out
Example #12
def cram_compress(in_bam):
    import os
    import subprocess
    from bcbio import utils
    from bcbio.distributed.transaction import file_transaction
    print(in_bam)
    ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"
    out_file = "%s.cram" % os.path.splitext(in_bam)[0]
    jvm_opts = "-Xms1g -Xmx3g"
    if not utils.file_exists(out_file):
        print "cramming", out_file
        with file_transaction(out_file) as tx_out_file:
            cmd = ("cramtools {jvm_opts} cram "
                   "--input-bam-file {in_bam} "
                   "--capture-all-tags "
                   "--ignore-tags 'BD:BI' "
                   "--reference-fasta-file {ref_file} "
                   "--lossy-quality-score-spec '*8' "
                   "--output-cram-file {tx_out_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if not utils.file_exists(out_file + ".crai"):
        print "indexing", out_file + ".crai"
        with file_transaction(out_file + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(out_file, tx_in_file)
            cmd = ("cramtools {jvm_opts} index "
                   "--input-file {tx_in_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists(in_bam) and utils.file_exists(out_file):
        if in_bam != out_file and in_bam.endswith(".bam") and out_file.endswith(".cram"):
            os.remove(in_bam)
    return out_file
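A usage sketch with a hypothetical input; note the function removes the source BAM once the CRAM and .crai index both exist:

cram_file = cram_compress("/data/sample.bam")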
Example #13
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
Example #14
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #15
def calc_variants_stats(data, args):
    in_vcf = data['vcf']
    ref_file = args.reference
    # gatk_jar = '/groups/bcbio/bcbio/toolplus/gatk/3.2-2-gec30cee/GenomeAnalysisTK.jar'
    jvm_opts = broad.get_gatk_framework_opts(data['config'])
    gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
    bed_file = args.region
    sample = splitext_plus(op.basename(in_vcf))[0]
    in_bam = data['bam']
    cg_file = op.join(args.out, sample + "_with-gc.vcf.gz")
    parse_file = op.join(args.out, sample + "_cg-depth-parse.tsv")
    if not file_exists(cg_file):
        with file_transaction(cg_file) as tx_out:
            cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                   "-L {bed_file} -I {in_bam} "
                   "-A GCContent --variant {in_vcf} --out {tx_out}")
            do.run(cmd.format(**locals()), "GC annotation for %s" % in_vcf)

    if not file_exists(parse_file):
        with file_transaction(parse_file) as out_tx:
            with open(out_tx, 'w') as out_handle:
                out_handle.write("CG\tdepth\tsample\n")
            cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R {bed_file} {cg_file} >> {out_tx}")
            do.run(cmd.format(**locals()), "query for %s" % in_vcf)
            logger.info('parsing coverage: %s' % sample)
    # return df
    return parse_file
Example #16
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
Example #17
def calculate_tstv(args):
    """
    get tstv from bcftools stat for all, known and new variants
    """
    tstv = defaultdict(list)
    for in_vcf in args.bams:
        out_file = os.path.splitext(in_vcf)[0] + ".stats"
        known_file = os.path.splitext(in_vcf)[0] + ".known.stats"
        new_file = os.path.splitext(in_vcf)[0] + ".new.stats"
        sample = os.path.basename(in_vcf).split("-")[0]
        if not file_exists(out_file):
            with file_transaction(out_file) as tx_out:
                cmd = ("bcftools stats {in_vcf} > {tx_out}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        if not file_exists(new_file):
            with file_transaction(new_file) as tx_new:
                cmd = ("bcftools filter -i DB=0 {in_vcf} | bcftools stats /dev/stdin > {tx_new}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        if not file_exists(known_file):
            with file_transaction(known_file) as tx_known:
                cmd = ("bcftools filter -i DB=1 {in_vcf} | bcftools stats /dev/stdin > {tx_known}")
                do.run(cmd.format(**locals()), "ts/tv ratio for %s" % in_vcf)
        for fn, name in zip([out_file, known_file, new_file], ['all', 'known', 'new']):
            with open(fn) as in_handle:
                for line in in_handle:
                    if line.startswith("TSTV"):
                        tstv[sample].append(line.split()[4])
                        break
    df = pd.DataFrame(tstv, index=['all', 'known', 'new'])
    df.to_csv(args.out)
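The loop above keys off the TSTV summary rows emitted by bcftools stats; a runnable sketch of the extraction on an illustrative line (values invented, column layout taken from the code):

line = "TSTV\t0\t1398\t685\t2.04\t1390\t680\t2.04"  # illustrative stats row
if line.startswith("TSTV"):
    print(line.split()[4])  # -> "2.04", the ts/tv ratio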
Example #18
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    basename = os.path.splitext(os.path.basename(raw_target_bed))[0]
    target_bed = os.path.join(work_dir, "%s-%s.target.bed" % (basename, batch))
    # back compatible with previous runs to avoid re-calculating
    target_bed_old = os.path.join(work_dir, "%s.target.bed" % basename)
    if utils.file_exists(target_bed_old):
        target_bed = target_bed_old
    if not utils.file_exists(target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data)
            if bin_estimates.get("target"):
                cmd += ["--avg-size", str(bin_estimates["target"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s-%s.antitarget.bed" % (basename, batch))
    antitarget_bed_old = os.path.join(work_dir, "%s.antitarget.bed" % basename)
    # back compatible with previous runs to avoid re-calculating
    if os.path.exists(antitarget_bed_old):
        antitarget_bed = antitarget_bed_old
    if not os.path.exists(antitarget_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            bin_estimates = _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data)
            if bin_estimates.get("antitarget"):
                cmd += ["--avg-size", str(bin_estimates["antitarget"])]
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit antitarget")
    return target_bed, antitarget_bed
Example #19
def _bgzip_from_cram(cram_file, dirs, data):
    """Create bgzipped fastq files from an input CRAM file in regions of interest.

    Returns a list with a single file, for single end CRAM files, or two
    files for paired end input.
    """
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data) in ["regional", "exome"]
                   else None)
    if region_file:
        regions = ["%s:%s-%s" % tuple(r) for r in pybedtools.BedTool(region_file)]
    else:
        regions = [None]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s-%s.fq.gz" %
                                          (utils.splitext_plus(os.path.basename(cram_file))[0], fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_s) and not utils.file_exists(out_p1):
        cram.index(cram_file)
        fastqs = _cram_to_fastq_regions(regions, cram_file, dirs, data)
        if len(fastqs[0]) == 1:
            with file_transaction(out_s) as tx_out_file:
                _merge_and_bgzip([xs[0] for xs in fastqs], tx_out_file, out_s)
        else:
            for i, out_file in enumerate([out_p1, out_p2]):
                ext = "/%s" % (i + 1)
                with file_transaction(out_file) as tx_out_file:
                    _merge_and_bgzip([xs[i] for xs in fastqs], tx_out_file, out_file, ext)
    if utils.file_exists(out_p1):
        return [out_p1, out_p2]
    else:
        assert utils.file_exists(out_s)
        return [out_s]
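The region strings handed to the CRAM extraction come from plain tuple formatting over BED intervals; a tiny runnable sketch:

r = ("chr1", 1000, 2000)        # (chrom, start, end) from one BED interval
region = "%s:%s-%s" % tuple(r)  # -> "chr1:1000-2000"
print(region)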
Example #20
def _prioritize_vcf(caller, vcf_file, prioritize_by, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                cmd = "simple_sv_annotation.py -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(simple_vcf, data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = (
                "zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                """'{{if (($7 == "PASS" || $7 == ".")) """
                "print CALLER,SNAME,$1,$2,I$END,I$SVTYPE,I$KNOWN,I$LOF,I$SIMPLE_ANN,"
                "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #21
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Example #22
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Example #23
def variants(data, out_dir):
    """Variants QC metrics"""
    if not "variants" in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                            "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                            "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied into the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
        return qc_file
Example #24
def trim_adapters(fastq_files, dirs, config):
    QUALITY_CUTOFF = 5
    to_trim = _get_sequences_to_trim(config, ALIENTRIMMER_ADAPTERS)
    resources = config_utils.get_resources("AlienTrimmer", config)
    try:
        jarpath = config_utils.get_program("AlienTrimmer", config, "dir")
    # fall back on Cutadapt if AlienTrimmer is not installed
    # XXX: remove after it has been live for a while
    except Exception:
        return trim_read_through(fastq_files, dirs, config)
    jarfile = config_utils.get_jar("AlienTrimmer", jarpath)
    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]))
    base_cmd = ("java -jar {jvm_opts} {jarfile} -k 10 -l 20 ")
    fastq1 = fastq_files[0]
    supplied_quality_format = _get_quality_format(config)
    cores = config["algorithm"].get("num_cores", 0)
    out_files = _get_read_through_trimmed_outfiles(fastq_files, dirs)
    fastq1_out = out_files[0]
    if supplied_quality_format == "illumina":
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][0]
    else:
        quality_flag = QUALITY_FLAGS[QUALITY_CUTOFF][1]
    quality_flag = '-q ' + quality_flag
    if len(fastq_files) == 1:
        if file_exists(fastq1_out):
            return [fastq1_out]
        base_cmd += ("-i {fastq1} -o {tx_fastq1_out} -c {temp_file} "
                     "{quality_flag}")
        message = "Trimming %s from %s with AlienTrimmer." % (to_trim, fastq1)
    else:
        fastq2 = fastq_files[1]
        fastq2_out = out_files[1]
        if all(map(file_exists, [fastq1_out, fastq2_out])):
            return [fastq1_out, fastq2_out]
        base_cmd += ("-if {fastq1} -ir {fastq2} -of {tx_fastq1_out} "
                     "-or {tx_fastq2_out} -c {temp_file} {quality_flag}")
        message = ("Trimming %s from %s and %s with AlienTrimmer."
                   % (to_trim, fastq1, fastq2))
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        temp_file = temp.name
        for adapter in to_trim:
            temp.write(adapter + "\n")

    if len(fastq_files) == 1:
        with file_transaction(fastq1_out) as tx_fastq1_out:
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out]
    else:
        with file_transaction([fastq1_out, fastq2_out]) as tx_out_files:
            tx_fastq1_out = tx_out_files[0]
            tx_fastq2_out = tx_out_files[1]
            do.run(base_cmd.format(**locals()), message)
        return [fastq1_out, fastq2_out]
Example #25
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}"
                    )
                    do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug("parsing coverage: %s" % sample)
        return data
Example #26
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50x worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example #27
def merge_bam_files(bam_files, work_dir, config, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    if len(bam_files) == 1:
        bam.index(bam_files[0], config)
        return bam_files[0]
    else:
        if out_file is None:
            out_file = os.path.join(work_dir, os.path.basename(sorted(bam_files)[0]))
        if batch is not None:
            base, ext = os.path.splitext(out_file)
            out_file = "%s-b%s%s" % (base, batch, ext)
        if not utils.file_exists(out_file):
            sambamba = config_utils.get_program("sambamba", config)
            samtools = config_utils.get_program("samtools", config)
            samblaster = config_utils.get_program("samblaster", config)
            resources = config_utils.get_resources("samtools", config)
            num_cores = config["algorithm"].get("num_cores", 1)
            max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                 2, "decrease").upper()
            # sambamba opens 4 handles per file, so try to guess a reasonable batch size
            batch_size = (system.open_file_limit() // 4) - 100
            if len(bam_files) > batch_size:
                bam_files = [merge_bam_files(xs, work_dir, config, out_file, i)
                             for i, xs in enumerate(utils.partition_all(batch_size, bam_files))]
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(config, out_file) as tx_out_file:
                        with file_transaction(config, "%s.list" % os.path.splitext(out_file)[0]) as tx_bam_file_list:
                            with open(tx_bam_file_list, "w") as out_handle:
                                for f in sorted(bam_files):
                                    out_handle.write("%s\n" % f)
                            if bam.bam_already_sorted(bam_files[0], config, "coordinate"):
                                cmd = _sambamba_merge(bam_files)
                            else:
                                assert config.get("mark_duplicates", True)
                                cmd = _biobambam_merge_dedup()
                            do.run(cmd.format(**locals()), "Merge bam files to %s" % os.path.basename(out_file),
                                   None)
            # Ensure timestamps are up to date on output file and index
            # Works around issues on systems with inconsistent times
            for ext in ["", ".bai"]:
                if os.path.exists(out_file + ext):
                    subprocess.check_call(["touch", out_file + ext])
            for b in bam_files:
                utils.save_diskspace(b, "BAM merged to %s" % out_file, config)
        bam.index(out_file, config)
        return out_file
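The batching step relies on chunking the BAM list; a minimal stand-in for the toolz-style utils.partition_all used above (illustrative, not bcbio's implementation):

def partition_all(n, items):
    # Yield successive chunks of at most n items.
    items = list(items)
    for i in range(0, len(items), n):
        yield items[i:i + n]

print(list(partition_all(2, [1, 2, 3, 4, 5])))  # -> [[1, 2], [3, 4], [5]]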
Example #28
def combine_sailfish(samples):
    work_dir = dd.get_in_samples(samples, dd.get_work_dir)
    sailfish_dir = os.path.join(work_dir, "sailfish")
    gtf_file = dd.get_in_samples(samples, dd.get_gtf_file)
    dont_combine, to_combine = partition(dd.get_sailfish,
                                         dd.sample_data_iterator(samples), True)
    if not to_combine:
        return samples

    tidy_file = os.path.join(sailfish_dir, "combined.sf")
    transcript_tpm_file = os.path.join(sailfish_dir, "combined.isoform.sf.tpm")
    gene_tpm_file = os.path.join(sailfish_dir, "combined.gene.sf.tpm")
    tx2gene = os.path.join(sailfish_dir, "tx2gene.csv")
    if not all([file_exists(x) for x in [gene_tpm_file, tidy_file,
                                         transcript_tpm_file, tx2gene]]):
        logger.info("Combining count files into %s." % tidy_file)
        df = pd.DataFrame()
        for data in to_combine:
            sailfish_file = dd.get_sailfish(data)
            samplename = dd.get_sample_name(data)
            new_df = _sailfish_expression_parser(sailfish_file, samplename)
            if df.empty:
                df = new_df
            else:
                df = rbind([df, new_df])
        df["id"] = df.index
        # some versions of the transcript annotations can have duplicated entries
        df = df.drop_duplicates(["id", "sample"])
        with file_transaction(tidy_file) as tx_out_file:
            df.to_csv(tx_out_file, sep="\t", index_label="name")
        with file_transaction(transcript_tpm_file) as tx_out_file:
            df.pivot("id", "sample", "tpm").to_csv(tx_out_file, sep="\t")
        with file_transaction(gene_tpm_file) as tx_out_file:
            pivot = df.pivot("id", "sample", "tpm")
            tdf = pd.DataFrame.from_dict(gtf.transcript_to_gene(gtf_file),
                                         orient="index")
            tdf.columns = ["gene_id"]
            pivot = pivot.join(tdf)
            pivot = pivot.groupby("gene_id").agg(np.sum)
            pivot.to_csv(tx_out_file, sep="\t")
        tx2gene = gtf.tx2genefile(gtf_file, tx2gene)
        logger.info("Finished combining count files into %s." % tidy_file)

    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_sailfish_tidy(data, tidy_file)
        data = dd.set_sailfish_transcript_tpm(data, transcript_tpm_file)
        data = dd.set_sailfish_gene_tpm(data, gene_tpm_file)
        data = dd.set_tx2gene(data, tx2gene)
        updated_samples.append([data])
    return updated_samples
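_sailfish_expression_parser is not included in this excerpt; a plausible sketch, assuming a salmon/sailfish-style quant.sf with Name, TPM and NumReads columns (the column names are assumptions):
def _sailfish_expression_parser(sailfish_file, samplename):
    # hypothetical sketch: load quant.sf into a tidy per-sample dataframe
    # indexed by transcript id, as expected by the pivots above
    df = pd.read_csv(sailfish_file, sep="\t", index_col=0)
    df = df.rename(columns={"TPM": "tpm", "NumReads": "numreads"})
    df["sample"] = samplename
    return df[["tpm", "numreads", "sample"]]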
Example #29
File: trim.py Project: roryk/bipy
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program",
                                                    "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            logger.error("Quality format could not be handled. It was "
                         "detected or set as %s but should be illumina "
                         "or sanger." % quality_format)
            sys.exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                                   dir=self.out_dir)
            # trim off adapters
            cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                    quality_base=quality_base, out=temp_cut.name))
            do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
            with file_transaction(out_file) as temp_out:
                polya = ADAPTERS.get("polya")
                # trim off polya
                cmd = str(cutadapt.bake(temp_cut.name, self.options, "-a",
                                        polya, "-a", self._rc_adapters(polya),
                                        quality_base=quality_base, out=temp_out))
                do.run(cmd, "Cutadapt trim of polyA tail of %s." % (temp_cut.name),
                       None)
            return out_file
        else:
            with file_transaction(out_file) as temp_out:
                cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                        quality_base=quality_base, out=temp_out))
                do.run(cmd, "Cutadapt trim of %s." % in_file, None)
            return out_file
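_rc_adapters is not shown here; from the call site it should reverse-complement the polyA adapter so both orientations are trimmed. A minimal sketch (the name and behavior are inferred from the call, not verified):
    def _rc_adapters(self, seq):
        # hypothetical sketch: reverse-complement an adapter sequence
        complement = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
        return "".join(complement[base] for base in reversed(seq.upper()))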
Example #30
def _cnvkit_targets(raw_target_bed, access_bed, cov_interval, work_dir, data):
    """Create target and antitarget regions from target and access files.
    """
    target_bed = os.path.join(work_dir, "%s.target.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not utils.file_uptodate(target_bed, raw_target_bed):
        with file_transaction(data, target_bed) as tx_out_file:
            cmd = [_get_cmd(), "target", raw_target_bed, "--split", "-o", tx_out_file]
            do.run(cmd, "CNVkit target")
    antitarget_bed = os.path.join(work_dir, "%s.antitarget.bed" % os.path.splitext(os.path.basename(raw_target_bed))[0])
    if not utils.file_uptodate(antitarget_bed, target_bed):
        with file_transaction(data, antitarget_bed) as tx_out_file:
            cmd = [_get_cmd(), "antitarget", "-g", access_bed, target_bed, "-o", tx_out_file]
            do.run(cmd, "CNVkit antitarget")
    return target_bed, antitarget_bed
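_get_cmd is shared by the CNVkit examples but not shown; a minimal sketch, assuming cnvkit.py is installed alongside the active Python interpreter (the lookup strategy is an assumption; os and sys imports are implied):
def _get_cmd(cmd="cnvkit.py"):
    # hypothetical sketch: resolve the CNVkit script next to the running Python
    return os.path.join(os.path.dirname(os.path.realpath(sys.executable)), cmd)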
Example #31
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                input_beds = [x for x in (_create_bed(c, out_file, data)
                                          for c in calls) if x is not None]
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True)\
                      .merge(c=4, o="distinct", delim=",").saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
Example #32
def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools"""
    cmd = ("{bam_coverage} --bam {bam_input} --outFileName {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores} ")
    size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    resources = config_utils.get_resources("bamCoverage", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
Example #33
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = [
            "-T", "HaplotypeCaller", "-R", ref_file, "-I", split_bam, "-o",
            tx_out_file, "-nct",
            str(num_cores), "--emitRefConfidence", "GVCF",
            "--variant_index_type", "LINEAR", "--variant_index_parameter",
            "128000", "-dontUseSoftClippedBases"
        ]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #34
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for data in items:
        sample_file = _select_sample(data, variant_file, work_dir)
        if "sv" not in data:
            data["sv"] = []
        effects_vcf, _ = effects.add_to_vcf(sample_file, data, "snpeff")
        data["sv"].append({"variantcaller": "manta",
                           "vrn_file": effects_vcf or sample_file})
        out.append(data)
    return out
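_get_out_file is not part of this snippet. Manta writes fixed result names under results/variants inside its run directory, so a plausible reconstruction looks like this (the paired attribute names are assumptions):
def _get_out_file(work_dir, paired):
    # hypothetical sketch: choose the Manta result VCF by analysis type
    if paired:
        if paired.normal_bam:
            fname = "somaticSV.vcf.gz"
        else:
            fname = "tumorSV.vcf.gz"
    else:
        fname = "diploidSV.vcf.gz"
    return os.path.join(work_dir, "results", "variants", fname)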
Example #35
def picard_mark_duplicates(picard, align_bam, remove_dups=False):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not file_exists(dup_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, dup_bam,
                                  dup_metrics) as (tx_dup_bam, tx_dup_metrics):
                opts = [("INPUT", align_bam), ("OUTPUT", tx_dup_bam),
                        ("TMP_DIR", tmp_dir),
                        ("REMOVE_DUPLICATES",
                         "true" if remove_dups else "false"),
                        ("METRICS_FILE", tx_dup_metrics)]
                if picard.get_picard_version("MarkDuplicates") >= 1.82:
                    opts += [("PROGRAM_RECORD_ID", "null")]
                picard.run("MarkDuplicates",
                           opts,
                           memscale={
                               "direction": "decrease",
                               "magnitude": 2
                           })
    return dup_bam, dup_metrics
Example #36
def picard_fastq_to_bam(picard,
                        fastq_one,
                        fastq_two,
                        out_dir,
                        names,
                        order="queryname"):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    out_bam = os.path.join(
        out_dir,
        "%s-fastq.bam" % os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with tx_tmpdir(picard._config) as tmp_dir:
            with file_transaction(picard._config, out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one), ("READ_GROUP_NAME", names["rg"]),
                        ("SAMPLE_NAME", names["sample"]),
                        ("PLATFORM_UNIT", names["pu"]),
                        ("PLATFORM", names["pl"]), ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam), ("SORT_ORDER", order)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam
Example #37
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} ")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
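index_transcriptome is referenced but not defined in this excerpt; a rough sketch, under the assumption that a transcriptome FASTA is derived via the create_combined_fasta helper seen in other examples and then indexed with bowtie2-build:
def index_transcriptome(gtf_file, ref_file, data):
    # hypothetical sketch: build a bowtie2 index over transcript sequences
    gtf_fa = sailfish.create_combined_fasta(data)
    index_dir = os.path.splitext(gtf_fa)[0]
    if not utils.file_exists(index_dir + ".1.bt2"):
        bowtie2_build = config_utils.get_program("bowtie2-build", data["config"])
        cmd = "{bowtie2_build} --offrate 1 {gtf_fa} {index_dir}".format(**locals())
        do.run(cmd, "Building bowtie2 index of the transcriptome.")
    return index_dir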
Example #38
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example #39
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "-T", "VariantFiltration", "-R",
                tz.get_in(["reference", "fasta", "base"],
                          data), "--variant", vcf_file, "--out", tx_out_file,
                "--genotypeFilterName", name, "--genotypeFilterExpression",
                "'%s'" % expression
            ]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            cmd = [config_utils.get_program("gatk-framework", data["config"])
                   ] + jvm_opts + params
            do.run(cmd, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
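As a usage sketch (the expression, tag name and extension here are hypothetical), marking genotypes below 10x depth with an FT tag while leaving the site-level FILTER column untouched:
filtered_vcf = genotype_filter(vcf_file, "DP < 10", data, "lowDP", filterext="-gfilter")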
Example #40
def _prep_sample_cnvs(cnv_file, data):
    """Convert a multiple sample CNV file into a single BED file for a sample.

    Handles matching and fixing names where R converts numerical IDs (1234) into
    strings by adding an X (X1234), and converts other characters into '.'s.
    http://stat.ethz.ch/R-manual/R-devel/library/base/html/make.names.html
    """
    import pybedtools
    sample_name = tz.get_in(["rgnames", "sample"], data)
    def make_names(name):
        return re.sub(r"[^\w.]", '.', name)
    def matches_sample_name(feat):
        return (feat.name == sample_name or feat.name == "X%s" % sample_name or
                feat.name == make_names(sample_name))
    def update_sample_name(feat):
        feat.name = sample_name
        return feat
    sample_file = os.path.join(os.path.dirname(cnv_file), "%s-cnv.bed" % sample_name)
    if not utils.file_exists(sample_file):
        with file_transaction(data, sample_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                pybedtools.BedTool(cnv_file).filter(matches_sample_name).each(update_sample_name).saveas(tx_out_file)
    return sample_file
Example #41
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
Example #42
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
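_libtype_string is defined elsewhere; a minimal sketch of the mapping from strandedness to Salmon library-type codes, assuming bam.is_paired is available and that strandedness takes unstranded/firststrand/secondstrand values:
def _libtype_string(bam_file, strandedness):
    # hypothetical sketch: inward (I) prefix for paired reads, then
    # U/SR/SF for unstranded/first-strand/second-strand protocols
    libtype = "-l I" if bam.is_paired(bam_file) else "-l "
    strand = {"unstranded": "U", "firststrand": "SR",
              "secondstrand": "SF"}.get(strandedness, "U")
    return libtype + strand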
Example #43
def picard_sam_to_bam(picard, align_sam, fastq_bam, ref_file, is_paired=False):
    """Convert SAM to BAM, including unmapped reads from fastq BAM file.
    """
    if align_sam.endswith(".sam"):
        out_bam = "%s.bam" % os.path.splitext(align_sam)[0]
    elif align_sam.endswith("-align.bam"):
        out_bam = "%s.bam" % align_sam.replace("-align.bam", "")
    else:
        raise NotImplementedError("Input format not recognized")
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [
                    ("UNMAPPED", fastq_bam),
                    ("ALIGNED", align_sam),
                    ("OUTPUT", tx_out_bam),
                    ("REFERENCE_SEQUENCE", ref_file),
                    ("TMP_DIR", tmp_dir),
                    ("PAIRED_RUN", ("true" if is_paired else "false")),
                ]
                picard.run("MergeBamAlignment", opts)

    return out_bam
Example #44
def sort_merge(in_file, data, out_dir=None):
    """Sort and merge a BED file, collapsing gene names.
       Output is a 3 or 4 column file (the 4th column values are comma-separated).
    """
    out_file = "%s-sortmerge.bed" % os.path.splitext(in_file)[0]
    bedtools = config_utils.get_program("bedtools", data, default="bedtools")
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        column_opt = ""
        with utils.open_gzipsafe(in_file) as in_handle:
            for line in in_handle:
                if not line.startswith(("#", "track", "browser")):
                    parts = line.split()
                    if len(parts) >= 4:
                        column_opt = "-c 4 -o distinct"
        with file_transaction(data, out_file) as tx_out_file:
            cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
            sort_cmd = get_sort_cmd()
            cmd = ("{cat_cmd} {in_file} | {sort_cmd} -k1,1 -k2,2n | "
                   "{bedtools} merge -i - {column_opt} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sort and merge BED file", data)
    return out_file
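For intuition, a toy example of the collapse on overlapping four-column input (hypothetical coordinates):
# input:                          output (-sortmerge.bed):
# chr1    10    100    geneA  ->  chr1    10    120    geneA,geneB
# chr1    50    120    geneB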
Example #45
def gatk_indel_realignment(runner,
                           align_bam,
                           ref_file,
                           intervals,
                           region=None,
                           out_file=None,
                           deep_coverage=False,
                           config=None):
    """Perform realignment of BAM file in specified regions
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir({"config": config}) as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" %
                            (os.path.basename(align_bam), region))
                cl = gatk_indel_realignment_cl(runner, align_bam, ref_file,
                                               intervals, tmp_dir, region,
                                               deep_coverage)
                cl += ["-o", tx_out_file]
                do.run(cl, "GATK indel realignment", {})
    return out_file
Example #46
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
Example #47
def pizzly(pizzly_path, gtf, gtf_fa, fraglength, cachefile, pizzlydir, fusions,
           samplename, data):
    outdir = os.path.join(pizzlydir, samplename)
    out_stem = os.path.join(outdir, samplename)
    pizzly_gtf = make_pizzly_gtf(gtf, os.path.join(pizzlydir, "pizzly.gtf"),
                                 data)
    sentinel = out_stem + "-flat-filtered.tsv"
    pizzlycalls = out_stem + ".json"
    if not file_exists(pizzlycalls):
        with file_transaction(data, outdir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            tx_out_stem = os.path.join(tx_out_dir, samplename)
            cmd = (
                "{pizzly_path} -k 31 --gtf {pizzly_gtf} --cache {cachefile} "
                "--align-score 2 --insert-size {fraglength} --fasta {gtf_fa} "
                "--output {tx_out_stem} {fusions}")
            message = ("Running pizzly on %s." % fusions)
            do.run(cmd.format(**locals()), message)
    flatfile = out_stem + "-flat.tsv"
    filteredfile = out_stem + "-flat-filtered.tsv"
    flatten_pizzly(pizzlycalls, flatfile, data)
    filter_pizzly(flatfile, filteredfile, data)
    return outdir
Example #48
def _prepare_inputs(ma_fn, bam_file, out_dir):
    """
    Convert to fastq with counts
    """
    fixed_fa = os.path.join(out_dir, "file_reads.fa")
    count_name = dict()
    with file_transaction(fixed_fa) as out_tx:
        with open(out_tx, 'w') as out_handle:
            with open(ma_fn) as in_handle:
                next(in_handle)  # skip the header line
                for line in in_handle:
                    cols = line.split("\t")
                    name_with_counts = "%s_x%s" % (cols[0], sum(map(int, cols[2:])))
                    count_name[cols[0]] = name_with_counts
                    out_handle.write(">%s\n%s\n" % (name_with_counts, cols[1]))
    fixed_bam = os.path.join(out_dir, "align.bam")
    bam_handle = pysam.AlignmentFile(bam_file, "rb")
    with pysam.AlignmentFile(fixed_bam, "wb", template=bam_handle) as out_handle:
        for read in bam_handle.fetch():
            read.query_name = count_name[read.query_name]
            out_handle.write(read)

    return fixed_fa, fixed_bam
Example #49
def _segs_to_vcf(in_file, data):
    """Convert output TitanCNA segs file into bgzipped VCF.
    """
    out_file = "%s.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_exists(out_file + ".gz") and not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(_vcf_header)
                    out_handle.write("\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL",
                                                "FILTER", "INFO", "FORMAT", dd.get_sample_name(data)])
                                     + "\n")
                    header = in_handle.readline().strip().split("\t")
                    for line in in_handle:
                        cur = dict(zip(header, line.strip().split("\t")))
                        svtype = _get_svtype(cur["TITAN_call"])
                        info = ["SVTYPE=%s" % svtype, "END=%s" % cur["End_Position.bp."],
                                "CN=%s" % cur["Copy_Number"], "MajorCN=%s" % cur["MajorCN"],
                                "MinorCN=%s" % cur["MinorCN"], "FOLD_CHANGE_LOG=%s" % cur["Median_logR"]]
                        out = [cur["Chromosome"], cur["Start_Position.bp."], ".", "N", "<%s>" % svtype, ".",
                               ".", ";".join(info), "GT", "0/1"]
                        out_handle.write("\t".join(out) + "\n")
    return vcfutils.bgzip_and_index(out_file, data["config"])
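_get_svtype is defined elsewhere; a plausible sketch of the mapping from TitanCNA call labels to VCF SVTYPE values (the exact label set is an assumption):
def _get_svtype(call):
    # hypothetical sketch: loss states map to DEL, gain states to DUP,
    # anything else (including copy-neutral LOH) to a generic CNV
    if call in set(["HOMD", "DLOH", "HETD"]):
        return "DEL"
    elif call in set(["GAIN", "ASCNA", "BCNA", "UBCNA"]):
        return "DUP"
    else:
        return "CNV"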
Example #50
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--sparkMaster", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file, "--bqsr_recal_file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(cores > 1 and gatk_type == "gatk4"))
    bam.index(out_file, data["config"])
    return out_file
Example #51
def _disambiguate_star_fusion_junctions(star_junction_file, contamination_bam, disambig_out_file, data):
    """ Disambiguate detected fusions based on alignments to another species.
    """
    out_file = disambig_out_file
    fusiondict = {}
    with open(star_junction_file, "r") as in_handle:
        for my_line in in_handle:
            my_line_split = my_line.strip().split("\t")
            if len(my_line_split) < 10:
                continue
            fusiondict[my_line_split[9]] = my_line.strip("\n")
    with pysam.Samfile(contamination_bam, "rb") as samfile:
        for my_read in samfile:
            if my_read.is_unmapped or my_read.is_secondary:
                continue
            if my_read.qname in fusiondict:
                fusiondict.pop(my_read.qname)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, 'w') as myhandle:
            for my_key in fusiondict:
                print(fusiondict[my_key], file=myhandle)

    return out_file
Example #52
def run(bam_file, data, out_dir):
    out = {}
    if not tz.get_in(["config", "algorithm", "preseq"], data):
        return out

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, int(samtools_stats["Total_reads"]))
        param_line = "-step {step} -extrap {extrap} -seg_len {seg_len}".format(
            **params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = ("{preseq} lc_extrap -bam -pe {bam_file} "
                   "-o {tx_out_file} {param_line}").format(**locals())
            do.run(cmd, "preseq lc_extrap", data)

    out = _prep_real_counts(bam_file, data, samtools_stats)

    return {"base": stats_file, "metrics": out}
Example #53
def _merge_metrics(yaml_data):
    """
    parse project.yaml file to get metrics for each bam
    """
    project = yaml_data
    out_file = os.path.join("metrics", "metrics.tsv")
    dt_together = []
    with file_transaction(out_file) as out_tx:
        for s in project['samples']:
            m = s['summary']['metrics']
            for me in m:
                if isinstance(m[me], list):
                    m[me] = ":".join(m[me])
            dt = pd.DataFrame(m, index=['1'])
            dt.columns = [
                k.replace(" ", "_").replace("(", "").replace(")", "")
                for k in dt.columns
            ]
            dt['sample'] = s['description']
            dt_together.append(dt)
        dt_together = utils.rbind(dt_together)
        dt_together.to_csv(out_tx, index=False, sep="\t")
    return out_file
Example #54
def _cram_to_fastq_region(cram_file, work_dir, base_name, region, data):
    """Convert CRAM to fastq in a specified region.
    """
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
    out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                          (base_name, rext, fext))
                             for fext in ["s1", "p1", "p2"]]
    if not utils.file_exists(out_p1):
        with file_transaction(data, out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
            cram_file = utils.remote_cl_input(cram_file)
            sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
            cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                   "gz=1 collate=1 colsbs={max_mem} "
                   "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                   "reference={ref_file}")
            if region:
                cmd += " ranges='{region}'"
            do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
    return [[out_p1, out_p2, out_s]]
Example #55
def regions_coverage(data, bed_file, bam_file, target_name, depth_thresholds=None):
    """Generate coverage over regions of interest using sambamba depth.

    sambamba can segfault with multiple threads, so this falls back to a
    single-threaded implementation if the multicore run fails.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage", dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, target_name + "_regions_depth.bed")
    if utils.file_uptodate(out_file, bam_file) and utils.file_uptodate(out_file, bed_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        try:
            cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file, depth_thresholds=depth_thresholds)
            cmdl += " -o " + tx_out_file
            message = "Calculating regions coverage of {target_name} in {bam_file}"
            do.run(cmdl, message.format(**locals()))
        except subprocess.CalledProcessError:
            cmdl = sambamba.make_command(data, "depth region", bam_file, bed_file, depth_thresholds=depth_thresholds,
                                         multicore=False)
            cmdl += " -o " + tx_out_file
            message = "Calculating regions coverage of {target_name} in {bam_file} -- single thread backup"
            do.run(cmdl, message.format(**locals()))
    return out_file
Example #56
def cnvkit_background(background_cnns,
                      out_file,
                      items,
                      target_bed=None,
                      antitarget_bed=None):
    """Calculate background reference, handling flat case with no normal sample.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cmd = [
                _get_cmd(), "reference", "-f",
                dd.get_ref_file(items[0]), "-o", tx_out_file
            ]
            gender = _get_batch_gender(items)
            if gender:
                cmd += ["--sample-sex", gender]
            if len(background_cnns) == 0:
                assert target_bed and antitarget_bed, "Missing CNNs and target BEDs for flat background"
                cmd += ["-t", target_bed, "-a", antitarget_bed]
            else:
                cmd += background_cnns
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background")
    return out_file
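_prep_cmd is not shown in this excerpt; a plausible sketch, assuming it stringifies the argument list and redirects scratch space into the transactional directory (the TMPDIR handling is an assumption):
def _prep_cmd(cmd, tx_out_file):
    # hypothetical sketch: join list commands and keep temporary files
    # alongside the transactional output
    cmd = " ".join(str(x) for x in cmd) if isinstance(cmd, (list, tuple)) else cmd
    return "export TMPDIR=%s && %s" % (os.path.dirname(tx_out_file), cmd)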
Example #57
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            bamtofastq_tmp = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=unsorted "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
Example #58
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for the regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if not (utils.file_uptodate(parse_file, cleaned_bed) and
                utils.file_uptodate(parse_file, in_bam)):
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(
                    data,
                    "depth region",
                    in_bam,
                    cleaned_bed,
                    depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                    max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl,
                       "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file),
                                            sample)
    return os.path.abspath(parse_file)
Example #59
def _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data):
    """Retrieve intervals to run validation on, merging reference and callable BED files.
    """
    a_intervals = get_analysis_intervals(data, vrn_file, base_dir)
    if a_intervals:
        final_intervals = shared.remove_lcr_regions(a_intervals, [data])
        if rm_interval_file:
            caller = _get_caller(data)
            sample = dd.get_sample_name(data)
            combo_intervals = os.path.join(base_dir, "%s-%s-%s-wrm.bed" %
                                           (utils.splitext_plus(os.path.basename(final_intervals))[0],
                                            sample, caller))
            if not utils.file_uptodate(combo_intervals, final_intervals):
                with file_transaction(data, combo_intervals) as tx_out_file:
                    with utils.chdir(os.path.dirname(tx_out_file)):
                        # Copy files locally to avoid issues on shared filesystems
                        # where BEDtools has trouble accessing the same base
                        # files from multiple locations
                        a = os.path.basename(final_intervals)
                        b = os.path.basename(rm_interval_file)
                        for src, dst in [(final_intervals, a), (rm_interval_file, b)]:
                            try:
                                shutil.copyfile(src, dst)
                            except IOError:
                                # retry once after a pause for slow shared filesystems
                                time.sleep(60)
                                shutil.copyfile(src, dst)
                        cmd = ("bedtools intersect -nonamecheck -a {a} -b {b} > {tx_out_file}")
                        do.run(cmd.format(**locals()), "Intersect callable intervals for rtg vcfeval")
            final_intervals = combo_intervals
    else:
        assert rm_interval_file, "No intervals to subset analysis with for %s" % vrn_file
        final_intervals = shared.remove_lcr_regions(rm_interval_file, [data])
    return final_intervals
Example #60
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion.

    Excludes high depth and centromere regions which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0],
                                     "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(
                    shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom,
                                               items[0]))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(
                    sv_exclude_bed, nonamecheck=True).saveas()
            if any(dd.get_coverage_interval(d) == "genome" for d in items):
                want_bedtool = pybedtools.BedTool(
                    shared.remove_highdepth_regions(want_bedtool.saveas().fn,
                                                    items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool,
                                          nonamecheck=True).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file