Example #1
def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    Handles inputs with multiple repeated input files from batches.
    """
    unique_fnames = []
    for f in fnames:
        if f not in unique_fnames:
            unique_fnames.append(f)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    if len(unique_fnames) > 1:
        gemini_vcf = os.path.join(out_dir, "%s-%s.vcf.gz" % (name, caller))
        vrn_file_batch = None
        for variant in data.get("variants", []):
            if variant["variantcaller"] == caller and variant.get("vrn_file_batch"):
                vrn_file_batch = variant["vrn_file_batch"]
        if vrn_file_batch:
            utils.symlink_plus(vrn_file_batch, gemini_vcf)
            return gemini_vcf
        else:
            return vcfutils.merge_variant_files(unique_fnames, gemini_vcf, dd.get_ref_file(data),
                                                data["config"])
    else:
        gemini_vcf = os.path.join(out_dir, "%s-%s%s" % (name, caller, utils.splitext_plus(unique_fnames[0])[1]))
        utils.symlink_plus(unique_fnames[0], gemini_vcf)
        return gemini_vcf
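
The loop above is just order-preserving de-duplication of the input file list. A minimal standalone sketch of the same idea (plain Python, no bcbio imports):

def dedupe_preserving_order(fnames):
    """Drop repeated file names while keeping first-seen order."""
    seen = set()
    unique = []
    for fname in fnames:
        if fname not in seen:
            seen.add(fname)
            unique.append(fname)
    return unique

print(dedupe_preserving_order(["a.vcf", "b.vcf", "a.vcf"]))  # ['a.vcf', 'b.vcf']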
Example #2
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(raw_file) as tx_raw_file:
                    ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                    cmd = [
                        sys.executable,
                        tovcf_script,
                        "-c",
                        sconfig_file,
                        "-f",
                        ref_file,
                        "-b",
                        bedpe_file,
                        "-o",
                        tx_raw_file,
                    ]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            prep_file = vcfutils.sort_by_ref(raw_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file
Example #3
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
Example #4
def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for PURPLE inputs"
    from bcbio.heterogeneity import bubbletree

    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"], vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
        pcf_file = out_file + ".pcf"
        if not utils.file_exists(pcf_file):
            with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
                r_file = os.path.join(os.path.dirname(tx_out_file), "bafSegmentation.R")
                with open(r_file, "w") as out_handle:
                    out_handle.write(_amber_seg_script)
                cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(), utils.Rscript_cmd(), r_file,
                                                          out_file, pcf_file)
                do.run(cmd, "PURPLE: AMBER baf segmentation")
    else:
        assert method == "pon"
        out_file = _run_amber(paired, work_dir)
    return out_file
Example #5
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    out_file = os.path.join(work_dir, "%s-noextras.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
            cores = dd.get_cores(data)
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                   """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                   """cleanbam.fix_header("{ref_file}")' | """
                   "samtools view -@ {cores} -u - | "
                   "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
            do.run(cmd.format(**locals()), "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
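
The helper _target_chroms_and_header is not shown here; based on the docstring ("non chr1-22,X,Y"), a hypothetical sketch of the contig filter it implies could look like:

def is_standard_contig(name):
    """Guess whether a contig is one of chr1-22, X, Y (with or without a 'chr' prefix)."""
    base = name[3:] if name.lower().startswith("chr") else name
    return base.upper() in {str(i) for i in range(1, 23)} | {"X", "Y"}

print([c for c in ["chr1", "chrX", "chrUn_gl000220", "MT"] if is_standard_contig(c)])
# ['chr1', 'chrX']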
Example #6
def cram_compress(in_bam):
    import os
    import subprocess
    from bcbio import utils
    from bcbio.distributed.transaction import file_transaction
    print(in_bam)
    ref_file = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"
    out_file = "%s.cram" % os.path.splitext(in_bam)[0]
    jvm_opts = "-Xms1g -Xmx3g"
    if not utils.file_exists(out_file):
        print "cramming", out_file
        with file_transaction(out_file) as tx_out_file:
            cmd = ("cramtools {jvm_opts} cram "
                   "--input-bam-file {in_bam} "
                   "--capture-all-tags "
                   "--ignore-tags 'BD:BI' "
                   "--reference-fasta-file {ref_file} "
                   "--lossy-quality-score-spec '*8' "
                   "--output-cram-file {tx_out_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if not utils.file_exists(out_file + ".crai"):
        print "indexing", out_file + ".crai"
        with file_transaction(out_file + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(out_file, tx_in_file)
            cmd = ("cramtools {jvm_opts} index "
                   "--input-file {tx_in_file}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
    if os.path.exists(in_bam) and utils.file_exists(out_file):
        if in_bam != out_file and in_bam.endswith(".bam") and out_file.endswith(".cram"):
            os.remove(in_bam)
    return out_file
Example #7
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()), "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()), "Backup single thread index of BAM file with samtools: %s"
                       % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
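
utils.file_uptodate drives the skip-if-current logic above; conceptually it reduces to a modification-time comparison, roughly as follows (a sketch, not bcbio's implementation):

import os

def file_uptodate_sketch(fname, cmp_fname):
    """True if fname exists and is at least as new as cmp_fname."""
    return (os.path.exists(fname) and os.path.exists(cmp_fname)
            and os.path.getmtime(fname) >= os.path.getmtime(cmp_fname))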
Example #8
def link_bam_file(orig_file, new_dir):
    """Provide symlinks of BAM file and existing indexes.
    """
    new_dir = utils.safe_makedir(new_dir)
    sym_file = os.path.join(new_dir, os.path.basename(orig_file))
    utils.symlink_plus(orig_file, sym_file)
    return sym_file
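
utils.symlink_plus links a file together with its companion indexes. A rough standalone approximation for BAM inputs (assuming only .bai-style indexes; bcbio's helper covers more cases):

import os

def symlink_bam_with_indexes(orig_file, sym_file):
    """Symlink a BAM plus any existing .bam.bai/.bai index files next to it."""
    pairs = [(orig_file, sym_file),
             (orig_file + ".bai", sym_file + ".bai"),
             (os.path.splitext(orig_file)[0] + ".bai", os.path.splitext(sym_file)[0] + ".bai")]
    for src, dst in pairs:
        if os.path.exists(src) and not os.path.lexists(dst):
            os.symlink(os.path.abspath(src), dst)
    return sym_file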
Example #9
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.request.urlopen(url).info().get("Last-Modified")
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace("minikraken", "old"))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]), os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print "You have the latest version %s." % last_mod
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." % os.path.join(tooldir, "bin", "kraken"))
Example #10
def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files: assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        do_phasing = data["config"]["algorithm"].get("phasing", False)
        call_file = "%s-raw%s" % utils.splitext_plus(out_file) if do_phasing else out_file
        call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file)
        if do_phasing == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
            utils.symlink_plus(call_file, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]
Example #11
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
Example #12
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
Example #13
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            samtools_cmd = "{samtools} index {tx_bam_file}"
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {in_bam} {tx_index_file}"
            else:
                cmd = samtools_cmd
            # sambamba has intermittent multicore failures. Allow
            # retries with single core
            try:
                do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam),
                       log_error=False)
            except Exception:
                do.run(samtools_cmd.format(**locals()),
                       "Index BAM file (single core): %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
Example #14
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
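
The tx_tmpdir/chdir/glob/move combination above is a general pattern for tools that scatter outputs into the working directory. A simplified stdlib-only sketch of that pattern (run_fn is a hypothetical callable standing in for the tool invocation):

import glob
import os
import shutil
import tempfile

def run_in_tmpdir_and_collect(run_fn, pattern, dest_dir):
    """Run a callable inside a temporary working dir, then move files matching pattern to dest_dir."""
    tmp_dir = tempfile.mkdtemp()
    cur_dir = os.getcwd()
    try:
        os.chdir(tmp_dir)
        run_fn()
        for fname in glob.glob(pattern):
            shutil.move(fname, dest_dir)
    finally:
        os.chdir(cur_dir)
        shutil.rmtree(tmp_dir, ignore_errors=True)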
Example #15
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(data["group_orig"]):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                if "combine" in data:
                    sub_data["combine"] = data["combine"]
                sub_data["vrn_file"] = data["vrn_file"]
            out.append([sub_data])
        return out
    # population or single sample
    else:
        out = []
        for sub_data in data["group_orig"]:
            sub_vrn_file = data["vrn_file"].replace(data["group"][0] + "-", sub_data["name"][-1] + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], sub_data["name"][-1], sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            if "combine" in data:
                sub_data["combine"] = data["combine"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out
Example #16
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
Example #17
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=items[0]["config"],
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
Example #18
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    safe_makedir(os.path.dirname(out_file))
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_fns = get_variantcallers()
    caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
    if isinstance(data["work_bam"], basestring):
        align_bams = [data["work_bam"]]
        items = [data]
    else:
        align_bams = data["work_bam"]
        items = data["work_items"]
    call_file = "%s-raw%s" % os.path.splitext(out_file)
    call_file = caller_fn(align_bams, items, sam_ref,
                          data["genome_resources"]["variation"],
                          region, call_file)
    if data["config"]["algorithm"].get("phasing", False) == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
    utils.symlink_plus(call_file, out_file)
    if "work_items" in data:
        del data["work_items"]
    data["vrn_file"] = out_file
    return [data]
Example #19
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects,
                                       nonrefonly=nonrefonly,
                                       work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
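
The "%s-nomultiallelic%s" % utils.splitext_plus(in_file) naming relies on an extension split that keeps compound suffixes such as .vcf.gz together. A simple stand-in with the same behavior for gzipped files (bcbio's splitext_plus handles additional suffixes):

import os

def splitext_gz_aware(fname):
    """Split off the extension, keeping compound suffixes like .vcf.gz intact."""
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

print("%s-nomultiallelic%s" % splitext_gz_aware("sample.vcf.gz"))  # sample-nomultiallelic.vcf.gz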
Example #20
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(get_orig_items(data)):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # joint calling, do not split back up due to potentially large sample sizes
    elif tz.get_in(("config", "algorithm", "jointcaller"), data):
        return [[data]]
    # population or single sample
    else:
        out = []
        for sub_data in get_orig_items(data):
            sub_vrn_file = data["vrn_file"].replace(str(data["group"][0]) + "-", str(sub_data["name"][-1]) + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"], str(sub_data["name"][-1]), sub_vrn_file, data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out
Example #21
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
Example #22
def _get_samples_to_process(fn, out_dir, config, force_single):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Space finds in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please, avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0] for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn, 'anno': items[0][2:],
                            'config': config, 'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
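
Stripped of the merge-function selection and GSM handling, the grouping step in the function above amounts to collecting CSV rows per sample name, e.g.:

import csv
from collections import defaultdict

def group_files_by_sample(csv_path):
    """Group 'file,name[,annotation...]' CSV rows into {sample_name: [file, ...]} (simplified sketch)."""
    samples = defaultdict(list)
    with open(csv_path) as handle:
        for row in csv.reader(handle):
            if len(row) >= 2 and row[0].strip():
                samples[row[1]].append(row[0])
    return dict(samples)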
Example #23
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Example #24
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    highdepth_beds = [x for x in set(tz.get_in(["config", "algorithm", "highdepth_regions"], d) for d in items)
                      if x is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
Example #25
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #26
def run(items):
    assert len(items) == 1, ("Expect one input to biological prioritization: %s" %
                             ", ".join([dd.get_sample_name(d) for d in items]))
    data = items[0]
    inputs = []
    for call in data.get("sv", []):
        vcf_file = call.get("vcf_file", call.get("vrn_file"))
        if vcf_file and vcf_file.endswith((".vcf", "vcf.gz")):
            pp_fn = POST_PRIOR_FNS.get(call["variantcaller"])
            if pp_fn:
                pp_fn = pp_fn(call)
            inputs.append((call["variantcaller"], vcf_file, pp_fn))
    if len(inputs) > 0:
        prioritize_by = tz.get_in(["config", "algorithm", "svprioritize"], data)
        if prioritize_by:
            work_dir = _sv_workdir(data)
            priority_files = [_prioritize_vcf(vcaller, vfile, prioritize_by, post_prior_fn, work_dir, data)
                              for vcaller, vfile, post_prior_fn in inputs]
            priority_tsv = _combine_files([xs[0] for xs in priority_files], work_dir, data)
            raw_files = {}
            for svcaller, fname in zip([xs[0] for xs in inputs], [xs[1] for xs in priority_files]):
                clean_fname = os.path.join(os.path.dirname(fname), "%s-%s-prioritized%s" %
                                           (dd.get_sample_name(data), svcaller, utils.splitext_plus(fname)[-1]))
                utils.symlink_plus(fname, clean_fname)
                raw_files[svcaller] = clean_fname
            data["sv"].append({"variantcaller": "sv-prioritize", "vrn_file": priority_tsv,
                               "raw_files": raw_files})
    # Disabled on move to CWL, not used and tested with CNVkit changes
    # data = _cnv_prioritize(data)
    return [data]
Example #27
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "cutadapt with this %s for %s" %(options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #28
def index_cram(fname):
    out_file = "%s.crai" % fname
    if not utils.file_exists(out_file):
        print "Indexing", fname
        with file_transaction(out_file) as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(fname, tx_in_file)
            subprocess.check_call(["cram_index", tx_in_file])
    return out_file
Example #29
def create_tumor_bamdir(tumor, tumor_bam, normal_bam, work_dir):
    """Create expected input directory with tumor/normal BAMs in one place.
    """
    bam_dir = utils.safe_makedir(os.path.join(work_dir, tumor, "in_bams"))
    normal_bam_ready = os.path.join(bam_dir, os.path.basename(normal_bam))
    utils.symlink_plus(normal_bam, normal_bam_ready)
    tumor_bam_ready = os.path.join(bam_dir, os.path.basename(tumor_bam))
    utils.symlink_plus(tumor_bam, tumor_bam_ready)
    return bam_dir, normal_bam_ready
Example #30
def index(in_cram, config):
    """Ensure CRAM file has a .crai index file using cram_index from scramble.
    """
    if not utils.file_exists(in_cram + ".crai"):
        with file_transaction(config, in_cram + ".crai") as tx_out_file:
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_cram, tx_in_file)
            cmd = "cram_index {tx_in_file}"
            subprocess.check_call(cmd.format(**locals()), shell=True)
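
cram_index always writes the .crai next to its input, so the example symlinks the CRAM into the transaction directory to keep the index on the transactional path. The same trick without bcbio's file_transaction (assumes cram_index is on PATH):

import os
import shutil
import subprocess
import tempfile

def index_cram_via_tmpdir(in_cram):
    """Index a CRAM by symlinking it into a temp dir so the .crai lands there first."""
    out_file = in_cram + ".crai"
    tmp_dir = tempfile.mkdtemp()
    try:
        tx_in_file = os.path.join(tmp_dir, os.path.basename(in_cram))
        os.symlink(os.path.abspath(in_cram), tx_in_file)
        subprocess.check_call(["cram_index", tx_in_file])
        shutil.move(tx_in_file + ".crai", out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
    return out_file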
Example #31
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.request.urlopen(url).info().get('Last-Modified')
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace('minikraken', 'old'))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(
                    ["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print("You have the latest version %s." % last_mod)
    else:
        raise argparse.ArgumentTypeError(
            "kraken not installed in tooldir %s." %
            os.path.join(tooldir, "bin", "kraken"))
Example #32
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": covinfo.raw_callable,
            "sample_callable": covinfo.callable,
            "mapped_stats": readstats.get_cache_file(data)
        }
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    elif dd.get_variant_regions(data):
        callable_region_bed, nblock_bed = \
            callable.block_regions(dd.get_variant_regions(data), bam_file, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": dd.get_variant_regions(data),
            "sample_callable": dd.get_variant_regions(data)
        }
    return [[data]]
Example #33
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(
            index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(
            alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(
                    cmd.format(**locals()),
                    "Index BAM file with sambamba: %s" %
                    os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(
                    cmd.format(**locals()),
                    "Backup single thread index of BAM file with samtools: %s"
                    % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #34
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #35
def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 1Mb.

    Larger sections have high memory usage on VarDictJava and failures
    on VarDict. This creates minimum windows from the input BED file
    to avoid these issues. Downstream VarDict merging sorts out any
    variants across windows.
    """
    max_size = 1e6
    overlap_size = 250
    def _has_larger_regions(f):
        return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if _has_larger_regions(in_file):
            with file_transaction(data, out_file) as tx_out_file:
                pybedtools.BedTool().window_maker(w=max_size,
                                                  s=max_size - overlap_size,
                                                  b=pybedtools.BedTool(in_file)).saveas(tx_out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
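
The docstring above describes splitting oversized regions into overlapping windows; as a standalone sketch (assuming pybedtools is installed and `regions.bed` is a hypothetical input BED), the same windowing looks like this:

import pybedtools

max_size = int(1e6)
overlap_size = 250

regions = pybedtools.BedTool("regions.bed")
if any(r.stop - r.start > max_size for r in regions):
    # window_maker tiles each interval with windows of width w and step s,
    # so consecutive windows overlap by max_size - s = overlap_size bases
    chunked = pybedtools.BedTool().window_maker(
        w=max_size, s=max_size - overlap_size, b=regions)
    chunked.saveas("regions-regionlimit.bed")
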
Beispiel #36
0
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = _get_grabix_index(in_file)
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not gbi_file or _is_partial_index(gbi_file):
        if gbi_file:
            utils.remove_safe(gbi_file)
        else:
            gbi_file = in_file + ".gbi"
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file], "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
Beispiel #37
0
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    params = {
        "min_coverage_for_downsampling": 10,
        "max_downsample_multiplier": 200
    }
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": callable_bed,
            "sample_callable": covinfo.callable,
            "coverage_depth_bed": covinfo.depth
        }
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed)
                and not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"][
                "variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Beispiel #38
0
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(
            ".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.callable, bam_file_ready, ref_file, data)
        vrs_file = dd.get_variant_regions_merged(data)
        offtarget_stats = callable.calculate_offtarget_stats(
            bam_file_ready, data, vrs_file, "variant_regions")
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": callable_bed,
            "highdepth": covinfo.highdepth,
            "sample_callable": covinfo.callable,
            "coverage_bed": covinfo.coverage,
            "avg_coverage": covinfo.avg_coverage,
            "offtarget_stats": offtarget_stats
        }
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed)
                and not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"][
                "variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
Beispiel #39
0
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(items[0], raw_file) as tx_raw_file:
                    cmd = [
                        sys.executable, tovcf_script, "-c", sconfig_file, "-f",
                        dd.get_ref_file(items[0]), "-t", "LUMPY", "-b",
                        bedpe_file, "-o", tx_raw_file
                    ]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            prep_file = vcfutils.sort_by_ref(raw_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file
Beispiel #40
0
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    # materialize as a list so encode_bed can be appended below (filter is lazy on Python 3)
    highdepth_beds = list(filter(
        lambda x: x is not None,
        set(tz.get_in(["config", "algorithm", "highdepth_regions"], x)
            for x in items)))
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() +
                                             "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
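
The heart of the removal is a BED subtraction; a minimal pybedtools sketch with hypothetical `calls.bed` and `exclude.bed` inputs (the latter standing in for the merged high depth plus ENCODE blacklist regions) would be:

import pybedtools

calls = pybedtools.BedTool("calls.bed")
exclude = pybedtools.BedTool("exclude.bed")
# subtract removes any portion of a call interval overlapping the exclude set;
# nonamecheck mirrors the -nonamecheck flag used in the command above
cleaned = calls.subtract(exclude, nonamecheck=True)
cleaned.saveas("calls-glimit.bed")
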
Beispiel #41
0
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean",
                     dd.get_sample_name(data)))
    out_file = os.path.join(
        work_dir,
        "%s-noextras.bam" % utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(in_bam))
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = (
                "samtools view -h {local_bam} {str_chroms} | "
                """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                """cleanbam.fix_header("{ref_file}")' | """
                "samtools view -u - | "
                "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - "
            )
            do.run(
                cmd.format(**locals()),
                "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
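
Setting aside the header rewriting and read group fixes, the contig restriction itself can be sketched with pysam alone. This is a hedged approximation with hypothetical file names, assuming the input BAM is indexed; unlike the function above, it only filters reads and leaves the extra contigs in the output header:

import pysam

def keep_standard_contigs(in_bam, out_bam):
    """Copy reads on chr1-22, X and Y into a new BAM (illustrative helper)."""
    names = [str(c) for c in range(1, 23)] + ["X", "Y"]
    keep = set(names) | {"chr%s" % c for c in names}
    with pysam.AlignmentFile(in_bam, "rb") as bam_in:
        with pysam.AlignmentFile(out_bam, "wb", template=bam_in) as bam_out:
            for chrom in (c for c in bam_in.references if c in keep):
                for read in bam_in.fetch(chrom):
                    bam_out.write(read)
    pysam.index(out_bam)
    return out_bam
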
Beispiel #42
0
def _prep_genome(out_dir, data):
    """Create prepped reference directory for pisces.

    Requires a custom GenomeSize.xml file present.
    """
    genome_name = utils.splitext_plus(os.path.basename(
        dd.get_ref_file(data)))[0]
    out_dir = utils.safe_makedir(os.path.join(out_dir, genome_name))
    ref_file = dd.get_ref_file(data)
    utils.symlink_plus(ref_file,
                       os.path.join(out_dir, os.path.basename(ref_file)))
    with open(os.path.join(out_dir, "GenomeSize.xml"), "w") as out_handle:
        out_handle.write('<sequenceSizes genomeName="%s">' % genome_name)
        for c in pysam.AlignmentFile(
                "%s.dict" % utils.splitext_plus(ref_file)[0]).header["SQ"]:
            cur_ploidy = ploidy.get_ploidy([data], region=[c["SN"]])
            out_handle.write(
                '<chromosome fileName="%s" contigName="%s" totalBases="%s" knownBases="%s" '
                'isCircular="false" ploidy="%s" md5="%s"/>' %
                (os.path.basename(ref_file), c["SN"], c["LN"], c["LN"],
                 cur_ploidy, c["M5"]))
        out_handle.write('</sequenceSizes>')
    return out_dir
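
The contig names, lengths and MD5s written into GenomeSize.xml come straight from the sequence dictionary next to the reference FASTA. Reading that dictionary is a small operation on its own; a sketch assuming a hypothetical `genome.dict` produced by Picard or samtools:

import pysam

# A .dict file is a SAM header with only @HD/@SQ lines, so pysam parses it directly.
with pysam.AlignmentFile("genome.dict") as dict_file:
    for sq in dict_file.header["SQ"]:
        print(sq["SN"], sq["LN"], sq.get("M5"))
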
Beispiel #43
0
def _trna_annotation(data):
    """
    use tDRmapper to quantify tRNAs
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna", name))
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable), "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_ref) or not file_exists(tdrmapper):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && perl {tdrmapper} {trna_ref} {in_file}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*mapped*"):
                    shutil.move(filename, work_dir)
    return work_dir
Beispiel #44
0
def _amber_het_file(method, vrn_files, work_dir, paired):
    """Create file of BAFs in normal heterozygous positions compatible with AMBER.

    Two available methods:
      - pon -- Use panel of normals with likely heterozygous sites.
      - variants -- Use pre-existing variant calls, filtered to likely heterozygotes.

    https://github.com/hartwigmedical/hmftools/tree/master/amber
    https://github.com/hartwigmedical/hmftools/blob/637e3db1a1a995f4daefe2d0a1511a5bdadbeb05/hmf-common/src/test/resources/amber/new.amber.baf
    """
    assert vrn_files, "Did not find compatible variant calling files for TitanCNA inputs"
    from bcbio.heterogeneity import bubbletree

    if method == "variants":
        amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
        out_file = os.path.join(
            amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
        prep_file = bubbletree.prep_vrn_file(vrn_files[0]["vrn_file"],
                                             vrn_files[0]["variantcaller"],
                                             work_dir, paired, AmberWriter)
        utils.symlink_plus(prep_file, out_file)
    else:
        assert method == "pon"
        tumor_counts, normal_counts = gatkcnv.heterogzygote_counts(paired)
        out_file = _count_files_to_amber(tumor_counts, normal_counts, work_dir,
                                         paired.tumor_data)
    pcf_file = out_file + ".pcf"
    if not utils.file_exists(pcf_file):
        with file_transaction(paired.tumor_data, pcf_file) as tx_out_file:
            r_file = os.path.join(os.path.dirname(tx_out_file),
                                  "bafSegmentation.R")
            with open(r_file, "w") as out_handle:
                out_handle.write(_amber_seg_script)
            cmd = "%s && %s --no-environ %s %s %s" % (utils.get_R_exports(
            ), utils.Rscript_cmd(), r_file, out_file, pcf_file)
            do.run(cmd, "PURPLE: AMBER baf segmentation")
    return out_file
Beispiel #45
0
def _add_wham_classification(in_file, items):
    """Run WHAM classifier to assign a structural variant type to each call.
    """
    wham_sharedir = os.path.normpath(
        os.path.join(
            os.path.dirname(subprocess.check_output(["which", "WHAM-BAM"]).decode()),
            os.pardir, "share", "wham"))
    minclassfreq = 0.4  # Need at least this much support to classify a variant, otherwise unknown
    out_file = "%s-class%s" % utils.splitext_plus(in_file)
    training_data = os.path.join(wham_sharedir, "WHAM_training_data.txt")
    training_data = _prep_training_data(training_data, out_file, items[0])
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                this_python = sys.executable
                cores = dd.get_cores(items[0])
                cmd = (
                    "{this_python} {wham_sharedir}/classify_WHAM_vcf.py --proc {cores} "
                    "--minclassfreq {minclassfreq} {in_file} {training_data} > {tx_out_file}"
                )
                do.run(cmd.format(**locals()), "Classify WHAM calls")
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
Beispiel #46
0
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if isinstance(data["work_bam"], basestring):
            align_bams = [data["work_bam"]]
            items = [data]
        else:
            align_bams = data["work_bam"]
            items = data["group_orig"]
        call_file = "%s-raw%s" % utils.splitext_plus(out_file)
        call_file = caller_fn(align_bams, items, sam_ref,
                              data["genome_resources"]["variation"],
                              region, call_file)
        if data["config"]["algorithm"].get("phasing", False) == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
        utils.symlink_plus(call_file, out_file)
    data["vrn_file"] = out_file
    return [data]
Beispiel #47
0
def split_variants_by_sample(data):
    """Split a multi-sample call file into inputs for individual samples.

    For tumor/normal paired analyses, do not split the final file and attach
    it to the tumor input.
    """
    # not split, do nothing
    if "group_orig" not in data:
        return [[data]]
    # cancer tumor/normal
    elif vcfutils.get_paired_phenotype(data):
        out = []
        for i, sub_data in enumerate(get_orig_items(data)):
            if vcfutils.get_paired_phenotype(sub_data) == "tumor":
                sub_data["vrn_file"] = data["vrn_file"]
            else:
                sub_data.pop("vrn_file", None)
            out.append([sub_data])
        return out
    # population or single sample
    else:
        out = []
        for sub_data in get_orig_items(data):
            sub_vrn_file = data["vrn_file"].replace(
                str(data["group"][0]) + "-",
                str(sub_data["name"][-1]) + "-")
            if len(vcfutils.get_samples(data["vrn_file"])) > 1:
                vcfutils.select_sample(data["vrn_file"],
                                       str(sub_data["name"][-1]), sub_vrn_file,
                                       data["config"])
            elif not os.path.exists(sub_vrn_file):
                utils.symlink_plus(data["vrn_file"], sub_vrn_file)
            sub_data["vrn_file_batch"] = data["vrn_file"]
            sub_data["vrn_file"] = sub_vrn_file
            out.append([sub_data])
        return out
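
The single-sample extraction that vcfutils.select_sample performs can be approximated with a plain bcftools call. A hedged sketch with hypothetical file and sample names, assuming bcftools and tabix are installed:

import subprocess

def select_sample(multi_vcf, sample, out_vcf):
    """Extract one sample from a multi-sample VCF and index the result."""
    subprocess.check_call(
        ["bcftools", "view", "-s", sample, "-O", "z", "-o", out_vcf, multi_vcf])
    subprocess.check_call(["tabix", "-p", "vcf", out_vcf])
    return out_vcf

# select_sample("batch1-gatk.vcf.gz", "sampleA", "sampleA-gatk.vcf.gz")
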
Beispiel #48
0
def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 20kb.

    VarDict memory usage depends on size of individual windows in the input
    file. This breaks regions into 20kb chunks with 250bp overlaps. 20kb gives
    ~1Gb/core memory usage and the overlaps avoid missing indels spanning a
    gap. Downstream VarDict merging sorts out any variants across windows.

    https://github.com/AstraZeneca-NGS/VarDictJava/issues/64
    """
    max_size = 20000
    overlap_size = 250
    def _has_larger_regions(f):
        return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if _has_larger_regions(in_file):
            with file_transaction(data, out_file) as tx_out_file:
                pybedtools.BedTool().window_maker(w=max_size,
                                                  s=max_size - overlap_size,
                                                  b=pybedtools.BedTool(in_file)).saveas(tx_out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
Beispiel #49
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    adapter = dd.get_adapters(data)
    if trim_reads and adapter:
        adapter = adapter[0]
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
    else:
        logger.debug("Skip trimming for: %s" % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
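
_cmd_cutadapt() is not shown in this listing. As a rough, hedged approximation of what an adapter removal call for small RNA data looks like (hypothetical adapter sequence and file names; the real command carries additional options):

import subprocess

adapter = "TGGAATTCTCGGGTGCCAAGG"     # hypothetical 3' adapter sequence
cmd = ["cutadapt",
       "-a", adapter,                 # 3' adapter to remove
       "--minimum-length", "17",      # drop reads too short to map usefully
       "-o", "sample.clean.fastq.gz",
       "sample.fastq.gz"]
subprocess.check_call(cmd)
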
Beispiel #50
0
def _run_ensemble(batch_id, vrn_files, config_file, base_dir, ref_file, data):
    """Run an ensemble call using merging and SVM-based approach in bcbio.variation
    """
    out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
    out_bed_file = os.path.join(base_dir,
                                "{0}-callregions.bed".format(batch_id))
    work_dir = "%s-work" % os.path.splitext(out_vcf_file)[0]
    if not utils.file_exists(out_vcf_file):
        _bcbio_variation_ensemble(vrn_files, out_vcf_file, ref_file,
                                  config_file, base_dir, data)
        if not utils.file_exists(out_vcf_file):
            base_vcf = glob.glob(
                os.path.join(work_dir, "prep", "*-cfilter.vcf"))[0]
            utils.symlink_plus(base_vcf, out_vcf_file)
    if not utils.file_exists(out_bed_file):
        multi_beds = glob.glob(
            os.path.join(work_dir, "prep", "*-multicombine.bed"))
        if len(multi_beds) > 0:
            utils.symlink_plus(multi_beds[0], out_bed_file)
    return {
        "variantcaller": "ensemble",
        "vrn_file": out_vcf_file,
        "bed_file": out_bed_file if os.path.exists(out_bed_file) else None
    }
Beispiel #51
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert:
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        utils.symlink_plus(in_file, out_file)
    return out_file
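
When the input is plain gzip and needs to become bgzip (needs_gunzip and needs_bgzip both true), _bgzip_file ultimately re-compresses the stream. A minimal sketch of that conversion with hypothetical file names, assuming bgzip is on PATH:

import subprocess

def gzip_to_bgzip(in_gz, out_bgz):
    """Re-compress a standard gzip FASTQ as bgzip so it can be indexed with grabix."""
    cmd = "gunzip -c {} | bgzip -c > {}".format(in_gz, out_bgz)
    subprocess.check_call(cmd, shell=True)
    return out_bgz

# gzip_to_bgzip("sample_1.fastq.gz", "sample_1.bgzip.fastq.gz")
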
Beispiel #52
0
def _prepare_smoove_bams(full_bams, sr_bams, disc_bams, items, tx_work_dir):
    """Prepare BAMs for smoove, linking in pre-existing split/disc BAMs if present.

    Smoove can use pre-existing discordant and split BAMs prepared by samblaster
    if present as $sample.split.bam and $sample.disc.bam.
    """
    input_dir = utils.safe_makedir(tx_work_dir)
    out = []
    for full, sr, disc, data in zip(full_bams, sr_bams, disc_bams, items):
        if sr and disc:
            new_full = os.path.join(input_dir, "%s.bam" % dd.get_sample_name(data))
            new_sr = os.path.join(input_dir, "%s.split.bam" % dd.get_sample_name(data))
            new_disc = os.path.join(input_dir, "%s.disc.bam" % dd.get_sample_name(data))
            utils.symlink_plus(full, new_full)
            utils.symlink_plus(sr, new_sr)
            utils.symlink_plus(disc, new_disc)
            out.append(new_full)
        else:
            out.append(full)
    return out
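
Because smoove discovers split and discordant reads purely through the $sample.split.bam / $sample.disc.bam naming convention, the linking step only needs consistently named symlinks in one directory. A toy sketch with hypothetical paths:

import os

def link_for_smoove(full_bam, sr_bam, disc_bam, sample, work_dir):
    """Symlink the three BAMs into work_dir using the names smoove expects."""
    os.makedirs(work_dir, exist_ok=True)
    links = {os.path.join(work_dir, "%s.bam" % sample): full_bam,
             os.path.join(work_dir, "%s.split.bam" % sample): sr_bam,
             os.path.join(work_dir, "%s.disc.bam" % sample): disc_bam}
    for link, target in links.items():
        if not os.path.lexists(link):
            os.symlink(os.path.abspath(target), link)
    return os.path.join(work_dir, "%s.bam" % sample)
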
Beispiel #53
0
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info("Adapter is set up in config file, but trim_reads is not true."
                    "If you want to skip trimming, skip adapter option from config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options found in YAML file."
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "atropos with this parameters %s for %s" %(options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Beispiel #54
0
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats
    """
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(
        purple_dir, "%s.purple.cnv" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = [
                "PURPLE", "-amber",
                os.path.dirname(het_file), "-baf", het_file, "-cobalt",
                os.path.dirname(depth_file), "-gc_profile",
                dd.get_variation_resources(paired.tumor_data)["gc_profile"],
                "-output_dir",
                os.path.dirname(tx_out_file), "-ref_genome", "hg38" if
                dd.get_genome_build(paired.tumor_data) == "hg38" else "hg19",
                "-run_dir", work_dir, "-threads",
                dd.get_num_cores(paired.tumor_data), "-tumor_sample",
                dd.get_sample_name(paired.tumor_data), "-ref_sample",
                dd.get_sample_name(paired.normal_data)
            ]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(purple_dir, f))
    out_file_export = os.path.join(
        purple_dir,
        "%s-purple-cnv.tsv" % (dd.get_sample_name(paired.tumor_data)))
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {
        "variantcaller": "purple",
        "call_file": out_file_export,
        "plot": {},
        "metrics": {}
    }
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(
            purple_dir, "plot",
            "%s.%s.png" % (dd.get_sample_name(paired.tumor_data), ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(
        purple_dir, "%s.purple.purity" % dd.get_sample_name(paired.tumor_data))
    with open(purity_file) as in_handle:
        header = in_handle.readline().replace("#", "").split("\t")
        vals = in_handle.readline().split("\t")
        for h, v in zip(header, vals):
            try:
                v = float(v)
            except ValueError:
                pass
            out["metrics"][h] = v
    return out
Beispiel #55
0
def _get_samples_to_process(fn, out_dir, config, force_single, separators):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            if l.find("description") > 0:
                logger.info("Skipping header.")
                continue
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]) or is_srr(
                        cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Space finds in %s. Linked to %s." %
                                       (cols[0], new_name))
                        logger.warning(
                            "Please, avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        elif is_srr(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [
            os.path.abspath(fn_file[0])
            if utils.file_exists(fn_file[0]) else fn_file[0]
            for fn_file in items
        ]
        samples[sample] = [{
            'files':
            _check_paired(files, force_single, separators),
            'out_file':
            os.path.join(out_dir, sample + ext),
            'fn':
            fn,
            'anno':
            items[0][2:],
            'config':
            config,
            'name':
            sample,
            'out_dir':
            out_dir
        }]
    return [samples[sample] for sample in samples]
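
The CSV this parser expects has one input file (or GSM/SRR accession) per line, followed by a sample description and optional annotation columns; rows sharing a description are merged. A hedged, self-contained sketch of the grouping with hypothetical paths and names:

from collections import defaultdict

csv_text = """\
samplename,description,condition
/data/run1/sampleA_L001.fastq.gz,sampleA,tumor
/data/run1/sampleA_L002.fastq.gz,sampleA,tumor
SRR1234567,sampleB,normal
"""

samples = defaultdict(list)
for line in csv_text.splitlines():
    if "description" in line:   # skip the header row
        continue
    cols = line.strip().split(",")
    samples[cols[1]].append(cols)
# sampleA has two FASTQ lanes queued for merging; sampleB is an SRA accession
print(dict(samples))
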
Beispiel #56
0
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Beispiel #57
0
def _run_vardict_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect SNPs and indels with VarDict.

    var2vcf_valid uses -A flag which reports all alleles and improves sensitivity:
    https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=False)
            num_bams = len(align_bams)
            sample_vcf_names = [
            ]  # for individual sample names, given batch calling may be required
            for bamfile, item in zip(align_bams, items):
                # prepare commands
                sample = dd.get_sample_name(item)
                vardict = get_vardict_command(items[0])
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = (" ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                        if _is_bed_file(target) else "")
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if tx_out_file.endswith(
                    "gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads of supporting depth
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                setup = (
                    "unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && "
                    % os.path.dirname(utils.Rscript_cmd()))
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {sample} -b {bamfile} {opts} "
                    "| {strandbias}"
                    "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} "
                    """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                    "| bcftools filter -i 'QUAL >= 0' "
                    "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}"
                )
                if num_bams > 1:
                    temp_file_prefix = raw_file.replace(".gz", "").replace(
                        ".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if raw_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        if not _is_bed_file(target):
                            vcfutils.write_empty_vcf(tx_tmp_file,
                                                     config,
                                                     samples=[sample])
                        else:
                            cmd += " > {tx_tmp_file}"
                            do.run(cmd.format(**locals()),
                                   "Genotyping with VarDict: Inference", {})
                else:
                    if not _is_bed_file(target):
                        vcfutils.write_empty_vcf(tx_out_file,
                                                 config,
                                                 samples=[sample])
                    else:
                        cmd += " > {tx_out_file}"
                        do.run(cmd.format(**locals()),
                               "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(
                    orig_files=sample_vcf_names,
                    out_file=tx_out_file,
                    ref_file=ref_file,
                    config=config,
                    region=bamprep.region_to_gatk(region))
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0],
                                 out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file
Beispiel #58
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        raw_file = "%s-raw%s" % utils.splitext_plus(out_file)
        with file_transaction(items[0], raw_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require at least 50 reads of supporting depth
                var2vcf_opts = " -v 50 " if dd.get_avg_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        """| %s -c 'from bcbio.variation import freebayes; """
                        """freebayes.call_somatic("%s", "%s")' """ %
                        (sys.executable, paired.tumor_name,
                         paired.normal_name))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = (
                    "unset R_HOME && unset JAVA_HOME && export PATH=%s:$PATH && "
                    % os.path.dirname(utils.Rscript_cmd()))
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    """| {py_cl} -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "{ref_file}")' """
                    "{freq_filter} "
                    "| bcftools filter -i 'QUAL >= 0' "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
        if assoc_files.get("dbsnp"):
            annotation.add_dbsnp(raw_file, assoc_files["dbsnp"], items[0],
                                 out_file)
        else:
            utils.symlink_plus(raw_file, out_file)
    return out_file
Beispiel #59
0
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, config)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "%s.sam" % out_base)
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.sam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-convert-bam"] = True
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            tophat_runner = sh.Command(
                config_utils.get_program("tophat", config))
            ready_options = {}
            for k, v in options.items():
                ready_options[k.replace("-", "_")] = v
            # tophat requires options before arguments,
            # otherwise it silently ignores them
            tophat_ready = tophat_runner.bake(**ready_options)
            cmd = str(tophat_ready.bake(*files))
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file),
                   None)
        _fix_empty_readnames(out_file)
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.sam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed = merge_unmapped(fixed, unmapped, config)
    fixed = _fix_unmapped(fixed, config, names)
    fixed = bam.sort(fixed, config)
    fixed = bam.bam_to_sam(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
Beispiel #60
0
def mutect_caller(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                   region, out_file_mutect)
        if (not isinstance(region, (list, tuple))
                and not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            return
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                                 out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(base_config)
        if ("scalpel" in indelcaller.lower() and region
                and isinstance(region, (tuple, list))
                and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams,
                                                items,
                                                ref_file,
                                                assoc_files,
                                                region=region,
                                                out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams,
                                                items,
                                                ref_file,
                                                assoc_files,
                                                region=region,
                                                out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=items[0]["sam_ref"],
                    config=items[0]["config"],
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller.lower():
            from bcbio.structural import pindel
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            if pindel.is_installed(items[0]["config"]):
                pindel._run_tumor_pindel_caller(align_bams,
                                                items,
                                                ref_file,
                                                assoc_files,
                                                region=region,
                                                out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=ref_file,
                    config=items[0]["config"],
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller.lower()
               or "sid" in indelcaller.lower())
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file +
                               "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file,
                                           assoc_files, region,
                                           out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file