Example #1
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file,
                                   bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
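Most examples in this collection follow the same transactional pattern: check whether the final output already exists, do all work inside a tx_tmpdir scratch directory, and only move results into place once the command succeeds. A minimal sketch of that pattern, assuming bcbio-style imports and a hypothetical some_tool command:

import os
import shutil

from bcbio import utils
from bcbio.distributed.transaction import tx_tmpdir
from bcbio.provenance import do

def run_tool(data, out_dir):
    """Sketch of the check-output / compute-in-tmpdir / move-into-place pattern."""
    out_file = os.path.join(utils.safe_makedir(out_dir), "result.txt")
    if not utils.file_exists(out_file):
        # Intermediate files land in a temporary directory that is cleaned up
        # automatically; partial results never appear in out_dir.
        with tx_tmpdir(data) as tmp_dir:
            tmp_out = os.path.join(tmp_dir, "result.txt")
            do.run("some_tool -o %s" % tmp_out, "Run some_tool (hypothetical command)")
            shutil.move(tmp_out, out_file)
    return out_file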
Example #2
def run(data):
    """Quantitaive isoforms expression by express"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    tophat_fa = tophat_index.replace("ver", "fa")
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    safe_makedir(out_dir)
    express = config_utils.get_program("express", data['config'])
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    if not file_exists(out_file):
        with tx_tmpdir() as tmp_dir:
            chdir(tmp_dir)
            ref_transcript = _do_fasta(tophat_fa)
            cmd = ("{express} {ref_transcript} {in_bam}")
            do.run(cmd.format(**locals()), "Run express", {})
            shutil.move("results.xprs", out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    return (eff_count_file, tpm_file, fpkm_file)
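_get_column is not shown in this example. A plausible sketch, assuming it copies the eXpress target id plus one selected column from results.xprs into a two-column file (the column layout of results.xprs is an assumption here):

import csv

def _get_column(in_file, out_file, column):
    """Hypothetical helper: write (target_id, value) pairs for one column of an eXpress table."""
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        next(reader)  # skip the results.xprs header line
        for line in reader:
            # column 1 is assumed to hold the transcript/target id
            out_handle.write("%s\t%s\n" % (line[1], line[column]))
    return out_file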
Example #3
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example #4
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" % (razers3))
            out_handle.write(CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
                "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf), os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(out_file) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
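CONFIG_TMPL is referenced but not defined in this excerpt. An OptiType config.ini template along these lines would fit the two values substituted above; treat the exact sections and defaults as an assumption rather than the project's actual template:

# Hypothetical config.ini template for the OptiType call above.
CONFIG_TMPL = """[mapping]
razers3={razers3}
threads={cores}

[ilp]
solver=glpk
threads=1

[behavior]
deletebam=true
unpaired_weight=0
use_discordant=false
"""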
Example #5
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example #6
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data)) if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
Example #7
 def run_mutect(self, params, tmp_dir=None):
     with tx_tmpdir(self._config) as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_mutect(params, tmp_dir)
         prog = "MuTect"
         do.run(cl, "MuTect: {0}".format(prog), None)
Example #8
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(
        work_dir, "%s%s.vcf" %
        (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                full_bams = ",".join(full_bams)
                sr_bams = ",".join(sr_bams)
                disc_bams = ",".join(disc_bams)
                exclude = "-x %s" % sv_exclude_bed if utils.file_exists(
                    sv_exclude_bed) else ""
                ref_file = dd.get_ref_file(items[0])
                # use our bcbio python for runs within lumpyexpress
                curpython_dir = os.path.dirname(sys.executable)
                cmd = (
                    "export PATH={curpython_dir}:$PATH && "
                    "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                    "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
Example #9
def _run_toplevel(config,
                  config_file,
                  work_dir,
                  parallel,
                  fc_dir=None,
                  run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel,
                                             config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs,
                                   pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #10
def _callable_intersect(in_file, callable_bed, data):
    """Return list of original VCF SVs intersected by callable regions.

    Does not try to handle BNDs. We should resolve these and return where possible.
    """
    with tx_tmpdir(data) as tmpdir:
        in_bed = os.path.join(
            tmpdir, "%s-convert.bed" %
            utils.splitext_plus(os.path.basename(in_file))[0])
        with utils.open_gzipsafe(in_file) as in_handle:
            with open(in_bed, "w") as out_handle:
                for parts in (l.split("\t") for l in in_handle
                              if not l.startswith("#")):
                    start, end = _get_start_end(parts)
                    if end:
                        out_handle.write("\t".join([parts[0], start, end] +
                                                   parts) + "\n")
        out_file = os.path.join(
            tmpdir, "%s-subset.tsv" %
            utils.splitext_plus(os.path.basename(in_file))[0])
        cmd = "bedtools intersect -a {in_bed} -b {callable_bed} -wa -wb > {out_file}"
        do.run(cmd.format(**locals()), "Intersect VCF by callable")
        with open(out_file) as in_handle:
            for line in in_handle:
                yield line.rstrip().split("\t")[3:]
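_get_start_end is defined elsewhere. A minimal sketch, assuming it turns a split VCF line into 0-based BED coordinates using POS and an END= tag in the INFO field, and signals unhandled records (such as BNDs) by returning no end:

def _get_start_end(parts, info_index=7):
    """Hypothetical helper: (start, end) strings for a structural variant VCF record."""
    start = parts[1]
    end = [x.split("=")[-1] for x in parts[info_index].split(";") if x.startswith("END=")]
    if end:
        # VCF POS is 1-based; BED starts are 0-based
        return str(int(start) - 1), end[0]
    return None, None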
Example #11
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(
                            data, tmpdir, tx_dedup_file, tx_sr_file, tx_disc_file
                        )
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Example #12
def _run_toplevel(config,
                  config_file,
                  work_dir,
                  parallel,
                  fc_dir=None,
                  run_info_yaml=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    logger.info("System YAML configuration: %s" % os.path.abspath(config_file))
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    pipelines, config = _pair_samples_with_pipelines(run_info_yaml, config)
    system.write_info(dirs, parallel, config)
    with tx_tmpdir(config if parallel.get("type") ==
                   "local" else None) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, samples in pipelines.items():
            for xs in pipeline(config, run_info_yaml, parallel, dirs, samples):
                pass
Example #13
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("unset JAVA_HOME && "
                       "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #14
 def test_create_tmpdir_in_a_specified_base_dir(self, mock_io):
     with tx_tmpdir(base_dir='somedir'):
         pass
     transaction.utils.get_abspath.assert_called_once_with(
         'somedir/bcbiotx')
     transaction.utils.safe_makedir.assert_called_once_with(
         transaction.utils.get_abspath.return_value)
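The mock_io fixture is not part of this excerpt. A sketch of what it could patch so tx_tmpdir runs without touching the filesystem; the patched attribute names follow the assertions in these tests, but the fixture itself is hypothetical:

import pytest
from unittest import mock

from bcbio.distributed import transaction

@pytest.fixture
def mock_io():
    """Hypothetical fixture: stub filesystem helpers used inside tx_tmpdir."""
    with mock.patch.object(transaction, "utils") as utils, \
         mock.patch.object(transaction, "tempfile") as tempfile:
        utils.get_abspath.return_value = "/abs/bcbiotx"
        tempfile.mkdtemp.return_value = "/abs/bcbiotx/tmp123"
        yield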
Example #15
    def run_gatk(self, params, tmp_dir=None, log_error=True,
                 data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False):
        """Top level interface to running a GATK command.

        ld_preload injects required libraries for Java JNI calls:
        https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow
        """
        needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
        # For old Java requirements use global java 7
        if needs_java7:
            setpath.remove_bcbiopath()
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
            atype_index = params.index("-T") if params.count("-T") > 0 \
                          else params.index("--analysis_type")
            prog = params[atype_index + 1]
            cl = fix_missing_spark_user(cl, prog, params)
            if ld_preload:
                cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl)
            do.run(cl, "GATK: {0}".format(prog), data, region=region,
                   log_error=log_error)
        if needs_java7:
            setpath.prepend_bcbiopath()
Example #16
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cmd = ["batch"] + test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", raw_work_dir, "--split",
                   "-p", str(tz.get_in(["config", "algorithm", "num_cores"], data, 1)),
                   "--output-reference", os.path.join(raw_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            args = cnvlib_cmd.parse_args(cmd)
            args.func(args)
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example #17
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff  --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) \
                                    else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
                        else out_gff_fn
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
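_get_env is not shown here. A minimal sketch, assuming it builds a shell prefix that puts the current interpreter's bin directory (where mirtop lives) first on PATH:

import os
import sys

def _get_env():
    """Hypothetical helper: export prefix so the command finds mirtop from this environment."""
    anaconda_bin = os.path.dirname(os.path.realpath(sys.executable))
    return "unset PYTHONPATH && export PATH=%s:$PATH && " % anaconda_bin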
Example #18
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(
            hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None,
                                   data["config"])
        if utils.file_exists(hsmetric_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run(
                "sed -i 's/%s.bam//g' %s" %
                (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
Example #19
def piped_bamprep(data, region=None, out_file=None):
    """Perform full BAM preparation using pipes to avoid intermediate disk IO.

    Handles recalibration and realignment of original BAMs.
    """
    data["region"] = region
    if not _need_prep(data):
        return [data]
    else:
        utils.safe_makedir(os.path.dirname(out_file))
        if region[0] == "nochrom":
            prep_bam = shared.write_nochr_reads(data["work_bam"], out_file,
                                                data["config"])
        elif region[0] == "noanalysis":
            prep_bam = shared.write_noanalysis_reads(data["work_bam"],
                                                     region[1], out_file,
                                                     data["config"])
        else:
            if not utils.file_exists(out_file):
                with tx_tmpdir(data) as tmp_dir:
                    _piped_bamprep_region(data, region, out_file, tmp_dir)
            prep_bam = out_file
        bam.index(prep_bam, data["config"])
        data["work_bam"] = prep_bam
        return [data]
Example #20
def _call_hla(hla_fq, out_dir, data):
    """Run OptiType HLA calling for a specific fastq input.
    """
    bin_dir = os.path.dirname(os.path.realpath(sys.executable))
    out_dir = utils.safe_makedir(out_dir)
    with tx_tmpdir(data, os.path.dirname(out_dir)) as tx_out_dir:
        config_file = os.path.join(tx_out_dir, "config.ini")
        with open(config_file, "w") as out_handle:
            razers3 = os.path.join(bin_dir, "razers3")
            if not os.path.exists(razers3):
                raise ValueError("Could not find razers3 executable at %s" %
                                 (razers3))
            out_handle.write(
                CONFIG_TMPL.format(razers3=razers3, cores=dd.get_cores(data)))
        resources = config_utils.get_resources("optitype", data["config"])
        if resources.get("options"):
            opts = " ".join([str(x) for x in resources["options"]])
        else:
            opts = ""
        cmd = ("OptiTypePipeline.py -v --dna {opts} -o {tx_out_dir} "
               "-i {hla_fq} -c {config_file}")
        do.run(cmd.format(**locals()), "HLA typing with OptiType")
        for outf in os.listdir(tx_out_dir):
            shutil.move(os.path.join(tx_out_dir, outf),
                        os.path.join(out_dir, outf))
    out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv"))
    assert len(
        out_file
    ) == 1, "Expected one result file for OptiType, found %s" % out_file
    return out_file[0]
Example #21
def _trna_annotation(data):
    """
    use tDRmapper to quantify tRNAs
    """
    trna_ref = op.join(dd.get_srna_trna_file(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "trna", name))
    in_file = op.basename(data["clean_fastq"])
    tdrmapper = os.path.join(os.path.dirname(sys.executable),
                             "TdrMappingScripts.pl")
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_ref) or not file_exists(tdrmapper):
        logger.info("There is no tRNA annotation to run TdrMapper.")
        return work_dir
    out_file = op.join(work_dir, in_file + ".hq_cs.mapped")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"],
                                   op.join(txdir, in_file))
                cmd = ("{perl_export} && perl {tdrmapper} {trna_ref} {in_file}"
                       ).format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*mapped*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #22
 def run_mutect(self, params, tmp_dir=None):
     with tx_tmpdir(self._config) as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_mutect(params, tmp_dir)
         prog = "MuTect"
         do.run(cl, "MuTect: {0}".format(prog), None)
Example #23
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("samtools", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                     3, "decrease").upper()
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    with file_transaction(data, dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir,
                                                "%s-namesort" % os.path.splitext(os.path.basename(in_bam))[0])
                        cmd = ("{samtools} sort -n -@ {cores} -m {mem} -O sam -T {out_base} {in_bam} | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
Example #24
 def test_makes_unique_tmp_dir(self, mock_io):
     """Test that tx_tmpdir creates a tmp dir unique name
     using `tempfile.mkdtemp` inside the base dir."""
     with tx_tmpdir(None):
         pass
     transaction.tempfile.mkdtemp.assert_called_once_with(
         dir=transaction.utils.get_abspath.return_value)
Example #25
    def run_gatk(self, params, tmp_dir=None, log_error=True,
                 data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False):
        """Top level interface to running a GATK command.

        ld_preload injects required libraries for Java JNI calls:
        https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow
        """
        needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6")
        # For old Java requirements use global java 7
        if needs_java7:
            setpath.remove_bcbiopath()
        with tx_tmpdir(self._config) as local_tmp_dir:
            if tmp_dir is None:
                tmp_dir = local_tmp_dir
            cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc)
            atype_index = params.index("-T") if params.count("-T") > 0 \
                          else params.index("--analysis_type")
            prog = params[atype_index + 1]
            cl = fix_missing_spark_user(cl, prog, params)
            if ld_preload:
                cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl)
            do.run(cl, "GATK: {0}".format(prog), data, region=region,
                   log_error=log_error)
        if needs_java7:
            setpath.prepend_bcbiopath()
Example #26
 def run_gatk(self,
              params,
              tmp_dir=None,
              log_error=True,
              data=None,
              region=None,
              memscale=None,
              parallel_gc=False):
     needs_java7 = LooseVersion(
         self.get_gatk_version()) < LooseVersion("3.6")
     # For old Java requirements use global java 7
     if needs_java7:
         setpath.remove_bcbiopath()
     with tx_tmpdir(self._config) as local_tmp_dir:
         if tmp_dir is None:
             tmp_dir = local_tmp_dir
         cl = self.cl_gatk(params,
                           tmp_dir,
                           memscale=memscale,
                           parallel_gc=parallel_gc)
         atype_index = params.index("-T") if params.count("-T") > 0 \
                       else params.index("--analysis_type")
         prog = params[atype_index + 1]
         do.run(cl,
                "GATK: {0}".format(prog),
                data,
                region=region,
                log_error=log_error)
     if needs_java7:
         setpath.prepend_bcbiopath()
Example #27
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #28
def coverage(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(out_dir)
    if not bed_file:
        return None
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = (
                        "{sambamba} depth region -F \"not unmapped\" -t {cores} "
                        "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                        "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                        "chrom/chrom/' > {out_tx}")
                    do.run(
                        cmd.format(**locals()) % "-C 1000",
                        "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file),
                                            sample)
    return os.path.abspath(parse_file)
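The sambamba command above is filled in two stages: cmd.format(**locals()) resolves the {} placeholders while leaving the %s untouched, and the % "-C 1000" substitution then drops the extra flag into that slot when the command is run. A small, self-contained illustration of the same two-stage substitution (values are made up):

cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} "
       "%s -T 1 -T 5 -L {bed} {bam} > {out}")
cores, bed, bam, out = 4, "regions.bed", "sample.bam", "coverage.tsv"
stage1 = cmd.format(**locals())   # fills {cores}, {bed}, {bam}, {out}; keeps the literal %s
final = stage1 % "-C 1000"        # inserts the extra flag where the %s was left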
Example #29
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #30
def priority_coverage(data, out_dir):
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return data

    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
        with file_transaction(out_file) as tx_out_file:
            parse_cmd = "awk '{print $1\"\t\"$2\"\t\"$2\"\t\"$3\"\t\"$10}' | sed '1d'"
            cmd = ("{sambamba} depth base -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "{in_bam} | {parse_cmd} > {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_file
Example #31
def _run_cnvkit_shared(data,
                       test_bams,
                       background_bams,
                       work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(
        test_bams[0]))[0].split(".")[0]

    background_cnn = "%s_background.cnn" % (background_name
                                            if background_name else "flat")
    files = {
        "cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
        "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
        "back_cnn": os.path.join(raw_work_dir, background_cnn)
    }
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            cov_interval = dd.get_coverage_interval(data)
            raw_target_bed, access_bed = _get_target_access_files(
                cov_interval, data, work_dir)
            # bail out if we ended up with no regions
            if not utils.file_exists(raw_target_bed):
                return {}
            target_bed = annotate.add_genes(raw_target_bed, data)

            # Do not parallelize cnvkit due to current issues with multi-processing
            cores = 1
            # cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
            #             len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_bed] + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            if cov_interval not in ["amplicon", "genome"]:
                at_avg, at_min, t_avg = _get_antitarget_size(
                    access_bed, target_bed)
                if at_avg:
                    cmd += [
                        "--antitarget-avg-size",
                        str(at_avg), "--antitarget-min-size",
                        str(at_min), "--target-avg-size",
                        str(t_avg)
                    ]
            local_sitelib = os.path.join(
                install.get_defaults().get("tooldir", "/usr/local"), "lib",
                "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
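_get_cmd is not defined in this version of the function. Examples #5 and #33 call cnvkit.py from the directory of the running Python executable, so a sketch consistent with those would be:

import os
import sys

def _get_cmd(script="cnvkit.py"):
    """Locate the CNVkit entry point next to the current interpreter (based on Examples #5 and #33)."""
    return os.path.join(os.path.dirname(sys.executable), script)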
Example #32
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example #33
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    if not utils.file_exists(os.path.join(raw_work_dir, "%s.cnr" % out_base)):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            target_bed = tz.get_in(["config", "algorithm", "variant_regions"], data)
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  ["--targets", target_bed, "--access", access_file,
                   "-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    return {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
            "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
            "back_cnn": os.path.join(raw_work_dir, background_cnn)}
Example #34
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes for avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file,
                                    bam.is_paired(in_bam)) as (tobam_cl,
                                                               tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = (
                    "unset JAVA_HOME && "
                    "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                    "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                    "  -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None, [
                    do.file_nonempty(tx_out_file),
                    do.file_reasonable_size(tx_out_file, in_bam)
                ])
    return out_file
Example #35
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(
        os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = (
                    "{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                    "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir}"
                )
                do.run(
                    cmd.format(**locals()),
                    "TitanCNA CNV detection: ploidy %s, cluster %s" %
                    (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(
                    os.path.join(tmp_dir, "Rplots.pdf"),
                    os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
Example #36
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    cleaned_bed = os.path.splitext(
        os.path.basename(bed_file))[0] + ".cleaned.bed"

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
                cleaned_bed = bed.decomment(bed_file, cleaned_bed)
                with file_transaction(parse_file) as out_tx:
                    cmd = (
                        "sambamba depth region -F \"not unmapped\" -t {cores} "
                        "-C 1000 -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                        "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                        "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #37
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir,
               items):
    """Run lumpy-sv, using speedseq pipeline.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(
        work_dir, "%s%s.vcf" %
        (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                full_bams = ",".join(full_bams)
                sr_bams = ",".join(sr_bams)
                disc_bams = ",".join(disc_bams)
                exclude = "-x %s" % sv_exclude_bed if (
                    sv_exclude_bed
                    and utils.file_exists(sv_exclude_bed)) else ""
                ref_file = dd.get_ref_file(items[0])
                depths = []
                for sample, ev_files in previous_evidence.items():
                    for ev_type, ev_file in ev_files.items():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if len(
                    depths) > 0 else ""
                # use our bcbio python for runs within lumpyexpress
                exports = utils.local_path_export()
                cmd = (
                    "{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                    "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
Example #38
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #39
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
Example #40
def _gids_to_genes(gids, ssm_locs, cnv_ssms, data):
    """Convert support ids for SNPs and SSMs into associated genes.
    """
    locs = collections.defaultdict(set)
    for gid in gids:
        cur_locs = []
        try:
            cur_locs.append(ssm_locs[gid])
        except KeyError:
            for ssm_loc in cnv_ssms.get(gid, []):
                cur_locs.append(ssm_locs[ssm_loc])
        for chrom, pos in cur_locs:
            locs[chrom].add(pos)
    genes = set([])
    with tx_tmpdir(data) as tmpdir:
        chrom_prefix = "chr" if next(ref.file_contigs(dd.get_ref_file(data))).name.startswith("chr") else ""
        loc_file = os.path.join(tmpdir, "battenberg_find_genes.bed")
        with open(loc_file, "w") as out_handle:
            for chrom in sorted(locs.keys()):
                for loc in sorted(list(locs[chrom])):
                    out_handle.write("%s%s\t%s\t%s\n" % (chrom_prefix, chrom, loc - 1, loc))
        ann_file = annotate.add_genes(loc_file, data, max_distance=10000)
        for r in pybedtools.BedTool(ann_file):
            for gene in r.name.split(","):
                if gene != ".":
                    genes.add(gene)
    return sorted(list(genes))
Example #41
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    if not dd.get_srna_mint_lookup(data):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
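                # run MINTmap inside the transactional directory, then move its outputs to work_dir below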
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #42
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = [
                    "-T",
                    "PrintReads",
                    "-R",
                    ref_file,
                    "-I",
                    in_bam,
                    "--out",
                    tx_out_file,
                    "--filter_mismatching_base_and_quals",
                    "--filter_bases_not_stored",
                    "--filter_reads_with_N_cigar",
                ]
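                # rescale older Illumina (pre-1.8) quality encodings so GATK accepts them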
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                cmd = [config_utils.get_program("gatk-framework", data["config"])] + jvm_opts + params
                do.run(cmd, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
Example #43
0
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #44
0
def run(data):
    """Proxy function to run the tool"""
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = os.path.join(work_dir, "mirge")
    lib = _find_lib(sample)
    mirge = _find_mirge(sample)
    bowtie = _find_bowtie(sample)
    sps = dd.get_species(sample)
    species = SPS.get(sps, "")
    if not species:
        raise ValueError(
            "species not supported (hsa, mmu, rno, dre, cel, dme): %s" % sps)
    if not lib:
        raise ValueError(
            "-lib option is not set up in resources for mirge tool."
            " Read above warnings lines.")

    if not utils.file_exists(out_dir):
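        # miRge writes into the temporary directory, which is then promoted wholesale to out_dir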
        with tx_tmpdir() as tmp_dir:
            sample_file = _create_sample_file(data, tmp_dir)
            do.run(_cmd().format(**locals()), "Running miRge2.0.")
            shutil.move(tmp_dir, out_dir)
    return [
        os.path.abspath(fn)
        for fn in glob.glob(os.path.join(out_dir, "*", "*"))
    ]
Example #45
0
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
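                    # when restricted to a region, subset the BAMs before building the qSNP input file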
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
Example #46
0
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                         3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
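                # legacy samtools sort syntax: -n name-sorts, -o streams to stdout, {prefix1} is the temporary file prefix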
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
Example #47
0
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for the regions given in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
Example #48
0
def _goleft_indexcov(bam_file, data, out_dir):
    """Use goleft indexcov to estimate coverage distributions using BAM index.

    Only used for whole genome runs as captures typically don't have enough data
    to be useful for index-only summaries.
    """
    if not dd.get_coverage_interval(data) == "genome":
        return []
    out_dir = utils.safe_makedir(os.path.join(out_dir, "indexcov"))
    out_files = [os.path.join(out_dir, "%s-indexcov.%s" % (dd.get_sample_name(data), ext))
                 for ext in ["roc", "ped", "bed.gz"]]
    if not utils.file_uptodate(out_files[-1], bam_file):
        with transaction.tx_tmpdir(data) as tmp_dir:
            tmp_dir = utils.safe_makedir(os.path.join(tmp_dir, dd.get_sample_name(data)))
            gender_chroms = [x.name for x in ref.file_contigs(dd.get_ref_file(data)) if chromhacks.is_sex(x.name)]
            gender_args = "--sex %s" % (",".join(gender_chroms)) if gender_chroms else ""
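            # pass the sex chromosomes explicitly so indexcov treats them separately from autosomes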
            cmd = "goleft indexcov --directory {tmp_dir} {gender_args} -- {bam_file}"
            try:
                do.run(cmd.format(**locals()), "QC: goleft indexcov")
            except subprocess.CalledProcessError as msg:
                if not ("indexcov: no usable" in str(msg) or
                        ("indexcov: expected" in str(msg) and "sex chromosomes, found:" in str(msg))):
                    raise
            for out_file in out_files:
                orig_file = os.path.join(tmp_dir, os.path.basename(out_file))
                if utils.file_exists(orig_file):
                    utils.copy_plus(orig_file, out_file)
    # MultiQC needs non-gzipped/BED inputs so unpack the file
    out_bed = out_files[-1].replace(".bed.gz", ".tsv")
    if utils.file_exists(out_files[-1]) and not utils.file_exists(out_bed):
        with transaction.file_transaction(data, out_bed) as tx_out_bed:
            cmd = "gunzip -c %s > %s" % (out_files[-1], tx_out_bed)
            do.run(cmd, "Unpack indexcov BED file")
    out_files[-1] = out_bed
    return [x for x in out_files if utils.file_exists(x)]
Example #49
0
def dedup_bam(in_bam, data):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = os.path.join(
            utils.safe_makedir(
                os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
            "%s-dedup%s" % utils.splitext_plus(os.path.basename(in_bam)))
        if not utils.file_exists(out_file):
            with tx_tmpdir(data) as tmpdir:
                with file_transaction(data, out_file) as tx_out_file:
                    bammarkduplicates = config_utils.get_program(
                        "bammarkduplicates", data["config"])
                    base_tmp = os.path.join(
                        tmpdir,
                        os.path.splitext(os.path.basename(tx_out_file))[0])
                    cores, mem = _get_cores_memory(data, downscale=2)
                    cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                           "markthreads={cores} I={in_bam} O={tx_out_file}")
                    do.run(cmd.format(**locals()),
                           "De-duplication with biobambam")
        bam.index(out_file, data["config"])
        return out_file
    else:
        return in_bam
Example #50
0
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None, samples=None):
    """
    Run toplevel analysis, processing a set of input files.
    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    samples -- Pre-processed samples, useful if run inside of docker containers.
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = run_info.setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    if samples:
        dockerized = True
    else:
        dockerized = False
        samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_samples_with_pipelines(samples)
    final = []
    with tx_tmpdir(config) as tmpdir:
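        # route all tempfile usage for the run through the transactional directory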
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            if not dockerized:
                versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
Example #51
0
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." %
                       (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
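    # {out_tx} is filled in below, once the transactional output directory exists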
    cmd = ("{export} mirtop gff  --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()),
                   "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) \
                                    else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
                        else out_gff_fn
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
Example #52
0
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(work_dir, "%s%s.vcf"
                            % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                full_bams = ",".join(full_bams)
                sr_bams = ",".join(sr_bams)
                disc_bams = ",".join(disc_bams)
                exclude = "-x %s" % sv_exclude_bed if (sv_exclude_bed and utils.file_exists(sv_exclude_bed)) else ""
                ref_file = dd.get_ref_file(items[0])
                depths = []
                for sample, ev_files in previous_evidence.items():
                    for ev_type, ev_file in ev_files.items():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if len(depths) > 0 else ""
                # use our bcbio python for runs within lumpyexpress
                exports = utils.local_path_export()
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
Example #53
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
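    # for split alignments, read the FASTQ subset through a named-pipe command and combine the pieces later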
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Example #54
0
def run(data):
    """Quantitaive isoforms expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
Example #55
0
def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data,
                                                        work_dir, pad=1e4)

            raw_target_bed = bedutils.merge_overlaps(base_regions, data,
                                                     out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)

            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}

            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]

            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files
Example #56
0
def test_makes_base_tmp_dir(self, mock_io):
    """
    Test that tx_tmpdir creates a base temporary directory
    """
    with tx_tmpdir(None):
        pass
    transaction.utils.safe_makedir.assert_called_once_with(
        transaction.utils.get_abspath.return_value)
Example #57
0
@contextlib.contextmanager
def bedtools_tmpdir(data):
    with tx_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
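        # point pybedtools at the transactional directory; the original setting is restored after the block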
        pybedtools.set_tempdir(tmpdir)
        yield
        if orig_tmpdir and os.path.exists(orig_tmpdir):
            pybedtools.set_tempdir(orig_tmpdir)
        else:
            tempfile.tempdir = None
Example #58
0
def test_gets_base_tmpdir_name_from_config_or_cwd(self, mock_io, mocker):
    mocker.patch('bcbio.distributed.transaction._get_base_tmpdir')
    data = mock.Mock()
    with tx_tmpdir(data):
        pass
    cwd = transaction.os.getcwd.return_value
    transaction._get_base_tmpdir.assert_called_once_with(
        data, cwd)
    base_tmpdir = transaction._get_base_tmpdir.return_value
    transaction.utils.get_abspath.assert_called_once_with(base_tmpdir)