Example #1
def run(data):
    """Quantify isoform expression with eXpress.

    Runs eXpress on the transcriptome-aligned BAM of the sample, using the
    FASTA that sits next to the tophat transcriptome index (passed through
    ``_do_fasta``), and then extracts individual columns of the result table
    into separate files with ``_get_column``.

    Returns:
        A tuple ``(eff_count_file, tpm_file, fpkm_file)`` built from columns
        7, 14 and 10 of the ``.xprs`` table, or ``None`` when the tophat
        index or the transcriptome BAM is not available.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    tophat_index = get_in(data, ('genome_resources', 'rnaseq', 'transcriptome_index', 'tophat'))
    if not tophat_index:
        logger.info("Tophat index not found, skipping running eXpress.")
        return None
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return None
    # Swap only the trailing "ver" for "fa". A blanket str.replace would also
    # rewrite any "ver" that happens to occur earlier in the path.
    index_suffix = "ver"
    if tophat_index.endswith(index_suffix):
        tophat_fa = tophat_index[:-len(index_suffix)] + "fa"
    else:
        tophat_fa = tophat_index
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # All outputs share this prefix; building them from it (rather than with
    # str.replace on the full path) keeps directory names untouched.
    out_base = os.path.join(out_dir, name)
    out_file = out_base + ".xprs"
    express = config_utils.get_program("express", data['config'])
    if not file_exists(out_file):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with tx_tmpdir() as tmp_dir:
            ref_transcript = _do_fasta(tophat_fa)
            # Write into the private temporary directory so that samples
            # processed in parallel do not clobber a shared ./results.xprs.
            cmd = "{express} -o {tmp_dir} {ref_transcript} {in_bam}"
            do.run(cmd.format(express=express, tmp_dir=tmp_dir,
                              ref_transcript=ref_transcript, in_bam=in_bam),
                   "Run express", {})
            shutil.move(os.path.join(tmp_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_base + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_base + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_base + ".fpkm", 10)
    return (eff_count_file, tpm_file, fpkm_file)
def report_summary(samples, run_parallel):
    """Run coverage report with bcbiocov package.

    Works inside a "report" directory below the work directory of the first
    sample: copies the qsignature matrix there when it exists, creates the
    "fastqc", "coverage" and "variants" sub-directories, dispatches the
    "coverage_report" step through ``run_parallel`` and finally merges the
    metrics with ``_merge_metrics``.

    Returns the samples as returned by ``_merge_metrics``.
    """
    work_dir = dd.get_work_dir(samples[0][0])

    parent_dir = utils.safe_makedir(os.path.join(work_dir, "report"))
    qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
    with utils.chdir(parent_dir):

        logger.info("copy qsignature")
        # qsignature_fn is always a non-empty path here, so only the
        # existence checks below decide whether the copy happens.
        if qsignature_fn:
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        # NOTE(review): the body of this `with` block is missing in this copy
        # of the source -- presumably the fastqc merge step; confirm upstream.
        with utils.chdir(out_dir):

        out_dir = utils.safe_makedir("coverage")
        out_dir = utils.safe_makedir("variants")
        samples = run_parallel("coverage_report", samples)

            # NOTE(review): these two lines are indented as if they belonged
            # to a try/except around the optional bcbreport import, but the
            # `try:` / `except` lines are not present here -- confirm upstream.
            import bcbreport.prepare as bcbreport
            logger.info("skipping report. No bcbreport installed.")

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

    return samples
Example #3
def coverage(data):
    """Calculate coverage at different completeness cutoffs
    for the regions given in the coverage option.

    Runs ``sambamba depth region`` over the coverage BED file and stores the
    absolute path of the resulting per-region table in ``data['coverage']``.

    Returns ``data``; it is returned unchanged when no coverage BED file is
    configured.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = sample + "_coverage.bed"
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                # Redirect into the transactional file: writing straight to
                # parse_file would leave a partial result behind on failure
                # and defeat the file_transaction wrapper.
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file}  {in_bam} | sed 's/# chrom/chrom/' > {out_tx}")
                do.run(cmd.format(cores=cores, bed_file=bed_file,
                                  in_bam=in_bam, out_tx=out_tx),
                       "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #4
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.

    When ``out_file`` is not given it defaults to
    ``<first BAM without extension>-paired-variants.vcf``. If the output does
    not exist yet, qSNP is run inside a temporary directory (optionally on
    BAMs restricted to ``region``), and the result is filtered, bgzipped and
    indexed.

    Returns the path of the final output file.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        # qSNP writes plain VCF; compression is applied afterwards.
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir(config) as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = "{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}"
                    do.run(cl.format(qsnp=qsnp, mem=mem, qsnp_init=qsnp_init, qsnp_log=qsnp_log),
                           "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    return out_file
Example #5
def _generate_qseq(bc_dir, config):
    """Generate qseq files from illumina bcl files if not present.

    More recent Illumina updates do not produce qseq files. Illumina's
    offline base caller (OLB) generates these starting with bcl,
    intensity and filter files.

    Nothing is done when ``finished.txt`` already exists in ``bc_dir``.
    Otherwise OLB's ``setupBclToQseq.py`` is run, followed by the configured
    make command (``config["program"]["olb_make"]``, default ``make``) inside
    ``bc_dir``.

    Raises:
        subprocess.CalledProcessError: if either command exits non-zero.
    """
    if os.path.exists(os.path.join(bc_dir, "finished.txt")):
        return
    bcl2qseq_log = os.path.join(config["log_dir"], "setupBclToQseq.log")
    cmd = os.path.join(config["program"]["olb"], "bin", "setupBclToQseq.py")
    cl = [cmd, "-L", bcl2qseq_log, "-o", bc_dir, "--in-place", "--overwrite",
          "--ignore-missing-stats", "--ignore-missing-control"]
    # in OLB version 1.9, the -i flag changed to intensities instead of input
    version_cl = [cmd, "-v"]
    p = subprocess.Popen(version_cl, stdout=subprocess.PIPE)
    (out, _) = p.communicate()
    # Last whitespace-separated token with its final dotted component
    # dropped, i.e. "major.minor" as a float.
    olb_version = float(out.strip().split()[-1].rsplit(".", 1)[0])
    if olb_version > 1.8:
        cl += ["-P", ".clocs"]
        cl += ["-b", bc_dir]
    else:
        # The old and new flag sets are mutually exclusive.
        cl += ["-i", bc_dir, "-p", os.path.split(bc_dir)[0]]
    # Run the setup step before `cl` is reused for the make command below;
    # without this the setup command was built and then discarded.
    subprocess.check_call(cl)
    with utils.chdir(bc_dir):
        processors = config["algorithm"].get("num_cores", 8)
        cl = config["program"].get("olb_make", "make").split() + ["-j", str(processors)]
        subprocess.check_call(cl)
Example #6
def _download_prepped_genome(genome_build, data, name, need_remap):
    """Get a pre-prepared genome from S3, unpacking it locally.

    Supports runs on AWS where we can retrieve the resources on demand.

    The target directory is ``<work>/inputs/data/genomes/<build>/<target>``,
    where ``target`` is ``name`` mapped through ``REMAP_NAMES``. When it is
    missing, it is either built in place (targets listed in
    ``INPLACE_INDEX``) or downloaded and unpacked from S3.

    Returns the genome FASTA path when ``need_remap`` is set or ``name`` is
    "samtools"; otherwise the common file prefix of the index files in the
    target directory.
    """
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "inputs", "data", "genomes"))
    target = REMAP_NAMES.get(name, name)
    ref_dir = os.path.join(out_dir, genome_build, target)
    if not os.path.exists(ref_dir):
        if target in INPLACE_INDEX:
            ref_file = glob.glob(os.path.normpath(os.path.join(ref_dir, os.pardir, "seq", "*.fa")))[0]
            INPLACE_INDEX[target](ref_file, ref_dir, data)
        else:
            # Only fetch from S3 when the index cannot be built locally.
            with utils.chdir(out_dir):
                bucket = S3_INFO["bucket"]
                key = S3_INFO["key"].format(build=genome_build, target=target)
                cmd = "gof3r get --no-md5 -k {key} -b {bucket} | pigz -d -c | tar -xvp"
                do.run(cmd.format(key=key, bucket=bucket),
                       "Download pre-prepared genome data: %s" % genome_build)
    genome_dir = os.path.join(out_dir, genome_build)
    # Test builds keep the "-test" suffix in directory names but not in the
    # FASTA file name.
    genome_build = genome_build.replace("-test", "")
    if need_remap or name == "samtools":
        return os.path.join(genome_dir, "seq", "%s.fa" % genome_build)
    base_name = os.path.commonprefix(os.listdir(ref_dir)).rstrip(".")
    return os.path.join(ref_dir, base_name)
Example #7
def copy_flowcell(dname, fastq_dir, sample_cfile, config):
    """Copy required files for processing using rsync, potentially to a remote server.

    Writes an rsync include list (``transfer_files.txt`` inside ``dname``)
    naming the configuration, fastq, run information and report files, then
    assembles an rsync command line targeting the configured processing
    directory, prefixed with ``user@host:`` when a process host is set.
    """
    # NOTE(review): the four assignments below are truncated in this copy of
    # the source (argument lists are cut off and brackets do not balance), so
    # the exact set of files collected cannot be determined from here.
    with utils.chdir(dname):
        reports = reduce(operator.add,
                          ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                           "Data/Status.htm", "Data/Status_Files", "InterOp"]])
        run_info = reduce(operator.add,
        fastq = glob.glob(os.path.join(fastq_dir.replace(dname + "/", "", 1),
        configs = [sample_cfile.replace(dname + "/", "", 1)]
    include_file = os.path.join(dname, "transfer_files.txt")
    with open(include_file, "w") as out_handle:
        # rsync filter rules: descend into every directory, include the
        # listed files, exclude everything else.
        out_handle.write("+ */\n")
        for fname in configs + fastq + run_info + reports:
            out_handle.write("+ %s\n" % fname)
        out_handle.write("- *\n")
    # remote transfer
    if utils.get_in(config, ("process", "host")):
        dest = "%s@%s:%s" % (utils.get_in(config, ("process", "username")),
                             utils.get_in(config, ("process", "host")),
                             utils.get_in(config, ("process", "dir")))
    # local transfer
    # NOTE(review): an `else:` appears to be missing before the next line; as
    # written it sits inside the `if` and overwrites the remote destination.
        dest = utils.get_in(config, ("process", "dir"))
    cmd = ["rsync", "-akmrtv", "--include-from=%s" % include_file, dname, dest]
    logger.info("Copying files to analysis machine")
    logger.info(" ".join(cmd))
    # NOTE(review): the command is only logged in the visible code; the call
    # that executes it is not present here -- confirm upstream.
Example #8
def _generate_fastq(fc_dir, config, compress_fastq):
    """Generate fastq files for the current flowcell.

    The fastq directory is the flowcell's own fastq directory, or
    ``<postprocess_dir>/<flowcell name>/fastq`` when ``postprocess_dir`` is
    configured.

    Returns the fastq directory path.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    postprocess_dir = config.get("postprocess_dir", "")
    if postprocess_dir:
        fastq_dir = os.path.join(postprocess_dir, os.path.basename(fc_dir), "fastq")

    if not fastq_dir == fc_dir:# and not os.path.exists(fastq_dir):

        with utils.chdir(basecall_dir):
            # NOTE(review): the next two statements are truncated in this copy
            # of the source (the comprehension and the list literal are never
            # closed), so the lane detection and full command line are unknown.
            lanes = sorted(list(set([f.split("_")[1] for f in
            cl = ["solexa_qseq_to_fastq.py", short_fc_name,
            if postprocess_dir:
                cl += ["-o", fastq_dir]
            if compress_fastq:
                cl += ["--gzip"]

            logger2.debug("Converting qseq to fastq on all lanes.")
            # NOTE(review): no call executing `cl` is visible here -- confirm
            # upstream.

    return fastq_dir
Example #9
def generate_align_summary(bam_file, is_paired, sam_ref, sample_name, config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.

    Graphs and summary statistics are computed inside a temporary directory
    below ``dirs["work"]``; the report itself is generated afterwards, still
    from within the work directory.

    Returns whatever ``_generate_pdf`` returns.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = _graphs_and_summary(
                bam_file, sam_ref, is_paired, tmp_dir, config)
        return _generate_pdf(graphs, summary, overrep, bam_file, sample_name, dirs, config)
Example #10
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale, need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart.

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))

    Returns the path of the GenomicsDB workspace directory, named after
    ``out_file`` with a ``_genomicsdb`` suffix.
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            # Half-finished database from an earlier attempt: remove it so
            # the import restarts from a clean workspace.
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            # The transaction is kept for its setup/cleanup only; the import
            # writes directly to out_dir (see docstring).
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                # A relative workspace path keeps the argument short, which
                # matters because of the long-path crash noted above.
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
Example #11
def _mint_trna_annotation(data):
    """Use MINTmap to quantify tRNAs.

    MINTmap is looked up as ``MINTmap.pl`` next to the running Python
    executable. It is run on the sample's ``clean_fastq`` file inside a
    temporary directory and every produced ``*MINTmap*`` file is moved into
    ``<work_dir>/trna_mint/<sample>``.

    Returns that output directory; it is returned without running anything
    when the lookup table or the MINTmap script is missing.
    """
    # Use the configured paths as-is: wrapping a single value in op.join is a
    # no-op for strings and fails on None under Python 3.
    trna_lookup = dd.get_srna_mint_lookup(data)
    trna_space = dd.get_srna_mint_space(data)
    trna_other = dd.get_srna_mint_other(data)
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(
                           perl_export=perl_export, mintmap=mintmap, in_file=in_file,
                           name=name, trna_lookup=trna_lookup, trna_space=trna_space,
                           jar_folder=jar_folder, trna_other=trna_other)
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #12
def run(bam_file, data, out_dir):
    """Generate Picard hybrid-selection metrics for a BAM file.

    Does nothing unless "picard" is listed in the sample's ``tools_on``.
    The BAM is symlinked into ``out_dir`` and ``PicardMetrics.report`` is run
    from there; afterwards the ``-sort.bam`` suffix is stripped from names
    inside the metrics file.

    Returns:
        ``{}`` when picard is not enabled, otherwise the path of the
        ``<sample>-sort.hs_metrics`` file in ``out_dir``.
    """
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            # os.path.exists() is False for a dangling link, which would make
            # os.symlink fail with EEXIST; drop such a stale link first.
            if os.path.lexists(cur_bam) and not os.path.exists(cur_bam):
                os.remove(cur_bam)
            if not os.path.lexists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
Example #13
def _generate_qseq(bc_dir, config):
    """Generate qseq files from illumina bcl files if not present.

    More recent Illumina updates do not produce qseq files. These can be
    generated from bcl, intensity and filter files with tools from
    the offline base caller OLB.

    Nothing is done when ``finished.txt`` already exists in ``bc_dir``.
    Otherwise OLB's ``setupBclToQseq.py`` is run, followed by the configured
    make command (``config["program"]["olb_make"]``, default ``make``) inside
    ``bc_dir``.

    Raises:
        subprocess.CalledProcessError: if either command exits non-zero.
    """
    if os.path.exists(os.path.join(bc_dir, "finished.txt")):
        return
    log.info("Generating qseq files at %s" % bc_dir)
    bcl2qseq_log = os.path.join(config["log_dir"], "setupBclToQseq.log")
    cmd = os.path.join(config["program"]["olb"], "bin", "setupBclToQseq.py")
    cl = [cmd, "-L", bcl2qseq_log, "-o", bc_dir, "--in-place", "--overwrite"]
    # in OLB version 1.9, the -i flag changed to intensities instead of input
    version_cl = [cmd, "-v"]
    p = subprocess.Popen(version_cl, stdout=subprocess.PIPE)
    (out, _) = p.communicate()
    # Last whitespace-separated token with its final dotted component
    # dropped, i.e. "major.minor" as a float.
    olb_version = float(out.strip().split()[-1].rsplit(".", 1)[0])
    if olb_version > 1.8:
        cl += ["-b", bc_dir]
    else:
        # The old and new flag sets are mutually exclusive.
        cl += ["-i", bc_dir, "-p", os.path.split(bc_dir)[0]]
    # Run the setup step before reporting success and before `cl` is reused
    # for the make command below.
    subprocess.check_call(cl)
    log.info("Qseq files generated.")
    with utils.chdir(bc_dir):
        try:
            processors = config["algorithm"]["num_cores"]
        except KeyError:
            processors = 8
        cl = config["program"].get("olb_make", "make").split() + ["-j", str(processors)]
        subprocess.check_call(cl)
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/QC pipeline.

    FastQC is skipped when ``fastqc_report.html`` already exists in
    ``fastqc_out``.

    Returns the summary produced by ``FastQCParser.get_fastqc_summary``.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        # Clear an incomplete earlier result; moving onto an
                        # existing directory would nest the new output in it.
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            # The downsampled BAM is only an intermediate for FastQC.
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
Example #15
def coverage(data):
    """Calculate coverage at different completeness cutoffs
    for the regions given in the coverage option.

    Runs ``sambamba depth region`` over a de-commented copy of the coverage
    BED file and stores the absolute path of the resulting per-region table
    in ``data['coverage']``.

    Returns ``data``; it is returned unchanged when no coverage BED file is
    configured.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = sample + "_coverage.bed"
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir):
                with file_transaction(parse_file) as out_tx:
                    # "-C 1000" is written into the template directly: applying
                    # %-interpolation after .format() would choke on any "%"
                    # contained in the substituted paths.
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "-C 1000 -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(sambamba=sambamba, cores=cores, cleaned_bed=cleaned_bed,
                                      in_bam=in_bam, out_tx=out_tx),
                           "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
Example #16
def _run_biodata_upload(args):
    """Manage preparation of biodata on a local machine, uploading to S3 in pieces.

    With ``args.prepped`` set, the already prepared data for the genome is
    only downloaded so it can be edited locally; nothing is uploaded.
    Otherwise each genome is (re)built through the bcbio ``upgrade`` command
    inside the docker image and its ``seq`` and aligner directories are
    uploaded.
    """
    args = defaults.update_check_args(args, "biodata not uploaded")
    args = install.docker_image_arg(args)
    for gbuild in args.genomes:
        print("Preparing %s" % gbuild)
        if args.prepped:
            for target in ["samtools"] + args.aligners:
                genome.download_prepped_genome(gbuild, {}, target, False, args.prepped)
            print("Downloaded prepped %s to %s. Edit and re-run without --prepped to upload"
                  % (gbuild, args.prepped))
            # Stop here: as the message says, uploading needs a separate run
            # without --prepped.
            return
        cl = ["upgrade", "--genomes", gbuild]
        for a in args.aligners:
            cl += ["--aligners", a]
        dmounts = mounts.prepare_system(args.datadir, DOCKER["biodata_dir"])
        manage.run_bcbio_cmd(args.image, dmounts, cl)
        print("Uploading %s" % gbuild)
        gdir = _get_basedir(args.datadir, gbuild)
        basedir, genomedir = os.path.split(gdir)
        assert genomedir == gbuild
        with utils.chdir(basedir):
            all_dirs = sorted(os.listdir(gbuild))
            _upload_biodata(gbuild, "seq", all_dirs)
            for aligner in args.aligners:
                _upload_biodata(gbuild, genome.REMAP_NAMES.get(aligner, aligner), all_dirs)
Example #17
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    Outputs are written next to the alignments as
    ``<sample>-coverage.depth.bed`` and ``<sample>-coverage.callable.bed``
    and are regenerated when the callable file is older than the BAM.

    Returns:
        A tuple ``(depth_file, final_callable, highdepth, avg_cov)`` where
        ``final_callable`` is the callable file subset to the variant
        regions, ``highdepth`` is the result of ``_extract_highdepth`` on it
        and ``avg_cov`` the average coverage over the variant regions.
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    sample = dd.get_sample_name(data)
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", sample)),
        "%s-coverage" % sample)
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                # goleft derives both outputs from one prefix, so the callable
                # file lands next to the transactional depth file and is
                # moved into place by hand below.
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                tx_prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", tx_prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % sample
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
Example #18
def run_has_samplesheet(fc_dir, config, require_single=True):
    """Checks if there's a suitable SampleSheet.csv present for the run.

    Every ``*.csv`` file in the existing directories listed under
    ``config["samplesheet_directories"]`` is scanned for flowcell ids, and
    the id closest to this run's flowcell name is picked.

    Returns the path to the samplesheet if one is found, None otherwise.
    """
    fc_name, _ = get_flowcell_info(fc_dir)
    sheet_dirs = config.get("samplesheet_directories", [])
    fcid_sheet = {}
    for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
        with utils.chdir(ss_dir):
            for ss in glob.glob("*.csv"):
                for fcid in _get_flowcell_id(ss, require_single):
                    if fcid:
                        fcid_sheet[fcid] = os.path.join(ss_dir, ss)

    # difflib handles human errors while entering data on the SampleSheet.
    # Only one best candidate is returned (if any). 0.85 cutoff allows for
    # maximum of 2 mismatches in fcid
    potential_fcids = difflib.get_close_matches(fc_name, list(fcid_sheet), 1, 0.85)
    if potential_fcids and potential_fcids[0] in fcid_sheet:
        return fcid_sheet[potential_fcids[0]]
    return None
Example #19
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """Calculate coverage at different completeness cutoffs
    for the regions given in the coverage option.

    Args:
        data: sample dictionary.
        out_dir: directory receiving the outputs; created when missing.
        extra_cutoffs: optional iterable of additional depth thresholds to
            pass to sambamba on top of the built-in ones.

    Returns:
        Absolute paths of the files produced by ``_calculate_percentiles``,
        or an empty list when no usable coverage BED file is configured.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = sample + "_coverage.bed"
        is_current = (utils.file_uptodate(parse_file, cleaned_bed) and
                      utils.file_uptodate(parse_file, in_bam))
        # Recompute only when the table is missing or older than its inputs;
        # the previous condition was inverted.
        if not is_current:
            with file_transaction(data, parse_file) as out_tx:
                # extra_cutoffs defaults to None, which cannot be combined
                # with a set directly.
                depth_thresholds = sorted(cutoffs | set(extra_cutoffs or ()))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
        # Resolve while still inside work_dir so that any relative name is
        # anchored to the directory the files were written to.
        return [os.path.abspath(x) for x in out_files]
Example #20
def run(name, chip_bam, input_bam, genome_build, out_dir, config):
    """Run macs2 for chip and input samples avoiding
    errors due to samples.

    The macs2 output ``<name>_peaks.xls`` is renamed to
    ``<name>_peaks_macs2.xls`` so that the caller name is part of the file
    name. Nothing is run when that file already exists.

    Returns the path of the renamed peaks file.

    Raises:
        ValueError: if the genome build has no pre-set genome size and no
            ``-g`` option is supplied through the macs2 resources.
        RuntimeWarning: if the macs2 command fails.
    """
    # output file name need to have the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    has_genome_size = options.find("-g") > -1
    if genome_build not in HS and not has_genome_size:
        raise ValueError("This %s genome doesn't have a pre-set value. "
                         "You can add specific values using resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check Chip-seq configuration in "
                         "bcbio-nextgen documentation." % genome_build)

    genome_size = "" if has_genome_size else "-g %s" % HS[genome_build]
    with utils.chdir(out_dir):
        cmd = _macs2_cmd()
        try:
            # The template comes from _macs2_cmd and is filled from the local
            # variables of this function, so their names must stay as they are.
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in the "
                                 "bcbio-nextgen documentation.")
    return out_file
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.

    Using only first paired reads.

    The run is skipped when ``<qc_dir>/kraken/kraken_out`` already exists.

    Returns the metrics produced by ``_parse_kraken_output``, or
    ``{"kraken_report": "null"}`` when the "minikraken" database is requested
    but not installed.
    """
    logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                # NOTE(review): `db` is not passed on the command line here;
                # confirm how kraken is expected to locate its database.
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--out", out, files[0], " 2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    # Clear an incomplete earlier result; moving onto an
                    # existing directory would nest the new output in it.
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
Example #22
def _start_processing(dname, sample_file, config):
    """Initiate processing: on a remote server or locally on a cluster.

    Paths are first remapped from ``dname`` to the corresponding directory
    below the configured process directory. Processing is then started
    either with a GET request to ``<server>/run`` or by writing a batch
    script from the ``bcbio_batch`` template and submitting it with
    ``submit_cmd``.

    Raises:
        ValueError: if the ``process`` configuration defines neither a
            server nor a ``submit_cmd`` / ``bcbio_batch`` pair.
    """
    to_remote = _remap_dirname(dname, os.path.join(utils.get_in(config, ("process", "dir")),
                                                   os.path.basename(dname)))
    args = {
        "work_dir": to_remote(os.path.join(dname, "analysis")),
        "run_config": to_remote(sample_file),
        "fc_dir": to_remote(dname),
    }
    server = utils.get_in(config, ("process", "server"))
    # call a remote server
    if server:
        print("%s/run?args=%s" % (server, json.dumps(args)))
        requests.get(url="%s/run" % server, params={"args": json.dumps(args)})
    # submit to a cluster scheduler
    elif "submit_cmd" in config["process"] and "bcbio_batch" in config["process"]:
        with utils.chdir(utils.safe_makedir(args["work_dir"])):
            batch_script = "submit_bcbio.sh"
            with open(batch_script, "w") as out_handle:
                out_handle.write(
                    config["process"]["bcbio_batch"].format(fcdir=args["fc_dir"],
                                                            run_config=args["run_config"]))
            submit_cmd = utils.get_in(config, ("process", "submit_cmd"))
            # The submit command is a shell template from trusted local
            # configuration, hence shell=True.
            subprocess.check_call(submit_cmd.format(batch_script=batch_script), shell=True)
    else:
        raise ValueError("Unexpected processing approach: %s" % config["process"])
Example #23
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to reference directory.

    For every genome build found under ``galaxy_dir`` the snpEff database
    named in its ``<build>-resources.yaml`` is downloaded and unpacked when
    it is not present yet. The download URL is taken from
    ``remotes["snpeff_dl_url"]``, filled with the installed snpEff version
    and the database name.
    """
    version_chars = set(string.digits + ".")
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
        with open(resource_file) as in_handle:
            # safe_load: the resources file is plain data, and a bare
            # yaml.load() without a Loader is both unsafe and rejected by
            # current PyYAML releases.
            resources = yaml.safe_load(in_handle)
        snpeff_db, snpeff_base_dir = effects.get_db(ref_file, resources)
        if not snpeff_db:
            continue
        snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
        if os.path.exists(snpeff_db_dir):
            continue
        print("Installing snpEff database %s in %s" % (snpeff_db, snpeff_base_dir))
        tooldir = args.tooldir or get_defaults()["tooldir"]
        config = {"resources": {"snpeff": {"jvm_opts": ["-Xms500m", "-Xmx1g"],
                                           "dir": os.path.join(tooldir, "share", "java", "snpeff")}}}
        raw_version = programs.java_versioner("snpeff", "snpEff",
                                              stdout_flag="snpEff version SnpEff")(config)
        # Keep digits and dots only, then use "_" as separator to match the
        # naming used in the download URL.
        snpeff_version = "".join(x for x in raw_version if x in version_chars).replace(".", "_")
        dl_url = remotes["snpeff_dl_url"].format(snpeff_ver=snpeff_version, genome=snpeff_db)
        dl_file = os.path.basename(dl_url)
        with utils.chdir(snpeff_base_dir):
            subprocess.check_call(["wget", "-c", "-O", dl_file, dl_url])
            subprocess.check_call(["unzip", dl_file])
        # The archive unpacks into data/<db>; move it up one level and drop
        # the then-empty data directory.
        dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
        os.rename(dl_dir, snpeff_db_dir)
        os.rmdir(os.path.join(snpeff_base_dir, "data"))
Example #24
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.

    fastq1, fastq2 -- input fastq files; ``fastq2`` may be None for single end.
    multiplex -- list of dictionaries with ``barcode_id`` and ``name`` keys;
      when empty the inputs are returned unsplit.
    base_name -- prefix for the barcode directory and all output files.
    dirs -- dictionary with the ``work`` directory.
    config -- configuration with ``program``/``barcode`` and the
      ``algorithm`` barcode settings.

    Returns a list of ``(barcode_id, name, fastq1, fastq2)`` tuples, limited
    to barcodes whose first-read output file exists after splitting.
    """
    if not multiplex:
        return [("", "", fastq1, fastq2)]
    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    nomatch_file = "%s_unmatched_1_fastq.txt" % base_name
    metrics_file = "%s_bc.metrics" % base_name
    out_files = []
    for info in multiplex:
        bc_file1, bc_file2 = [
            os.path.join(bc_dir, "%s_%s_%s_fastq.txt" % (base_name, info["barcode_id"], read))
            for read in ("1", "2")]
        if not fastq2:
            bc_file2 = None
        out_files.append((info["barcode_id"], info["name"], bc_file1, bc_file2))
    with utils.chdir(bc_dir):
        if not os.path.exists(nomatch_file) and not os.path.exists(metrics_file):
            tag_file = _make_tag_file(multiplex)
            # NOTE(review): the input arguments and the three option flags below
            # were missing from the damaged source and are restored from the
            # conditions that guard them -- confirm against the barcode program.
            cl = [config["program"]["barcode"], tag_file,
                  "%s_--b--_--r--_fastq.txt" % base_name,
                  fastq1]
            if fastq2:
                cl.append(fastq2)
            cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
            cl.append("--metrics=%s" % metrics_file)
            if int(config["algorithm"]["bc_read"]) == 2:
                cl.append("--second")
            if int(config["algorithm"]["bc_position"]) == 5:
                cl.append("--five")
            if config["algorithm"].get("bc_allow_indels", True) is False:
                cl.append("--noindel")
            with utils.file_transaction(out_files + [nomatch_file, metrics_file]):
                subprocess.check_call(cl)
    # Barcodes without any matching reads produce no file; drop them.
    out_files = [(b, n, f1, f2) for (b, n, f1, f2) in out_files if os.path.exists(f1)]
    return out_files
Example #25
def run(data):
    """Run miRDeep2 on the collapsed reads and parse novel miRNAs.

    data -- nested list of sample dictionaries; only ``data[0][0]`` is read
      for the work directory, reference, miRBase location and species.

    Returns the value of ``_parse_novel`` when the miRDeep2 result file
    exists, otherwise None.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    genome = dd.get_ref_file(sample)
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(sample)))
    species = dd.get_species(sample)
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        # Format exactly once: a second pass would choke on any literal brace
        # contributed by a substituted value such as the perl exports.
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -d -P -t {species} -z res").format(
                   perl_exports=perl_exports, mirdeep2=mirdeep2, collapsed=collapsed,
                   genome=genome, bam_file=bam_file, mature=mature, hairpin=hairpin,
                   rfam_file=rfam_file, species=species)
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(sample))
            return novel_db
    return None
Example #26
def main(org_build, gtf_file, genome_fasta, genome_dir, cores, args):
    """Prepare RNA-seq resources for a genome build and package them.

    org_build -- genome build name, used for directory and index naming.
    gtf_file -- optional GTF annotation; prepared from the build when empty.
    genome_fasta -- optional genome FASTA; retrieved for the build when empty.
    genome_dir -- base directory for genomes, defaults to the current one.
    cores -- accepted for interface compatibility; not used in this function.
    args -- parsed arguments; ``args.tophat`` and ``args.kallisto`` toggle
      the optional index builds.
    """
    genome_dir = genome_dir if genome_dir else os.curdir
    build_dir = os.path.abspath(os.path.join(genome_dir, org_build))
    work_dir = os.path.join(build_dir, "tmpcbl")
    # Inputs are copied into the work directory before it is entered below.
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir)
    ens_version = supported_oldbuilds.get(org_build, ensembl_release)
    out_dir = os.path.join(build_dir,
                           "rnaseq-%s_%s" % (datetime.datetime.now().strftime("%Y-%m-%d"), ens_version))
    tophat_dir = os.path.join(out_dir, "tophat")
    gtf_file = os.path.abspath(gtf_file) if gtf_file else gtf_file

    if genome_fasta:
        genome_fasta = os.path.abspath(genome_fasta)
        work_fasta = os.path.join(work_dir, os.path.basename(genome_fasta))
        if not os.path.exists(work_fasta):
            shutil.copy(genome_fasta, work_fasta)
        genome_fasta = work_fasta

    with chdir(work_dir):
        if not genome_fasta:
            genome_fasta = get_genome_fasta(org_build)
        if not gtf_file:
            build = build_info[org_build]
            gtf_file = prepare_tx_gff(build, org_build)
            work_gtf = os.path.join(work_dir, "ref-transcripts.gtf")
            if not os.path.exists(work_gtf):
                shutil.copy(gtf_file, work_gtf)
            gtf_file = work_gtf
        gtf_file = clean_gtf(gtf_file, genome_fasta)
        db = _get_gtf_db(gtf_file)
        gtf_file = db_to_gtf(db, gtf_file)
        mask_gff = prepare_mask_gtf(gtf_file)
        rrna_gtf = prepare_rrna_gtf(gtf_file)
        if file_exists(rrna_gtf):
            gtf_to_interval(rrna_gtf, genome_fasta)
        if args.tophat:
            prepare_tophat_index(gtf_file, org_build, genome_fasta)
        transcriptome_fasta = make_transcriptome_fasta(gtf_file, genome_fasta)
        if args.kallisto:
            prepare_kallisto_index(transcriptome_fasta, org_build)
        cleanup(work_dir, out_dir, org_build)
        # Point the stable "rnaseq" name at the freshly dated output directory.
        # islink is checked first so a dangling link is also removed.
        rnaseq_dir = os.path.join(build_dir, "rnaseq")
        if os.path.islink(rnaseq_dir):
            os.unlink(rnaseq_dir)
        elif os.path.exists(rnaseq_dir):
            shutil.rmtree(rnaseq_dir)
        os.symlink(out_dir, rnaseq_dir)

    tar_dirs = [os.path.relpath(out_dir)]
    tarball = create_tarball(tar_dirs, org_build)
Example #27
def _files_to_copy(directory):
    """Retrieve files that should be remotely copied.

    directory -- sequencer run directory; paths are collected relative to it.

    Returns a pair of sorted lists: the files for a full (image re-analysis)
    copy, and the smaller reports/fastq/run-info subset.
    """
    # NOTE(review): the glob patterns were missing from the damaged source and
    # are restored as an assumption -- confirm against the run folder layout.
    with utils.chdir(directory):
        image_redo_files = (glob.glob("*.params") +
                            glob.glob("Images/L*/C*") +
                            ["RunInfo.xml", "runParameters.xml"])
        qseqs = (glob.glob("Data/Intensities/*.xml") +
                 glob.glob("Data/Intensities/BaseCalls/*qseq.txt"))
        reports = (glob.glob("*.xml") +
                   glob.glob("Data/Intensities/BaseCalls/*.xml") +
                   glob.glob("Data/Intensities/BaseCalls/*.xsl") +
                   glob.glob("Data/Intensities/BaseCalls/*.htm") +
                   ["Data/Intensities/BaseCalls/Plots", "Data/reports",
                    "Data/Status.htm", "Data/Status_Files", "InterOp"])
        run_info = glob.glob("run_info.yaml") + glob.glob("*.csv")
        logs = ["Logs", "Recipe", "Diag", "Data/RTALogs", "Data/Log.txt"]
        fastq = ["Data/Intensities/BaseCalls/fastq"]
    return (sorted(image_redo_files + logs + reports + run_info + qseqs),
            sorted(reports + fastq + run_info))
Example #28
def run(data):
    """Run miRDeep2 on the collapsed reads and parse novel miRNAs.

    Hairpin, mature and species default to "none"/"none"/"na" and are only
    filled in when the miRBase hairpin file exists. A failing miRDeep2 run is
    logged as a warning rather than aborting.

    data -- nested list of sample dictionaries; only ``data[0][0]`` is read.

    Returns the value of ``_parse_novel`` when the result file exists,
    otherwise None.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    genome = dd.get_ref_file(sample)
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(sample)
    if file_exists(dd.get_mirbase_hairpin(sample)):
        species = dd.get_species(sample)
        hairpin = dd.get_mirbase_hairpin(sample)
        mature = dd.get_mirbase_mature(sample)

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        # Format exactly once; a second pass would fail on literal braces in
        # any substituted value.
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} "
               "-f {rfam_file} -r simple -c -P -t {species} -z res").format(
                   perl_exports=perl_exports, mirdeep2=mirdeep2, collapsed=collapsed,
                   genome=genome, bam_file=bam_file, mature=mature, hairpin=hairpin,
                   rfam_file=rfam_file, species=species)
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd, "Running mirdeep2.")
            except Exception:
                # Best effort: novel miRNA detection is optional for the pipeline.
                logger.warning("mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(sample))
            return novel_db
    return None
Example #29
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to reference directory.

    Does nothing when no snpEff version can be determined. Otherwise, for each
    genome build with a ``<dbkey>-resources.yaml`` file, an out-of-date
    database directory is removed and a missing one is downloaded, unzipped
    and moved into place.

    galaxy_dir -- directory handed to ``genome.get_builds``.
    args -- parsed arguments, passed to ``effects.snpeff_version`` and
      ``_is_old_database``.
    remotes -- dictionary holding the ``snpeff_dl_url`` template, which takes
      ``snpeff_ver`` and ``genome`` fields.
    """
    snpeff_version = effects.snpeff_version(args)
    if not snpeff_version:
        return
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
        if not os.path.exists(resource_file):
            continue
        with open(resource_file) as in_handle:
            # safe_load: resource files are plain data and need no object construction.
            resources = yaml.safe_load(in_handle)
        snpeff_db, snpeff_base_dir = effects.get_db({"genome_resources": resources,
                                                     "reference": {"fasta": {"base": ref_file}}})
        if not snpeff_db:
            continue
        snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
        if os.path.exists(snpeff_db_dir) and _is_old_database(snpeff_db_dir, args):
            # Remove the stale database so the install step below replaces it.
            shutil.rmtree(snpeff_db_dir)
        if not os.path.exists(snpeff_db_dir):
            print("Installing snpEff database %s in %s" % (snpeff_db, snpeff_base_dir))
            dl_url = remotes["snpeff_dl_url"].format(
                snpeff_ver=snpeff_version.replace(".", "_"),
                genome=snpeff_db)
            dl_file = os.path.basename(dl_url)
            with utils.chdir(snpeff_base_dir):
                subprocess.check_call(["wget", "-c", "-O", dl_file, dl_url])
                subprocess.check_call(["unzip", dl_file])
            # The archive unpacks into data/<db>; move it up and drop the empty parent.
            dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
            shutil.move(dl_dir, snpeff_db_dir)
            os.rmdir(os.path.join(snpeff_base_dir, "data"))
Example #30
def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.

    Walks dictionaries, lists and tuples recursively. Any string naming an
    existing ``*-wf.tar.gz`` file is extracted (once) and replaced by the
    extracted directory, or by the aligner index files found inside it.

    xs -- value to process; dictionaries are updated in place, lists and
      tuples are returned as new lists.
    data -- sample dictionary used to look up the work directory.
    use_subdir -- extract below ``wf-inputs`` in the work directory instead
      of directly into the work directory.

    Returns the processed value.
    """
    if isinstance(xs, dict):
        for k, v in xs.items():
            xs[k] = unpack_tarballs(v, data, use_subdir)
    elif isinstance(xs, (list, tuple)):
        xs = [unpack_tarballs(x, data, use_subdir) for x in xs]
    elif isinstance(xs, six.string_types):
        if os.path.isfile(xs.encode("utf-8", "ignore")) and xs.endswith("-wf.tar.gz"):
            if use_subdir:
                tarball_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "wf-inputs"))
            else:
                tarball_dir = dd.get_work_dir(data)
            # "--" in the tarball name encodes path separators of the packed directory.
            out_dir = os.path.join(tarball_dir,
                                   os.path.basename(xs).replace("-wf.tar.gz", "").replace("--", os.path.sep))
            if not os.path.exists(out_dir):
                with utils.chdir(tarball_dir):
                    with tarfile.open(xs, "r:gz") as tar:
                        tar.extractall()
            assert os.path.exists(out_dir), out_dir
            # Default to representing output directory
            xs = out_dir
            # Look for aligner indices
            for fname in os.listdir(out_dir):
                if fname.endswith(DIR_TARGETS):
                    xs = os.path.join(out_dir, fname)
                elif fname.endswith(BASENAME_TARGETS):
                    base = os.path.join(out_dir, utils.splitext_plus(os.path.basename(fname))[0])
                    xs = glob.glob("%s*" % base)
    return xs
def analyze_locally(dname, post_config_file, fastq_dir):
    """Run analysis directly on the local machine.

    dname -- run directory, passed to the processing program.
    post_config_file -- configuration file naming the processing programs.
    fastq_dir -- fastq directory; the analysis runs in its sibling
      ``analysis`` directory.
    """
    assert fastq_dir is not None
    post_config = load_config(post_config_file)
    analysis_dir = os.path.join(fastq_dir, os.pardir, "analysis")
    with utils.chdir(analysis_dir):
        if post_config["algorithm"]["num_cores"] == "messaging":
            prog = post_config["analysis"]["distributed_process_program"]
        else:
            prog = post_config["analysis"]["process_program"]
        cl = [prog, post_config_file, dname]
        # Hand over the run information when the run directory provides it.
        run_yaml = os.path.join(dname, "run_info.yaml")
        if os.path.exists(run_yaml):
            cl.append(run_yaml)
        subprocess.check_call(cl)
Example #32
def main(cores=1):
    """Index input CRAMs, run squaring in scratch and copy the results back.

    cores -- number of cores for squaring; CRAM indexing is capped at six.
    """
    start_dir = os.getcwd()
    work_dir = utils.safe_makedir("/scratch/square")
    priorities = set(["1", "2"])
    list_file = get_input_list(start_dir, priorities)
    # Ensure input CRAMs are indexed; gets IO bound quickly so limit cores
    cram_cores = min(int(cores), 6)
    indexed = joblib.Parallel(cram_cores)(joblib.delayed(index_cram)(x)
                                          for x in find_crams(list_file))
    for cindex in indexed:
        # Function-call form prints identically on Python 2 and 3.
        print(cindex)
    with utils.chdir(work_dir):
        # NOTE(review): `name` and `ref_file` are not assigned in this function;
        # presumably module-level values -- confirm they are defined.
        out_file = run_squaring(list_file, name, ref_file, cores)
    # Copy the output and its tabix index back to the starting directory.
    for ext in ["", ".tbi"]:
        new_file = os.path.join(start_dir, os.path.basename(out_file) + ext)
        if not utils.file_exists(new_file):
            shutil.copy(out_file + ext, new_file)
Example #33
def main(config_file, queues=None, task_module=None, base_dir=None):
    """Start a distributed celery worker process.

    config_file -- configuration file to load.
    queues -- optional queue specification passed to ``run_celeryd``.
    task_module -- module with the exposed tasks; defaults to
      ``bcbio.distributed.tasks``.
    base_dir -- directory to run in; defaults to the current directory.
    """
    if base_dir is None:
        base_dir = os.getcwd()
    if task_module is None:
        task_module = "bcbio.distributed.tasks"
    config = load_config(config_file)
    if config.get("log_dir", None) is None:
        config["log_dir"] = os.path.join(base_dir, "log")
    logger.info("Starting distributed worker process: {0}".format(queues if queues else ""))
    with utils.chdir(base_dir):
        with utils.curdir_tmpdir() as work_dir:
            dirs = {"work": work_dir, "config": os.path.dirname(config_file)}
            # NOTE(review): the final argument was missing from the damaged
            # source; the absolute configuration path is an assumption --
            # confirm against create_celeryconfig's signature.
            with create_celeryconfig(task_module, dirs, config,
                                     os.path.abspath(config_file)):
                run_celeryd(work_dir, queues)
Example #34
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.

    Existing results are reused. Peak settings depend on the configured
    antibody, falling back to a blank setting otherwise. Local variable names
    are part of the contract here: the command template returned by
    ``_macs2_cmd`` is filled from ``locals()``.

    Returns the value of ``_get_output_files(out_dir)``.
    Raises RuntimeWarning when macs2 exits with an error.
    """
    # NOTE(review): the trailing `data` parameter was missing from the damaged
    # signature; it is restored because the body reads `data` throughout.
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
    if antibody:
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    # Respect a user supplied genome size in the sample options.
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            # NOTE(review): the closing URL was missing from the damaged
            # source and is restored as an assumption -- confirm the link.
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please, check the message and report "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
Example #35
def _run_cwltool(args):
    """Run with cwltool -- reference implementation.

    args -- parsed arguments providing ``directory``, ``no_container`` and
      ``toolargs``.
    """
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cwltool_work"))
    tmp_dir = utils.safe_makedir(os.path.join(work_dir, "tmpcwl"))
    # Keep temporary files below the work directory.
    os.environ["TMPDIR"] = tmp_dir
    flags = ["--tmpdir-prefix", tmp_dir, "--tmp-outdir-prefix", tmp_dir]
    if args.no_container:
        flags += [
            "--no-container", "--preserve-environment", "PATH",
            "--preserve-environment", "HOME"
        ]
    cmd = ["cwltool"] + flags + args.toolargs + ["--", main_file, json_file]
    with utils.chdir(work_dir):
        _run_tool(cmd, not args.no_container, work_dir)
Example #36
def _generate_fastq(fc_dir, config):
    """Generate fastq files for the current flowcell.

    fc_dir -- flowcell directory.
    config -- accepted for interface compatibility; not read here.

    Returns the fastq directory for the flowcell.
    """
    fc_name, fc_date = get_flowcell_info(fc_dir)
    short_fc_name = "%s_%s" % (fc_date, fc_name)
    fastq_dir = get_fastq_dir(fc_dir)
    basecall_dir = os.path.split(fastq_dir)[0]
    if not fastq_dir == fc_dir and not os.path.exists(fastq_dir):
        log.info("Generating fastq files for %s" % fc_dir)
        with utils.chdir(basecall_dir):
            # The lane is the second underscore separated field of qseq names.
            lanes = sorted(set(f.split("_")[1] for f in glob.glob("*qseq.txt")))
            cl = ["solexa_qseq_to_fastq.py", short_fc_name, ",".join(lanes)]
            log.info("Converting qseq to fastq on all lanes.")
            subprocess.check_call(cl)
            log.info("Qseq to fastq conversion completed.")
    return fastq_dir
Example #37
def _directory_tarball(dirname):
    """Create a tarball of a complex directory, avoiding complex secondaryFiles.

    Complex secondary files do not work on multiple platforms and are not portable
    to WDL, so for now we create a tarball that workers will unpack.

    dirname -- existing directory to package.

    Returns the path of the ``*-wf.tar.gz`` tarball, created if missing.
    """
    assert os.path.isdir(dirname)
    base_dir, tarball_dir = os.path.split(dirname)
    # Walk upwards until the parent holding "seq" is found, collecting the
    # relative path to package along the way.
    while base_dir and not os.path.exists(os.path.join(base_dir, "seq")):
        base_dir, extra_tarball = os.path.split(base_dir)
        tarball_dir = os.path.join(extra_tarball, tarball_dir)
    # Path separators are encoded as "--" in the tarball name.
    tarball = os.path.join(base_dir, "%s-wf.tar.gz" % (tarball_dir.replace(os.path.sep, "--")))
    if not utils.file_exists(tarball):
        with utils.chdir(base_dir):
            with tarfile.open(tarball, "w:gz") as tar:
                # Relative path keeps the archive unpackable anywhere.
                tar.add(tarball_dir)
    return tarball
Example #38
def priority_coverage(data):
    """Calculate per-base depth over the prioritized structural variant regions.

    Regions are processed in batches sized to stay below the maximum command
    line length, with each batch appended to the output file.

    data -- sample dictionary; returned unchanged when no prioritization BED
      is configured, otherwise with ``priority_coverage`` set to the output.
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    # Floor division keeps the batch size an integer on Python 3 as well.
    batch_size = max_command_length() // AVERAGE_REGION_STRING_LENGTH

    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_depth.bed")
    if file_exists(out_file):
        data['priority_coverage'] = os.path.abspath(out_file)
        return data
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        logger.debug("Calculating priority coverage for %s" % sample)
        region_bed = pybedtools.BedTool(bed_file)
        samtools = config_utils.get_program("samtools", data["config"])
        bedtools = config_utils.get_program("bedtools", data["config"])
        awk_string = r"""'BEGIN {OFS="\t"} {print $1,$2+$5,$2+$5,$4,$6"\t%s"}'""" % sample
        with file_transaction(out_file) as tx_out_file:
            for chunk in robust_partition_all(batch_size, region_bed):
                coord_batch = []
                line_batch = ""
                for line in chunk:
                    chrom = line.chrom
                    start = max(line.start, 0)
                    end = line.end
                    coords = "%s:%s-%s" % (chrom, start, end)
                    coord_batch.append(coords)
                    line_batch += "%s\t%s\t%s\n" % (chrom, start, end)
                if not coord_batch:
                    continue
                # NOTE(review): the BedTool arguments and the run call were
                # missing from the damaged source and are restored as
                # assumptions -- confirm against the pybedtools usage intended.
                region_file = pybedtools.BedTool(line_batch,
                                                 from_string=True).saveas().fn
                coord_string = " ".join(coord_batch)
                cmd = (
                    "{samtools} view -b {in_bam} {coord_string} | "
                    "{bedtools} coverage -sorted -d -a {region_file} -b - | "
                    "awk {awk_string} >> {tx_out_file}")
                do.run(cmd.format(samtools=samtools, in_bam=in_bam, coord_string=coord_string,
                                  bedtools=bedtools, region_file=region_file,
                                  awk_string=awk_string, tx_out_file=tx_out_file),
                       "Calculating priority coverage for %s" % sample)
        data['priority_coverage'] = os.path.abspath(out_file)
    return data
Example #39
def process(args):
    """Run the function in args.name given arguments in args.argfile.

    args -- parsed arguments providing ``name``, ``argfile``, ``moreargs``,
      ``raw`` and ``outfile``.

    With ``moreargs`` or ``raw`` the argument file path is passed through
    directly and no output file is written. Otherwise arguments are read from
    the YAML argument file and results are written next to it, or to
    ``cwl.output.json`` for CWL inputs.

    Raises AttributeError when ``args.name`` is not an exposed function.
    """
    # Set environment to standard to use periods for decimals and avoid localization
    os.environ["LC_ALL"] = "C"
    os.environ["LC"] = "C"
    os.environ["LANG"] = "C"
    try:
        fn = getattr(multitasks, args.name)
    except AttributeError:
        raise AttributeError("Did not find exposed function in bcbio.distributed.multitasks named '%s'" % args.name)
    if args.moreargs or args.raw:
        fnargs = [args.argfile] + args.moreargs
        work_dir = None
        argfile = None
    else:
        with open(args.argfile) as in_handle:
            fnargs = yaml.safe_load(in_handle)
        work_dir = os.path.dirname(args.argfile)
        fnargs = config_utils.merge_resources(fnargs)
        argfile = args.outfile if args.outfile else "%s-out%s" % os.path.splitext(args.argfile)
    if not work_dir:
        work_dir = os.getcwd()
    if len(fnargs) > 0 and fnargs[0] == "cwl":
        fnargs, parallel, out_keys = _world_from_cwl(args.name, fnargs[1:], work_dir)
        # Can remove this awkward Docker merge when we do not need custom GATK3 installs
        fnargs = config_utils.merge_resources(fnargs)
        argfile = os.path.join(work_dir, "cwl.output.json")
    else:
        parallel, out_keys = None, {}
    # NOTE(review): the exception handlers were missing from the damaged
    # source; logging then re-raising is an assumption -- confirm.
    with utils.chdir(work_dir):
        with contextlib.closing(log.setup_local_logging(parallel={"wrapper": "runfn"})):
            try:
                out = fn(fnargs)
            except Exception:
                logger.exception("Unexpected error running %s" % args.name)
                raise
    if argfile:
        try:
            _write_out_argfile(argfile, out, fnargs, parallel, out_keys, work_dir)
        except Exception:
            logger.exception("Unexpected error writing outputs for %s" % args.name)
            raise
        if argfile.endswith(".json"):
            _write_wdl_outputs(argfile, out_keys)
Example #40
def install_srna(species, gtf):
    """Install small RNA annotation and species specific miRBase files.

    species -- species identifier used to select miRBase entries with grep.
    gtf -- transcript GTF copied into the small RNA directory if not present.

    Returns the path of the installed ``srna-transcripts.gtf``.
    Raises ImportError when seqcluster is not installed.
    """
    out_file = os.path.join(SRNASEQ_DIR, "srna-transcripts.gtf")
    if not os.path.exists(out_file):
        shutil.copyfile(gtf, out_file)
    try:
        from seqcluster import install
    except ImportError:
        raise ImportError("install seqcluster first, please.")
    with chdir(SRNASEQ_DIR):
        hairpin, miRNA = install._install_mirbase()
        # Backslashes are doubled so the shell still receives '\-\-$' without
        # relying on an invalid Python escape sequence.
        cmd = ("grep -A 2 {species} {hairpin} | grep -v '\\-\\-$' | tr U T  > hairpin.fa")
        do.run(cmd.format(species=species, hairpin=hairpin), "set precursor.")
        cmd = ("grep -A 1 {species} {miRNA} > miRNA.str")
        do.run(cmd.format(species=species, miRNA=miRNA), "set miRNA.")
    return out_file
Example #41
def rmarkdown_draft(filename, template, package):
    """
    create a draft rmarkdown file from an installed template

    filename -- output file; returned untouched when it already exists.
    template -- name of the installed rmarkdown template.
    package -- R package providing the template.

    Returns ``filename``.
    """
    if file_exists(filename):
        return filename
    draft_template = Template(
        'rmarkdown::draft("$filename", template="$template", package="$package", edit=FALSE)'
    )
    draft_string = draft_template.substitute(
        filename=filename, template=template, package=package)
    report_dir = os.path.dirname(filename)
    rcmd = Rscript_cmd()
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", "-e", draft_string], "Creating bcbioRNASeq quality control template.")
        # The backslash is doubled so sed still receives 's/YYYY-MM-DD\///g'
        # without relying on an invalid Python escape sequence.
        do.run(["sed", "-i", "s/YYYY-MM-DD\\///g", filename], "Editing bcbioRNAseq quality control template.")
    return filename
Example #42
def _convert_fastq(srafn, outdir, single=False):
    """convert sra to fastq

    srafn -- input SRA file.
    outdir -- directory receiving the gzipped fastq files.
    single -- single-end samples are not supported and raise ValueError.

    Returns the list of generated fastq files that exist.
    Raises IOError when the first read file is missing after conversion.
    """
    # Reject the unsupported case before doing any work.
    if single:
        raise ValueError("Not supported single-end sra samples for now.")
    cmd = "fastq-dump --split-files --gzip {srafn}"
    sraid = os.path.basename(utils.splitext_plus(srafn)[0])
    out_file = [
        os.path.join(outdir, "%s_1.fastq.gz" % sraid),
        os.path.join(outdir, "%s_2.fastq.gz" % sraid)
    ]
    if not utils.file_exists(out_file[0]):
        with utils.chdir(outdir):
            do.run(cmd.format(srafn=srafn), "Covert to fastq %s" % sraid)
    if not utils.file_exists(out_file[0]):
        raise IOError("SRA %s didn't convert, something happened." % srafn)
    return [out for out in out_file if utils.file_exists(out)]
Example #43
def _bcbio_variation_ensemble(vrn_files, out_file, ref_file, config_file,
                              base_dir, data):
    """Run a variant comparison using the bcbio.variation toolkit, given an input configuration.

    vrn_files -- variant files, each passed through ``_handle_somatic_ensemble``.
    out_file -- ensemble output file handed to the tool.
    ref_file -- reference genome file.
    config_file -- bcbio.variation configuration file.
    base_dir -- directory to run in; a ``tmp`` subdirectory holds Java
      temporary files.
    data -- sample dictionary providing the program configuration.
    """
    vrn_files = [_handle_somatic_ensemble(v, data) for v in vrn_files]
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    bv_jar = config_utils.get_jar(
        "bcbio.variation",
        config_utils.get_program("bcbio_variation", data["config"], "dir"))
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = (["java"] + jvm_opts + java_args +
           ["-jar", bv_jar, "variant-ensemble", config_file, ref_file, out_file] +
           vrn_files)
    with utils.chdir(base_dir):
        do.run(cmd, "Ensemble calling: %s" % os.path.basename(base_dir))
Example #44
def _download_ref(url, ref_dir):
    """Download and unpack a reference file unless it is already present.

    url -- location of a ``.gz`` or ``.tgz`` file.
    ref_dir -- directory receiving the download and the unpacked result.

    Returns the expected unpacked path: the download name without its
    extension, inside ``ref_dir``.
    Raises AssertionError for URLs with an unsupported extension.
    """
    #Lifted from Brad Chapman
    dl_file = os.path.basename(url)
    ref_file = None
    extract_cmd = None
    for supported_ext, ext_cmd in [(".gz", "gunzip"),
                                   (".tgz", ("tar", "zxvf"))]:
        if dl_file.endswith(supported_ext):
            ref_file = os.path.join(ref_dir, dl_file[:-len(supported_ext)])
            extract_cmd = ext_cmd
            # Stop at the match so the extraction command belongs to this extension.
            break
    assert ref_file is not None, url
    if not os.path.exists(ref_file):
        with utils.chdir(ref_dir):
            subprocess.check_call(["wget", url])
            # The command is either a single program name or a tuple of arguments.
            if isinstance(extract_cmd, str):
                cl = [extract_cmd, dl_file]
            else:
                cl = list(extract_cmd) + [dl_file]
            subprocess.check_call(cl)
    return ref_file
Example #45
def _run_funnel(args):
    """Run funnel TES server with rabix bunny for CWL.

    Copies the rabix configuration into the work directory with the backend
    switched to TES, starts a local funnel server, runs the workflow and
    stops the server afterwards.

    args -- parsed arguments providing ``directory`` and ``no_container``.
    """
    host = "localhost"
    port = "8088"
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "funnel_work"))
    log_file = os.path.join(work_dir, "%s-funnel.log" % project_name)
    # Create bunny configuration directory with TES backend
    orig_config_dir = os.path.join(
        os.path.dirname(os.path.realpath(utils.which("rabix"))), "config")
    # NOTE(review): the directory name, the "--Worker.WorkDir" flag and the
    # server shutdown were missing from the damaged source and are restored
    # as assumptions -- confirm.
    work_config_dir = utils.safe_makedir(os.path.join(work_dir,
                                                      "rabix_config"))
    for fname in os.listdir(orig_config_dir):
        if fname == "core.properties":
            with open(os.path.join(orig_config_dir, fname)) as in_handle:
                with open(os.path.join(work_config_dir, fname),
                          "w") as out_handle:
                    for line in in_handle:
                        if line.startswith("backend.embedded.types"):
                            line = "backend.embedded.types=TES\n"
                        out_handle.write(line)
        else:
            shutil.copy(os.path.join(orig_config_dir, fname),
                        os.path.join(work_config_dir, fname))
    flags = [
        "-c", work_config_dir,
        "-tes-url=http://%s:%s" % (host, port),
        "-tes-storage=%s" % work_dir
    ]
    if args.no_container:
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    funnelp = subprocess.Popen([
        "funnel", "server", "run", "--Server.HostName", host,
        "--Server.HTTPPort", port, "--LocalStorage.AllowedDirs", work_dir,
        "--Worker.WorkDir",
        os.path.join(work_dir, "funnel-work")
    ])
    try:
        with utils.chdir(work_dir):
            _run_tool(cmd, not args.no_container, work_dir, log_file)
    finally:
        # Always stop the background server, including after a failed run.
        funnelp.kill()
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;

    bam_file -- aligned BAM file to calculate coverage for.
    data -- sample dictionary.

    Returns a tuple of the depth BED file, the callable BED file, the value
    of ``_extract_highdepth`` for the callable file and the average coverage
    over the variant regions.
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    # Without target regions, cover every non-alt contig in full.
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                # NOTE(review): the step writing the window file was missing
                # from the damaged source; splitting the regions into
                # parallel_window_size windows is an assumption -- confirm.
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                tx_prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", tx_prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # Only the depth file is moved by the transaction itself.
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
Example #47
def _run_bunny(args):
    """Run CWL with rabix bunny.

    The most recently modified subdirectory of the work directory, if any, is
    passed to rabix as the cache directory.

    args -- parsed arguments providing ``directory`` and ``no_container``.
    """
    main_file, json_file, project_name = _get_main_and_json(args.directory)
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "bunny_work"))
    flags = ["-b", work_dir]
    log_file = os.path.join(work_dir, "%s-bunny.log" % project_name)
    if os.path.exists(work_dir):
        candidates = (os.path.join(work_dir, d) for d in os.listdir(work_dir))
        caches = [c for c in candidates if os.path.isdir(c)]
        if caches:
            flags += ["--cache-dir", max(caches, key=os.path.getmtime)]
    if args.no_container:
        flags += ["--no-container"]
    cmd = ["rabix"] + flags + [main_file, json_file]
    with utils.chdir(work_dir):
        _run_tool(cmd, not args.no_container, work_dir, log_file)
Example #48
def install_srna(species, gtf):
    """Install small RNA annotation and species specific miRBase files.

    species -- species identifier used to select miRBase entries.
    gtf -- optional transcript GTF, copied into the small RNA directory when
      given and not already present.

    Returns the path of ``srna-transcripts.gtf`` in the small RNA directory.
    Raises ImportError when seqcluster is not installed.
    """
    out_file = os.path.join(SRNASEQ_DIR, "srna-transcripts.gtf")
    if gtf and not file_exists(out_file):
        shutil.copyfile(gtf, out_file)
    try:
        from seqcluster import install
    except ImportError:
        raise ImportError("install seqcluster first, please.")
    with chdir(SRNASEQ_DIR):
        hairpin, miRNA = install._install_mirbase()
        # Keep the records whose header starts with the species, with U converted to T.
        cmd = ("cat %s |  awk '{if ($0~/>%s/){name=$0; print name} else if ($0~/^>/){name=0};if (name!=0 && $0!~/^>/){print $0;}}' | sed 's/U/T/g'  > hairpin.fa")
        do.run(cmd % (hairpin, species), "set precursor.")
        cmd = ("grep -A 1 {species} {miRNA} > miRNA.str")
        do.run(cmd.format(species=species, miRNA=miRNA), "set miRNA.")
    return out_file
Example #49
def prep_vep_cache(dbkey, ref_file, tooldir=None, config=None):
    """Ensure correct installation of VEP cache file.

    dbkey -- genome build name, used to find ``<dbkey>-resources.yaml`` next
      to the reference file.
    ref_file -- reference genome file.
    tooldir -- optional installation directory holding ``bin/vep_install``.
    config -- accepted for interface compatibility; not read here.

    Returns ``(vep_dir, species)`` when an ensembl alias is configured, or
    ``(None, None)`` when there is no resource file or no alias.
    Raises ValueError when the alias does not contain ``_vep_``.
    """
    if config is None: config = {}
    resource_file = os.path.join(os.path.dirname(ref_file), "%s-resources.yaml" % dbkey)
    if os.path.exists(resource_file):
        with open(resource_file) as in_handle:
            # safe_load: resource files are plain data and need no object construction.
            resources = yaml.safe_load(in_handle)
        ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
        symlink_dir = _special_dbkey_maps(dbkey, ref_file)
        if ensembl_name and ensembl_name.find("_vep_") == -1:
            raise ValueError("%s has ensembl an incorrect value. "
                             "It should have _vep_ in the name. "
                             "Remove line or fix the name to avoid error." % resource_file)
        if symlink_dir and ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            return symlink_dir, species
        elif ensembl_name:
            species, vepv = ensembl_name.split("_vep_")
            vep_dir = utils.safe_makedir(os.path.normpath(os.path.join(
                os.path.dirname(os.path.dirname(ref_file)), "vep")))
            out_dir = os.path.join(vep_dir, species, vepv)
            if not os.path.exists(out_dir):
                tmp_dir = utils.safe_makedir(os.path.join(vep_dir, species, "txtmp"))
                eversion = vepv.split("_")[0]
                url = "ftp://ftp.ensembl.org/pub/release-%s/variation/VEP/%s.tar.gz" % (eversion, ensembl_name)
                with utils.chdir(tmp_dir):
                    subprocess.check_call(["wget", "--no-check-certificate", "-c", url])
                vep_path = "%s/bin/" % tooldir if tooldir else ""
                perl_exports = utils.get_perl_exports()
                cmd = ["%svep_install" % vep_path, "-a", "c", "-s", ensembl_name,
                       "-c", vep_dir, "-u", tmp_dir, "--NO_UPDATE", "--VERSION", eversion]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)), "Prepare VEP directory for %s" % ensembl_name)
                cmd = ["%svep_convert_cache" % vep_path, "--species", species, "--version", vepv,
                       "--dir", vep_dir, "--force_overwrite", "--remove"]
                do.run("%s && %s" % (perl_exports, " ".join(cmd)), "Convert VEP cache to tabix %s" % ensembl_name)
                for tmp_fname in os.listdir(tmp_dir):
                    os.remove(os.path.join(tmp_dir, tmp_fname))
                os.rmdir(tmp_dir)
            # NOTE(review): the cleanup statements were missing from the
            # damaged source; removing the emptied download directory above
            # and the leftover "tmp" directory below is an assumption.
            tmp_dir = os.path.join(vep_dir, "tmp")
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            return vep_dir, species
    return None, None
Example #50
# Run kraken on the first input file of a sample and return parsed metrics.
# Returns {"kraken_report": "null"} when the database is missing or the input
# is a BAM file; otherwise returns whatever _parse_kraken_output produces.
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed, the logger.info call, the "minikraken" os.path.join and the `cl`
# command string stop mid-expression, and `if os.path.exists(out_dir):` has no
# body. Restore the missing lines before relying on this code.
def run(_, data, out_dir):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (dd.get_sample_name(data), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
    # ratio = bam.get_aligned_reads(bam_file, data)
    out = out_stats = None
    # Kraken database setting, read from config -> algorithm -> kraken.
    db = tz.get_in(["config", "algorithm", "kraken"], data)
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    # "minikraken" is a shorthand that is resolved to a path below
    # install._get_data_dir()/genomes/kraken.
    if db == "minikraken":
        db = os.path.join(install._get_data_dir(), "genomes", "kraken",

    # Without a database on disk there is nothing to run.
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    # Only run kraken when out_dir does not already contain "kraken_out".
    if not os.path.exists(os.path.join(out_dir, "kraken_out")):
        work_dir = os.path.dirname(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        # Use the first original input file when save_diskspace is set,
        # otherwise the first current file.
        fn_file = data["files_orig"][0] if dd.get_save_diskspace(
            data) else data["files"][0]
        # The command below feeds kraken with --fastq-input, so BAM is skipped.
        if fn_file.endswith("bam"):
            logger.info("kraken: need fastq files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                # Stream the input through zcat when it is gzipped.
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                # NOTE(review): the template below contains {placeholders} but
                # no formatting step is visible here -- confirm against upstream.
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--output {out} --fastq-input /dev/stdin  2> {out_stats}"
                do.run(cl, "kraken: %s" % dd.get_sample_name(data))
                if os.path.exists(out_dir):
                # NOTE(review): the `if` above has no body as shown; confirm
                # what should happen to an existing out_dir before the move.
                shutil.move(tx_tmp_dir, out_dir)
    metrics = _parse_kraken_output(out_dir, db, data)
    return metrics
Example #51
# Run `smoove call` for a batch of samples and return
# (genotyped VCF path, exclusion BED path).
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed, the `_prepare_smoove_bams(` call stops mid-expression, and the
# `except` clause has no visible `try:` (do.run is indented as if one preceded
# it). Restore the missing lines before relying on this code.
def _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv using smoove.
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    name = "%s%s" % (dd.get_sample_name(items[0]), ext)
    out_file = os.path.join(work_dir, "%s-smoove.genotyped.vcf.gz" % name)
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    # If a file named "<align_bam basename><ext>-prep.vcf.gz" already exists in
    # work_dir, return it instead of running smoove.
    old_out_file = os.path.join(work_dir, "%s%s-prep.vcf.gz"
                                % (os.path.splitext(os.path.basename(items[0]["align_bam"]))[0], ext))
    if utils.file_exists(old_out_file):
        return old_out_file, sv_exclude_bed
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cores = dd.get_num_cores(items[0])
            out_dir = os.path.dirname(tx_out_file)
            ref_file = dd.get_ref_file(items[0])
            full_bams = " ".join(_prepare_smoove_bams(full_bams, sr_bams, disc_bams, items,
            # Contig name patterns always passed to --excludechroms.
            std_excludes = ["~^GL", "~^HLA", "~_random", "~^chrUn", "~alt", "~decoy"]
            # True when a contig name starts or ends with one of std_excludes
            # (compared with the "~" and "^" characters stripped).
            def _is_std_exclude(n):
                clean_excludes = [x.replace("~", "").replace("^", "") for x in std_excludes]
                return any([n.startswith(x) or n.endswith(x) for x in clean_excludes])
            # Additionally exclude contigs that chromhacks.is_nonalt rejects and
            # that std_excludes does not already cover.
            exclude_chrs = [c.name for c in ref.file_contigs(ref_file)
                            if not chromhacks.is_nonalt(c.name) and not _is_std_exclude(c.name)]
            exclude_chrs = "--excludechroms '%s'" % ",".join(std_excludes + exclude_chrs)
            # Only pass --exclude when the exclusion BED file exists.
            exclude_bed = ("--exclude %s" % sv_exclude_bed) if utils.file_exists(sv_exclude_bed) else ""
            tempdir = os.path.dirname(tx_out_file)
            # The template is filled from local variables via locals() below,
            # so the local names must match the {placeholders}.
            cmd = ("export TMPDIR={tempdir} && "
                   "smoove call --processes {cores} --genotype --removepr --fasta {ref_file} "
                   "--name {name} --outdir {out_dir} "
                   "{exclude_bed} {exclude_chrs} {full_bams}")
            with utils.chdir(tempdir):
                    do.run(cmd.format(**locals()), "smoove lumpy calling", items[0])
                except subprocess.CalledProcessError as msg:
                    # For failures accepted by _allowed_errors, write an empty
                    # VCF for all samples.
                    # NOTE(review): handling of other failures is not visible here.
                    if _allowed_errors(str(msg)):
                        vcfutils.write_empty_vcf(tx_out_file, config=items[0]["config"],
                                                samples=[dd.get_sample_name(d) for d in items])
    vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file, sv_exclude_bed
Example #52
# Run kraken on the first input file of a sample, writing below
# <work>/qc/<description>/kraken, and return parsed metrics.
# Returns {"kraken_report": "null"} when the database is missing or the input
# is a BAM file. `ratio` is only referenced from commented-out code here.
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed, the logger.info call and the `cl` command string stop
# mid-expression, and `if os.path.exists(kraken_out):` has no body. Restore the
# missing lines before relying on this code.
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Using only first paired reads.
    # logger.info("Number of aligned reads < than 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" %
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    out = out_stats = None
    # Direct indexing: a config without algorithm -> kraken raises KeyError.
    db = data['config']["algorithm"]["kraken"]
    kraken_cmd = config_utils.get_program("kraken", data["config"])
    # "minikraken" is a shorthand for the database under
    # _get_data_dir()/genomes/kraken/minikraken.
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genomes", "kraken", "minikraken")

    # Without a database on disk there is nothing to run.
    if not os.path.exists(db):
        logger.info("kraken: no database found %s, skipping" % db)
        return {"kraken_report": "null"}

    # Only run kraken when kraken_out does not already contain "kraken_out".
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        fn_file = data["files"][0]
        # NOTE(review): the message says "fasta" but the command below uses
        # --fastq-input; confirm which wording is intended.
        if fn_file.endswith("bam"):
            logger.info("kraken: need fasta files as input")
            return {"kraken_report": "null"}
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                # Stream the input through zcat when it is gzipped.
                cat = "zcat" if fn_file.endswith(".gz") else "cat"
                # NOTE(review): the template below contains {placeholders} but
                # no formatting step is visible here -- confirm against upstream.
                cl = ("{cat} {fn_file} | {kraken_cmd} --db {db} --quick "
                      "--preload --min-hits 2 "
                      "--threads {num_cores} "
                      "--out {out} --fastq-input /dev/stdin  2> {out_stats}"
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                # NOTE(review): the `if` above has no body as shown; confirm
                # what should happen to an existing kraken_out before the move.
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
Example #53
# For every genome build found under galaxy_dir, download and unpack the
# matching snpEff database when it is not present next to the reference.
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed; `if not snpeff_version:`, the `_is_old_database` check and
# `if args.cwl:` have no bodies; the dict passed to effects.get_db, the
# `.format(` call and the wget argument list are cut. Restore the missing
# lines before relying on this code.
def _upgrade_snpeff_data(galaxy_dir, args, remotes):
    """Install or upgrade snpEff databases, localized to reference directory.
    snpeff_version = effects.snpeff_version(args)
    if not snpeff_version:
    for dbkey, ref_file in genome.get_builds(galaxy_dir):
        # Builds without a "<dbkey>-resources.yaml" beside the reference file
        # are ignored.
        resource_file = os.path.join(os.path.dirname(ref_file),
                                     "%s-resources.yaml" % dbkey)
        if os.path.exists(resource_file):
            with open(resource_file) as in_handle:
                resources = yaml.safe_load(in_handle)
            # effects.get_db is handed a minimal data-like dict built from the
            # resources file and the reference path.
            snpeff_db, snpeff_base_dir = effects.get_db({
                "genome_resources": resources,
                "reference": {
                    "fasta": {
                        "base": ref_file
            if snpeff_db:
                snpeff_db_dir = os.path.join(snpeff_base_dir, snpeff_db)
                if os.path.exists(snpeff_db_dir) and _is_old_database(
                        snpeff_db_dir, args):
                # Download and unpack the database when its directory is absent.
                if not os.path.exists(snpeff_db_dir):
                    print("Installing snpEff database %s in %s" %
                          (snpeff_db, snpeff_base_dir))
                    # The URL template uses the snpEff version with "." replaced
                    # by "_".
                    dl_url = remotes["snpeff_dl_url"].format(
                        snpeff_ver=snpeff_version.replace(".", "_"),
                    dl_file = os.path.basename(dl_url)
                    with utils.chdir(snpeff_base_dir):
                            "wget", "--no-check-certificate", "-c", "-O",
                            dl_file, dl_url
                        subprocess.check_call(["unzip", dl_file])
                    # Move data/<snpeff_db> up to snpeff_db_dir, then remove the
                    # "data" directory (os.rmdir requires it to be empty).
                    dl_dir = os.path.join(snpeff_base_dir, "data", snpeff_db)
                    shutil.move(dl_dir, snpeff_db_dir)
                    os.rmdir(os.path.join(snpeff_base_dir, "data"))
                if args.cwl:
Example #54
# Build the analysis command line for a flowcell directory inside the analysis
# directory and return that directory.
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed, the os.path.join and .get( calls stop mid-expression, and
# `if not os.path.exists(analysis_dir):` and `if run_yaml:` have no bodies. No
# statement that actually launches `cl` is visible. Restore the missing lines
# before relying on this code.
def _run_analysis(fc_dir, remote_info, config, config_file):
    """Run local or distributed analysis, wait to finish.
    run_yaml = _get_run_yaml(remote_info, fc_dir, config)
    # Base directory comes from config["analysis"]["base_dir"], defaulting to
    # the current working directory.
    analysis_dir = os.path.join(config["analysis"].get("base_dir", os.getcwd()),
    if not os.path.exists(analysis_dir):
    with utils.chdir(analysis_dir):
        # num_cores == "messaging" selects the distributed process program.
        # NOTE(review): the two `prog` assignments sit at the same level here;
        # presumably an `else:` separated them -- confirm.
        if config["algorithm"]["num_cores"] == "messaging":
            prog = config["analysis"].get("distributed_process_program",
            prog = config["analysis"]["process_program"]
        cl = [prog, config_file, fc_dir]
        if run_yaml:
    return analysis_dir
Example #55
# Produce "<vcf base>-prep.vcf.gz": the input VCF with its ##contig header
# lines replaced by ones generated from the reference, sorted with `vt sort`,
# then passed through bgzip_and_index, whose result is returned.
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed and the `for region in ref.file_contigs(` header stops
# mid-expression. Restore the missing lines before relying on this code.
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    # Skip the work when out_file passes utils.file_uptodate against vcf_file.
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Write one ##contig header line per contig of the reference file.
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data),
                    out_handle.write("##contig=<ID=%s,length=%s>\n" %
                                     (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            # Pipeline: drop existing ##contig lines, add the generated header
            # via bcftools annotate, then sort with vt. The template is filled
            # from local variables via locals() below.
            cmd = (
                "{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
Example #56
# Run the PREPARE_REFERENCE command inside a temporary directory and return
# that directory; returns None when rsem-prepare-reference is not on the PATH.
# NOTE(review): this block looks incomplete as shown -- the three parameter
# description lines below read like docstring text but have no surrounding
# quotes, and the `PREPARE_REFERENCE.format(` call stops mid-expression.
# Restore the missing lines before relying on this code.
def prepare_rsem_reference(gtf, multifasta, build):
    gtf: path to GTF file (must have gene_id and transcript_id)
    multifasta: path to multifasta file
    build: name of organism build (e.g. hg19)
    if not utils.which("rsem-prepare-reference"):
        logger.info("Skipping prepping RSEM reference because "
                    "rsem-prepare-reference could not be found.")
        return None

    command = PREPARE_REFERENCE.format(gtf=gtf,
    # remove=False presumably keeps the temporary directory after the `with`
    # exits, since its path is returned below -- confirm against tx_tmpdir.
    with transaction.tx_tmpdir(remove=False) as rsem_genome_dir:
        with utils.chdir(rsem_genome_dir):
            message = "Preparing rsem reference from %s" % gtf
            do.run(command, message)
    return rsem_genome_dir
Example #57
# Collect RNA-seq alignment graphs and summary metrics for a BAM file and pass
# them to _generate_pdf from within <work>/qc.
# NOTE(review): this block looks incomplete as shown -- the returned dict
# literal has no key for the _generate_pdf(...) value and is never closed.
# Restore the missing lines before relying on this code.
def rnaseq_align_summary(bam_file, sam_ref, sample_name, config, dirs):
    qc_dir = utils.safe_makedir(os.path.join(dirs["work"], "qc"))
    # The genome directory is two levels above the sam_ref path.
    genome_dir = os.path.dirname(os.path.dirname(sam_ref))
    refflat_file = config_utils.get_transcript_refflat(genome_dir)
    rrna_file = config_utils.get_rRNA_interval(genome_dir)
    # Pass the literal string "null" downstream when no rRNA interval file
    # exists.
    if not utils.file_exists(rrna_file):
        rrna_file = "null"
    with utils.curdir_tmpdir() as tmp_dir:
        graphs, summary, overrep = \
                _rnaseq_graphs_and_summary(bam_file, sam_ref, refflat_file, rrna_file,
                                           qc_dir, tmp_dir, config)
    with utils.chdir(qc_dir):
        return {
            _generate_pdf(graphs, summary, overrep, bam_file, sample_name,
                          qc_dir, config),
Example #58
# Run `goleft depth` on a BAM file and return
# (depth BED path, callable BED restricted by _subset_to_variant_regions).
# NOTE(review): this block looks incomplete as shown -- the docstring is never
# closed, and the `params` dict, the `prefix` os.path.join, the `cmd` list, the
# `bam_ref_file` expression and the `_subset_to_variant_regions(` call are all
# cut. Restore the missing lines before relying on this code.
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    params = {
        "window_size": 5000,
        "parallel_window_size": 1e5,
        "min": dd.get_coverage_depth_min(data)
    prefix = os.path.join(
            os.path.join(dd.get_work_dir(data), "align",
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    # Skip goleft when callable_file passes utils.file_uptodate against the BAM.
    if not utils.file_uptodate(callable_file, bam_file):
        # Only params["min"] is used in the visible command (as --mincov).
        cmd = [
            "goleft", "depth", "--q", "1", "--mincov",
            str(params["min"]), "--processes",
            str(dd.get_num_cores(data)), "--ordered"
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                # Derive the callable path and the --prefix value from the
                # transactional depth file so both share one base name.
                tx_callable_file = tx_depth_file.replace(
                    ".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(
                # An index "<bam_ref_file>.fai" is built from the reference and
                # BAM; goleft is pointed at bam_ref_file via --reference.
                bam.fai_from_bam(dd.get_ref_file(data), bam_file,
                                 bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                # Move the callable BED to its final location explicitly.
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions,
    return depth_file, final_callable
Example #59
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify single-cell reads with ``kallisto pseudo --umi``.

    Args:
        fq1: Not used by this function; kept so existing callers keep working.
        kallisto_dir: Directory that receives the ``quant`` output directory;
            also passed to ``kallisto_index`` and ``kallisto_table``.
        gtf_file: Passed through to ``kallisto_index``.
        fasta_file: Passed through to ``kallisto_index``.
        data: Sample dictionary used for the core count, the program
            configuration and the UMI batch file.

    Returns:
        Path of the ``quant`` directory below ``kallisto_dir``.
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    batch_file = umi.convert_to_kallisto(data)
    index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    cmd = ("{kallisto} pseudo --umi "
           "-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
    message = "Quantifying transcripts with Kallisto."
    # kallisto is run from the batch file's directory.
    with chdir(os.path.dirname(batch_file)):
        with file_transaction(data, quant_dir) as tx_out_dir:
            # Name every placeholder explicitly instead of passing locals(),
            # so renaming or removing a local cannot silently break the
            # command line.
            command = cmd.format(kallisto=kallisto,
                                 num_cores=num_cores,
                                 tx_out_dir=tx_out_dir,
                                 batch_file=batch_file,
                                 index=index)
            do.run(command, message, None)
    kallisto_table(kallisto_dir, index)
    return quant_dir
Example #60
def main(org_build):
    """Prepare RNA-seq annotation files for ``org_build`` and upload them.

    The preparation steps run from ``<cwd>/<org_build>/tmpcbl``. A date-stamped
    ``<cwd>/<org_build>/rnaseq-YYYY-MM-DD`` directory is passed to ``cleanup``
    and then to ``upload_to_s3``.

    Args:
        org_build: Key into ``build_info``; also used as the directory name
            below the current working directory.
    """
    work_dir = os.path.join(os.getcwd(), org_build, "tmpcbl")
    out_dir = os.path.join(
        os.getcwd(), org_build,
        "rnaseq-%s" % datetime.datetime.now().strftime("%Y-%m-%d"))
    # NOTE(review): tophat_dir is not used anywhere in the visible code.
    tophat_dir = os.path.join(out_dir, "tophat")
    # NOTE(review): nothing visible here creates work_dir before entering it.
    with chdir(work_dir):
        build = build_info[org_build]
        # Every later step takes the transcript GFF produced here as input.
        tx_gff = prepare_tx_gff(build, org_build)
        # The return value is unused; the call is presumably kept for the file
        # it produces -- confirm.
        mask_gff = prepare_mask_gtf(tx_gff)
        rrna_gtf = prepare_rrna_gtf(tx_gff)
        gtf_to_interval(rrna_gtf, org_build)
        make_miso_events(tx_gff, org_build)
        prepare_tophat_index(tx_gff, org_build)
        cleanup(work_dir, out_dir)
    tar_dirs = [out_dir]
    upload_to_s3(tar_dirs, org_build)