Example #1
def _bowtie_for_innerdist(start, fastq_file, pair_file, ref_file, out_base,
                          out_dir, data, remove_workdir=False):
    work_dir = os.path.join(out_dir, "innerdist_estimate")
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    safe_makedir(work_dir)
    extra_args = ["-s", str(start), "-u", "250000"]
    ref_file, bowtie_runner = _determine_aligner_and_reference(ref_file, data["config"])
    out_sam = bowtie_runner.align(fastq_file, pair_file, ref_file, {"lane": out_base},
                                  work_dir, data, extra_args)
    dists = []
    with closing(pysam.Samfile(out_sam)) as work_sam:
        for read in work_sam:
            if read.is_proper_pair and read.is_read1:
                dists.append(abs(read.isize) - 2 * read.rlen)
    if dists:
        median = float(numpy.median(dists))
        deviations = []
        for d in dists:
            deviations.append(abs(d - median))
        # this is the median absolute deviation estimator of the
        # standard deviation
        mad = 1.4826 * float(numpy.median(deviations))
        return int(median), int(mad)
    else:
        return None, None
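The 1.4826 factor in the comment above scales the median absolute deviation into a consistent estimator of the standard deviation under normality. A minimal, self-contained sketch of the same computation on hypothetical inner distances:

import numpy

dists = [180, 195, 200, 205, 210, 400]  # hypothetical inner distances, one outlier
median = float(numpy.median(dists))
mad = 1.4826 * float(numpy.median([abs(d - median) for d in dists]))
print(int(median), int(mad))  # the outlier barely moves the robust estimates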
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"]
                  else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
Example #3
def organize(dirs, config, run_info_yaml):
    """Organize run information from a passed YAML file or the Galaxy API.

    Creates the high level structure used for subsequent processing.
    """
    logger.info("Using input YAML configuration: %s" % run_info_yaml)
    assert run_info_yaml and os.path.exists(run_info_yaml), \
        "Did not find input sample YAML file: %s" % run_info_yaml
    run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config)
    out = []
    for item in run_details:
        # add algorithm details to configuration, avoid double specification
        item["config"] = config_utils.update_w_custom(config, item)
        item.pop("algorithm", None)
        item["dirs"] = dirs
        if "name" not in item:
            item["name"] = ["", item["description"]]
        elif isinstance(item["name"], basestring):
            description = "%s-%s" % (item["name"], clean_name(item["description"]))
            item["name"] = [item["name"], description]
            item["description"] = description
        item = add_reference_resources(item)
        # Create temporary directories and make absolute
        if utils.get_in(item, ("config", "resources", "tmp", "dir")):
            utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir")))
            item["config"]["resources"]["tmp"] = genome.abs_file_paths(
                utils.get_in(item, ("config", "resources", "tmp")))
        out.append(item)
    return out
def samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file):
    """Deduplicate and sort with samblaster, produces split read and discordant pair files.
    """
    samblaster = config_utils.get_program("samblaster", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    cores, mem = _get_cores_memory(data, downscale=3)
    tmp_prefix = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    for ext in ["spl", "disc", "full"]:
        utils.safe_makedir("%s-%s" % (tmp_prefix, ext))
    if data.get("align_split"):
        full_tobam_cmd = _nosort_tobam_cmd()
    else:
        full_tobam_cmd = ("samtools view -b -u - | "
                          "sambamba sort -t {cores} -m {mem} "
                          "--tmpdir {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    tobam_cmd = ("{samtools} sort -@ {cores} -m {mem} "
                 "-T {tmp_prefix}-{dext} -o {out_file} /dev/stdin")
    # samblaster 0.1.22 and better require the -M flag for compatibility with bwa-mem
    # https://github.com/GregoryFaust/samblaster/releases/tag/v.0.1.22
    if LooseVersion(programs.get_version_manifest("samblaster", data=data, required=True)) >= LooseVersion("0.1.22"):
        opts = "-M"
    else:
        opts = ""
    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, dext="spl", **locals())
    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, dext="disc", **locals())
    dedup_cmd = full_tobam_cmd.format(out_file=tx_out_file, dext="full", **locals())
    cmd = ("{samblaster} {opts} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
           "| {dedup_cmd}")
    return cmd.format(**locals())
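The command returned above relies on bash process substitution (>(...)) to stream split reads and discordant pairs into their own sorting pipelines, so the caller is assumed to execute the string through bash rather than POSIX sh. A tiny standalone sketch of running such a command from Python (the /tmp path is just a throwaway demo file):

import subprocess

cmd = "echo demo | tee >(tr a-z A-Z > /tmp/procsub_demo.txt) > /dev/null"
subprocess.check_call(cmd, shell=True, executable="/bin/bash")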
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.
       Uses only the first reads of each pair.
    """
    logger.info("Proportion of aligned reads is below 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data['config']["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick",
                               "--preload", "--min-hits", "2", "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
Example #6
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)

    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #7
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. "
                   % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
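file_transaction above yields a temporary directory and promotes it to kallisto_dir only when the block completes, so interrupted runs do not leave partial output behind. A rough, hypothetical sketch of that transactional-directory pattern (not the actual bcbio implementation):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def simple_dir_transaction(final_dir):
    tx_dir = tempfile.mkdtemp()  # do all work in a scratch directory
    try:
        yield tx_dir
        if os.path.exists(final_dir):
            shutil.rmtree(final_dir)
        shutil.move(tx_dir, final_dir)  # promote only on success
    except Exception:
        shutil.rmtree(tx_dir, ignore_errors=True)
        raise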
Example #8
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts, takes a bam file,
    converts it to bigwig and then uses that
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
Example #9
def _get_out_dir(in_file, config, out_prefix, prefix):
    if not out_prefix:
        out_dir = os.path.join(_results_dir(config),
                               os.path.basename(in_file),
                               prefix)
    else:
        # derive the output directory from an explicitly supplied prefix
        out_dir = os.path.dirname(out_prefix)
    safe_makedir(out_dir)
    return out_dir
Example #10
def run(in_file, ref, blastn_config, config):
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    ref_file = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    blast_db = prepare_blast_db(ref_file, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))

    results_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(results_dir)

    out_file = os.path.join(results_dir,
                            replace_suffix(os.path.basename(in_file),
                                           ref.get("name") + "hits.tsv"))
    tmp_out = out_file + ".tmp"

    blast_results = blast_search(in_file, blast_db, tmp_out)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." %(0.5*100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" %(filtered_results))
    with open(blast_results) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                writer.writerow(line)

    return out_file
Example #11
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          rg_name=None):
    qual_format = config["algorithm"].get("quality_format", None)
    if qual_format is None or qual_format.lower() == "illumina":
        qual_flags = ["--solexa1.3-quals"]
    else:
        qual_flags = []
    cores = config.get("resources", {}).get("tophat", {}).get("cores", None)
    core_flags = ["-p", str(cores)] if cores else []
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    out_file = os.path.join(out_dir, _out_fnames[0])
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            cl = [config["program"].get("tophat", "tophat")]
            cl += core_flags
            cl += qual_flags
            cl += ["-m", str(config["algorithm"].get("max_errors", 0)),
                   "--output-dir", tx_out_dir,
                   "--no-convert-bam"]
            if pair_file:
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file, ref_file,
                                                        out_base, tx_out_dir, config)
                cl += ["--mate-inner-dist", str(d),
                       "--mate-std-dev", str(d_stdev)]
                files.append(pair_file)
            cl += files
            subprocess.check_call(cl)
    out_file_final = os.path.join(out_dir, "%s.sam" % out_base)
    if not os.path.exists(out_file_final):
        os.symlink(out_file, out_file_final)
    return out_file_final
Example #12
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #13
def _parse_novel(csv_file):
    """Create input of novel miRNAs from miRDeep2"""
    read = 0
    seen = set()
    safe_makedir("novel")
    with open("novel/hairpin.fa", "w") as fa_handle, open("novel/miRNA.str", "w") as str_handle:
        with open(csv_file) as in_handle:
            for line in in_handle:
                if line.startswith("mature miRBase miRNAs detected by miRDeep2"):
                    break
                if line.startswith("novel miRNAs predicted"):
                    read = 1
                    line = in_handle.next()
                    continue
                if read and line.strip():
                    cols = line.strip().split("\t")
                    name, start, score = cols[0], cols[16], cols[1]
                    m5p, m3p, pre = cols[13], cols[14], cols[15].replace('u','t').upper()
                    m5p_start = cols[15].find(m5p) + 1
                    m3p_start = cols[15].find(m3p) + 1
                    m5p_end = m5p_start + len(m5p) - 1
                    m3p_end = m3p_start + len(m3p) - 1
                    if m5p in seen:
                        continue
                    print >>fa_handle, (">new-{name} {start}\n{pre}").format(**locals())
                    print >>str_handle, (">new-{name} ({score}) [new-{name}-5p:{m5p_start}-{m5p_end}] [new-{name}-3p:{m3p_start}-{m3p_end}]").format(**locals())
                    seen.add(m5p)
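The print >>handle statements above are Python 2 syntax; for reference, the Python 3 equivalent writes through the file keyword of print() (the values below are hypothetical):

import sys

name, start, pre = "mir-novel-1", "chr1:100..160", "ACGTACGT"  # hypothetical fields
print(">new-{0} {1}\n{2}".format(name, start, pre), file=sys.stdout)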
Example #14
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
        cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
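Gzipped FASTQ inputs above are streamed to sailfish through <(gzip -cd ...), again relying on a bash shell; a minimal sketch of the extension-based branching, assuming is_gzipped simply checks the suffix:

def fq_arg(fq):
    # plain FASTQ is passed directly; gzipped input is decompressed on the fly
    return fq if not fq.endswith((".gz", ".gzip")) else "<(gzip -cd %s)" % fq

print(fq_arg("sample_1.fastq"))     # sample_1.fastq
print(fq_arg("sample_1.fastq.gz"))  # <(gzip -cd sample_1.fastq.gz)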
Example #15
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example #16
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    safe_makedir(os.path.dirname(out_file))
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_fns = get_variantcallers()
    caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
    if isinstance(data["work_bam"], basestring):
        align_bams = [data["work_bam"]]
        items = [data]
    else:
        align_bams = data["work_bam"]
        items = data["work_items"]
    call_file = "%s-raw%s" % os.path.splitext(out_file)
    call_file = caller_fn(align_bams, items, sam_ref,
                          data["genome_resources"]["variation"],
                          region, call_file)
    if data["config"]["algorithm"].get("phasing", False) == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
    for ext in ["", ".idx"]:
        if not os.path.exists(out_file + ext):
            if os.path.exists(call_file + ext):
                try:
                    os.symlink(call_file + ext, out_file + ext)
                except OSError, msg:
                    if str(msg).find("File exists") == -1:
                        raise
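The call_file construction above leans on os.path.splitext returning a (root, extension) tuple, which fills both %s placeholders at once:

import os.path

print("%s-raw%s" % os.path.splitext("batch1-chr22.vcf"))  # batch1-chr22-raw.vcf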
Example #17
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data,
                                    rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq2_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
Example #18
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.urlopen(url).info().getheader("Last-Modified")
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace("minikraken", "old"))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]), os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print "You have the latest version %s." % last_mod
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." % os.path.join(tooldir, "bin", "kraken"))
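The freshness check above compares the remote Last-Modified header with the modification date of the locally unpacked database; a small sketch of that comparison with python-dateutil, using a made-up header value:

import datetime
import dateutil.parser
import dateutil.tz

last_mod = dateutil.parser.parse("Wed, 08 Oct 2014 16:00:00 GMT")
last_mod = last_mod.astimezone(dateutil.tz.tzutc())
local_mtime = datetime.datetime(2014, 1, 1)  # stand-in for os.path.getmtime(cur_file)
print(last_mod.date() > local_mtime.date())  # True -> download the newer minikraken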
Example #19
def test_combine(self):
    to_combine = self.config["to_combine"]
    out_file = "results/%s/combined_counts.counts" % (STAGENAME)
    safe_makedir(os.path.dirname(out_file))
    df = htseq_count.combine_counts(to_combine, None, out_file=out_file)
    self.assertTrue(os.path.exists(out_file))
    self.assertTrue(os.path.getsize(out_file) > 0)
Example #20
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def prepare_bowtie_index(genome_fasta, bowtie_dir):
    if os.path.exists(bowtie_dir + ".1.bt2"):
        return bowtie_dir
    safe_makedir(bowtie_dir)
    cmd = "bowtie2-build {genome_fasta} {bowtie_dir}"
    subprocess.check_call(cmd.format(**locals()), shell=True)
    return bowtie_dir
Example #22
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data), out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")

    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps, op.join(dd.get_work_dir(data), "mirdeep2", "novel"), data['config'])

    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)

    data = spikein.counts_spikein(data)
    return [[data]]
Example #23
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
Example #24
def trim_srna_sample(data):
    """
    Remove 3' adapter for smallRNA-seq
    Uses cutadapt but with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #25
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def report_summary(samples, run_parallel):
    """
    Run coverage report with bcbiocov package
    """
    work_dir = dd.get_work_dir(samples[0][0])

    parent_dir = utils.safe_makedir(os.path.join(work_dir, "report"))
    qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
    with utils.chdir(parent_dir):

        logger.info("copy qsignature")
        if qsignature_fn:
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        out_dir = utils.safe_makedir("coverage")
        out_dir = utils.safe_makedir("variants")
        samples = run_parallel("coverage_report", samples)

        try:
            import bcbreport.prepare as bcbreport
            bcbreport.report(parent_dir)
        except:
            logger.info("skipping report. No bcbreport installed.")
            pass

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

    return samples
Example #27
def run(items):
    """Perform detection of structural variations with Manta.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    variant_file = shared.annotate_with_depth(variant_file, items)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "break-point-inspector" in dd.get_tools_on(data):
            if paired and paired.normal_bam and paired.tumor_name == dd.get_sample_name(data):
                variant_file = _run_break_point_inspector(data, variant_file, paired, work_dir)
        if "sv" not in data:
            data["sv"] = []
        final_vcf = shared.finalize_sv(variant_file, data, items)
        vc = {"variantcaller": "manta",
              "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
              "vrn_file": final_vcf}
        evidence_bam = _get_evidence_bam(work_dir, data)
        if evidence_bam:
            vc["read_evidence"] = evidence_bam
        data["sv"].append(vc)
        upload_counts[final_vcf] += 1
        out.append(data)
    return out
Example #28
def summarize(calls, data, items):
    """Summarize results from multiple callers into a single flattened BED file.

    Approach:
      - Combine all calls found in all files
      - Filter files retaining those present with multiple levels of support.
      - Remove calls in high depth regions.
      - Remove calls with ends overlapping exclusion regions like low complexity regions.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    with shared.bedtools_tmpdir(data):
        input_beds = filter(lambda xs: xs[1] is not None and utils.file_exists(xs[1]),
                            [(c["variantcaller"], _create_bed(c, sample, work_dir, calls, data)) for c in calls])
    if len(input_beds) > 0:
        out_file = combine_bed_by_size([xs[1] for xs in input_beds], sample, work_dir, data)
        if utils.file_exists(out_file):
            if len(input_beds) > N_FILTER_CALLERS:
                filter_file = _filter_ensemble(out_file, data)
            else:
                filter_file = out_file
            limit_file = shared.remove_highdepth_regions(filter_file, items)
            exclude_files = [f for f in [x.get("exclude_file") for x in calls] if f]
            exclude_file = exclude_files[0] if len(exclude_files) > 0 else None
            if exclude_file:
                noexclude_file, _ = sshared.exclude_by_ends(limit_file, exclude_file, data)
            else:
                noexclude_file = limit_file
            bedprep_dir = utils.safe_makedir(os.path.join(os.path.dirname(noexclude_file), "bedprep"))
            if utils.file_exists(noexclude_file):
                calls.append({"variantcaller": "sv-ensemble",
                              "input_beds": input_beds,
                              "vrn_file": bedutils.clean_file(noexclude_file, data, bedprep_dir=bedprep_dir)})
    return calls
Example #29
def run(bam_file, data, out_dir):
    out = {}
    preseq_cmd = tz.get_in(["config", "algorithm", "preseq"], data)
    if not preseq_cmd:
        return out

    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, "samtools")
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    stats_file = os.path.join(out_dir, "%s.txt" % dd.get_sample_name(data))
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        preseq = config_utils.get_program("preseq", data["config"])
        params = _get_preseq_params(data, preseq_cmd, int(samtools_stats["Total_reads"]))
        param_line = "{options} -step {step} -seg_len {seg_len} "
        if preseq_cmd == "lc_extrap":
            param_line += "-extrap {extrap} "
        param_line = param_line.format(**params)
        with file_transaction(data, stats_file) as tx_out_file:
            cmd = "{preseq} {preseq_cmd} -bam -pe {bam_file} -o {tx_out_file} {param_line}".format(**locals())
            do.run(cmd.format(**locals()), "preseq " + preseq_cmd, data)

    out = _prep_real_counts(bam_file, data, samtools_stats)

    return {"base": stats_file,
            "metrics": out}
Example #30
def variantcall_sample(data, region=None, align_bams=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.
    """
    if out_file is None or not os.path.exists(out_file) or not os.path.lexists(out_file):
        utils.safe_makedir(os.path.dirname(out_file))
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fns = get_variantcallers()
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        if len(align_bams) == 1:
            items = [data]
        else:
            items = multi.get_orig_items(data)
            assert len(items) == len(align_bams)
        assoc_files = tz.get_in(("genome_resources", "variation"), data, {})
        if not assoc_files: assoc_files = {}
        for bam_file in align_bams:
            bam.index(bam_file, data["config"], check_timestamp=False)
        do_phasing = data["config"]["algorithm"].get("phasing", False)
        call_file = "%s-raw%s" % utils.splitext_plus(out_file) if do_phasing else out_file
        call_file = caller_fn(align_bams, items, sam_ref, assoc_files, region, call_file)
        if do_phasing == "gatk":
            call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref, region, config)
            utils.symlink_plus(call_file, out_file)
    if region:
        data["region"] = region
    data["vrn_file"] = out_file
    return [data]
Example #31
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    if isinstance(data, (list, tuple)):
        data = _normalize_cwl_inputs(data)
    toval_data = _get_validate(data)
    toval_data = cwlutils.unpack_tarballs(toval_data, toval_data)
    if toval_data:
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(
            os.path.join(toval_data["dirs"]["work"], "validate", sample,
                         caller))

        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError(
                "Multiple input files for validation: %s" %
                toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(
            toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(
            normalize_input_path(
                toval_data["config"]["algorithm"].get("validate_regions"),
                toval_data), toval_data)
        rm_interval_file = bedutils.clean_file(
            rm_interval_file,
            toval_data,
            prefix="validateregions-",
            bedprep_dir=utils.safe_makedir(os.path.join(base_dir, "bedprep")))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(toval_data),
                                         data.get("genome_build"), base_dir,
                                         data)
        rm_interval_file = (naming.handle_synonyms(
            rm_interval_file, dd.get_ref_file(toval_data),
            data.get("genome_build"), base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data,
                            "rtg")
        # RTG can fail on totally empty files. Call everything in truth set as false negatives
        if not vcfutils.vcf_has_variants(vrn_file):
            eval_files = _setup_call_false(rm_file, rm_interval_file, base_dir,
                                           toval_data, "fn")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        # empty validation file, every call is a false positive
        elif not vcfutils.vcf_has_variants(rm_file):
            eval_files = _setup_call_fps(vrn_file, rm_interval_file, base_dir,
                                         toval_data, "fp")
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod in ["rtg", "rtg-squash-ploidy"]:
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file,
                                       base_dir, toval_data, vmethod)
            eval_files = _annotate_validations(eval_files, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir,
                                                     toval_data)
        elif vmethod == "hap.py":
            data["validate"] = _run_happy_eval(vrn_file, rm_file,
                                               rm_interval_file, base_dir,
                                               toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file,
                                                    rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
Example #32
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural",
                     dd.get_sample_name(data), "purecn"))
Example #33
def tophat_align(fastq_file,
                 pair_file,
                 ref_file,
                 out_base,
                 align_dir,
                 data,
                 names=None):
    """
    run alignment using Tophat v2
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, config)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, runner = _determine_aligner_and_reference(ref_file, config)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError(
            "Tophat versions < 2.0 are not supported, please "
            "download the newest version of Tophat here: "
            "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(
                    fastq_file, pair_file, ref_file, out_base, tx_out_dir,
                    data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [sys.executable, config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd,
                   "Running Tophat on %s and %s." % (fastq_file, pair_file))
    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file,
                           os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out
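Tophat options above are accumulated in a plain dict and only converted to command-line flags right before the call; a minimal illustration of that conversion, where boolean True becomes a bare flag and everything else becomes --key=value:

options = {"no-coverage-search": True, "mate-inner-dist": 200}
flags = []
for k, v in sorted(options.items()):
    flags.append("--%s" % k if v is True else "--%s=%s" % (k, v))
print(" ".join(flags))  # --mate-inner-dist=200 --no-coverage-search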
Example #34
def setup(args):
    template, template_txt = name_to_config(args.template)
    run_info.validate_yaml(template_txt, args.template)
    base_item = template["details"][0]
    project_name, metadata, global_vars, md_file = _pname_and_metadata(
        args.metadata)
    remotes = _retrieve_remote([args.metadata, args.template])
    inputs = args.input_files + remotes.get("inputs",
                                            []) + _find_remote_inputs(metadata)
    remote_retriever = None
    remote_config = None
    if hasattr(args, "systemconfig") and args.systemconfig and hasattr(
            args, "integrations"):
        config, _ = config_utils.load_system_config(args.systemconfig)
        for iname, retriever in args.integrations.items():
            if iname in config:
                remote_retriever = retriever
                remote_config = remote_retriever.set_cache(config[iname])
                inputs += remote_retriever.get_files(metadata, remote_config)
    raw_items = [
        _add_metadata(item, metadata, remotes, args.only_metadata)
        for item in _prep_items_from_base(
            base_item, inputs, args.separators.split(","), args.force_single)
    ]
    items = [x for x in raw_items if x]
    _check_all_metadata_found(metadata, items)
    if remote_retriever and remote_config:
        items = remote_retriever.add_remotes(items, remote_config)
    out_dir = os.path.join(os.getcwd(), project_name)
    work_dir = utils.safe_makedir(os.path.join(out_dir, "work"))
    if hasattr(args, "relpaths") and args.relpaths:
        items = [_convert_to_relpaths(x, work_dir) for x in items]
    out_config_file = _write_template_config(template_txt, project_name,
                                             out_dir)
    if md_file:
        shutil.copyfile(
            md_file, os.path.join(out_dir, "config",
                                  os.path.basename(md_file)))
    items = _copy_to_configdir(items, out_dir)
    if len(items) == 0:
        print()
        print("Template configuration file created at: %s" % out_config_file)
        print(
            "Edit to finalize custom options, then prepare full sample config with:"
        )
        print("  bcbio_nextgen.py -w template %s %s sample1.bam sample2.fq" % \
            (out_config_file, project_name))
    else:
        out_config_file = _write_config_file(items, global_vars, template,
                                             project_name, out_dir, remotes)
        print()
        print("Configuration file created at: %s" % out_config_file)
        print("Edit to finalize and run with:")
        print("  cd %s" % work_dir)
        print("  bcbio_nextgen.py ../config/%s" %
              os.path.basename(out_config_file))
        if remotes.get("base"):
            remote_path = os.path.join(remotes["base"],
                                       os.path.basename(out_config_file))
            s3.upload_file_boto(out_config_file, remote_path)
            print("Also uploaded to AWS S3 in %s" % remotes["base"])
            print("Run directly with bcbio_vm.py run %s" % remote_path)
Example #35
def run(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with large
    files, unless we're running a Standard/smallRNA-seq/QC pipeline.

    Handles fastqc 0.11+, which use a single HTML file and older versions that use
    a directory of files + images. The goal is to eventually move to only 0.11+
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_file = (bam.downsample(bam_file, data, 1e7, work_dir=work_dir)
                   if data.get("analysis", "").lower()
                   not in ["standard", "smallrna-seq"] else None)
        if ds_file is not None:
            bam_file = ds_file
        frmt = "bam" if bam_file.endswith("bam") else "fastq"
        fastqc_name = utils.splitext_plus(os.path.basename(bam_file))[0]
        fastqc_clean_name = dd.get_sample_name(data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with tx_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [
                    config_utils.get_program("fastqc", data["config"]), "-d",
                    tx_tmp_dir, "-t",
                    str(num_cores), "--extract", "-o", tx_tmp_dir, "-f", frmt,
                    bam_file
                ]
                cl = "%s %s" % (utils.local_path_export(), " ".join(
                    [str(x) for x in cl]))
                do.run(cl, "FastQC: %s" % dd.get_sample_name(data))
                tx_fastqc_out = os.path.join(tx_tmp_dir,
                                             "%s_fastqc" % fastqc_name)
                tx_combo_file = os.path.join(tx_tmp_dir,
                                             "%s_fastqc.html" % fastqc_name)
                if not os.path.exists(sentry_file) and os.path.exists(
                        tx_combo_file):
                    utils.safe_makedir(fastqc_out)
                    # Use sample name for reports instead of bam file name
                    with open(os.path.join(tx_fastqc_out, "fastqc_data.txt"), 'r') as fastqc_bam_name, \
                            open(os.path.join(tx_fastqc_out, "_fastqc_data.txt"), 'w') as fastqc_sample_name:
                        for line in fastqc_bam_name:
                            fastqc_sample_name.write(
                                line.replace(os.path.basename(bam_file),
                                             fastqc_clean_name))
                    shutil.move(
                        os.path.join(tx_fastqc_out, "_fastqc_data.txt"),
                        os.path.join(fastqc_out, 'fastqc_data.txt'))
                    shutil.move(tx_combo_file, sentry_file)
                    if os.path.exists("%s.zip" % tx_fastqc_out):
                        shutil.move(
                            "%s.zip" % tx_fastqc_out,
                            os.path.join(fastqc_out,
                                         "%s.zip" % fastqc_clean_name))
                elif not os.path.exists(sentry_file):
                    raise ValueError(
                        "FastQC failed to produce output HTML file: %s" %
                        os.listdir(tx_tmp_dir))
    logger.info("Produced HTML report %s" % sentry_file)
    parser = FastQCParser(fastqc_out, dd.get_sample_name(data))
    stats = parser.get_fastqc_summary()
    parser.save_sections_into_file()
    return stats
Example #36
def test_run(self):
    out_prefix = "results/tests/dss/test_dss"
    safe_makedir(os.path.dirname(out_prefix))
    result = dss.run(self.count_file, self.conds, ("untreat", "treat"), out_prefix=out_prefix)
    self.assertTrue(file_exists(result))
Example #37
def summarize_grading(samples, vkey="validate"):
    """Provide summaries of grading results across all samples.

    Handles both traditional pipelines (validation part of variants) and CWL
    pipelines (validation at top level)
    """
    samples = list(utils.flatten(samples))
    if not _has_grading_info(samples, vkey):
        return [[d] for d in samples]
    validate_dir = utils.safe_makedir(
        os.path.join(samples[0]["dirs"]["work"], vkey))
    header = ["sample", "caller", "variant.type", "category", "value"]
    _summarize_combined(samples, vkey)
    validated, out = _group_validate_samples(
        samples, vkey, (["metadata", "validate_batch"], ["metadata", "batch"
                                                         ], ["description"]))
    for vname, vitems in validated.items():
        out_csv = os.path.join(validate_dir, "grading-summary-%s.csv" % vname)
        with open(out_csv, "w") as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(header)
            plot_data = []
            plot_files = []
            for data in sorted(
                    vitems,
                    key=lambda x: x.get("lane", dd.get_sample_name(x))):
                validations = [
                    variant.get(vkey) for variant in data.get("variants", [])
                    if isinstance(variant, dict)
                ]
                validations = [v for v in validations if v]
                if len(validations) == 0 and vkey in data:
                    validations = [data.get(vkey)]
                for validate in validations:
                    if validate:
                        validate["grading_summary"] = out_csv
                        if validate.get("grading"):
                            for row in _get_validate_plotdata_yaml(
                                    validate["grading"], data):
                                writer.writerow(row)
                                plot_data.append(row)
                        elif validate.get("summary") and not validate.get(
                                "summary") == "None":
                            if isinstance(validate["summary"], (list, tuple)):
                                plot_files.extend(
                                    list(set(validate["summary"])))
                            else:
                                plot_files.append(validate["summary"])
        if plot_files:
            plots = validateplot.classifyplot_from_plotfiles(
                plot_files, out_csv)
        elif plot_data:
            plots = validateplot.create(plot_data, header, 0, data["config"],
                                        os.path.splitext(out_csv)[0])
        else:
            plots = []
        for data in vitems:
            if data.get(vkey):
                data[vkey]["grading_plots"] = plots
            for variant in data.get("variants", []):
                if isinstance(variant, dict) and variant.get(vkey):
                    variant[vkey]["grading_plots"] = plots
            out.append([data])
    return out
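As a quick illustration of the CSV layout written above, here is a minimal, self-contained sketch; the helper name and the rows are hypothetical and not part of bcbio.

import csv

def write_grading_csv(rows, out_csv):
    # One row per (sample, caller, variant type, category) with its value.
    header = ["sample", "caller", "variant.type", "category", "value"]
    with open(out_csv, "w") as out_handle:
        writer = csv.writer(out_handle)
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)

# Hypothetical rows showing the expected shape of the grading summary.
example_rows = [["NA12878", "gatk-haplotype", "snp", "tp", 35000],
                ["NA12878", "gatk-haplotype", "snp", "fp", 120]]
write_grading_csv(example_rows, "grading-summary-example.csv")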
Exemple #38
0
def _run_info_from_yaml(dirs,
                        run_info_yaml,
                        config,
                        sample_names=None,
                        integrations=None):
    """Read run information from a passed YAML file.
    """
    validate_yaml(run_info_yaml, run_info_yaml)
    with open(run_info_yaml) as in_handle:
        loaded = yaml.load(in_handle)
    fc_name, fc_date = None, None
    if dirs.get("flowcell"):
        try:
            fc_name, fc_date = flowcell.parse_dirname(dirs.get("flowcell"))
        except ValueError:
            pass
    global_config = {}
    global_vars = {}
    resources = {}
    integration_config = {}
    if isinstance(loaded, dict):
        global_config = copy.deepcopy(loaded)
        del global_config["details"]
        if "fc_name" in loaded:
            fc_name = loaded["fc_name"].replace(" ", "_")
        if "fc_date" in loaded:
            fc_date = str(loaded["fc_date"]).replace(" ", "_")
        global_vars = global_config.pop("globals", {})
        resources = global_config.pop("resources", {})
        for iname in ["arvados"]:
            integration_config[iname] = global_config.pop(iname, {})
        loaded = loaded["details"]
    if sample_names:
        loaded = [x for x in loaded if x["description"] in sample_names]

    if integrations:
        for iname, retriever in integrations.items():
            if iname in config:
                config[iname] = retriever.set_cache(config[iname])
                loaded = retriever.add_remotes(loaded, config[iname])

    run_details = []
    for i, item in enumerate(loaded):
        item = _normalize_files(item, dirs.get("flowcell"))
        if "lane" not in item:
            item["lane"] = str(i + 1)
        item["lane"] = _clean_characters(str(item["lane"]))
        if "description" not in item:
            if _item_is_bam(item):
                item["description"] = get_sample_name(item["files"][0])
            else:
                raise ValueError(
                    "No `description` sample name provided for input #%s" %
                    (i + 1))
        item["description"] = _clean_characters(str(item["description"]))
        if "upload" not in item:
            upload = global_config.get("upload", {})
            # Handle specifying a local directory directly in upload
            if isinstance(upload, basestring):
                upload = {"dir": upload}
            if fc_name:
                upload["fc_name"] = fc_name
            if fc_date:
                upload["fc_date"] = fc_date
            upload["run_id"] = ""
            if upload.get("dir"):
                upload["dir"] = _file_to_abs(upload["dir"], [dirs.get("work")],
                                             makedir=True)
            item["upload"] = upload
        item["algorithm"] = _replace_global_vars(item["algorithm"],
                                                 global_vars)
        item["algorithm"] = genome.abs_file_paths(
            item["algorithm"],
            ignore_keys=ALGORITHM_NOPATH_KEYS,
            fileonly_keys=ALGORITHM_FILEONLY_KEYS,
            do_download=all(not x for x in integrations.values()))
        item["genome_build"] = str(item.get("genome_build", ""))
        item["algorithm"] = _add_algorithm_defaults(item["algorithm"])
        item["metadata"] = add_metadata_defaults(item.get("metadata", {}))
        item["rgnames"] = prep_rg_names(item, config, fc_name, fc_date)
        if item.get("files"):
            item["files"] = [
                genome.abs_file_paths(
                    f, do_download=all(not x for x in integrations.values()))
                for f in item["files"]
            ]
        elif "files" in item:
            del item["files"]
        if item.get("vrn_file") and isinstance(item["vrn_file"], basestring):
            inputs_dir = utils.safe_makedir(
                os.path.join(dirs.get("work", os.getcwd()), "inputs",
                             item["description"]))
            item["vrn_file"] = genome.abs_file_paths(
                item["vrn_file"],
                do_download=all(not x for x in integrations.values()))
            if os.path.isfile(item["vrn_file"]):
                item["vrn_file"] = vcfutils.bgzip_and_index(item["vrn_file"],
                                                            config,
                                                            remove_orig=False,
                                                            out_dir=inputs_dir)
            if not tz.get_in(("metadata", "batch"), item):
                raise ValueError(
                    "%s: Please specify a metadata batch for variant file (vrn_file) input.\n"
                    % (item["description"]) +
                    "Batching with a standard sample provides callable regions for validation."
                )
        item = _clean_metadata(item)
        item = _clean_algorithm(item)
        # Add any global resource specifications
        if "resources" not in item:
            item["resources"] = {}
        for prog, pkvs in resources.items():
            if prog not in item["resources"]:
                item["resources"][prog] = {}
            if pkvs is not None:
                for key, val in pkvs.items():
                    item["resources"][prog][key] = val
        for iname, ivals in integration_config.items():
            if ivals:
                if iname not in item:
                    item[iname] = {}
                for k, v in ivals.items():
                    item[iname][k] = v

        run_details.append(item)
    _check_sample_config(run_details, run_info_yaml, config)
    return run_details
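For orientation, a hedged sketch of the input shape _run_info_from_yaml expects: a top-level dict with a "details" list plus optional global keys (fc_name, fc_date, upload, globals, resources) that are split out before per-item processing. The YAML below is illustrative, not taken from the source.

import yaml

example_yaml = """
fc_name: test_flowcell
fc_date: "2024-01-01"
upload:
  dir: ../final
details:
  - description: Sample1
    files: [Sample1_R1.fastq.gz, Sample1_R2.fastq.gz]
    analysis: variant2
    genome_build: GRCh37
    algorithm:
      aligner: bwa
    metadata:
      batch: batch1
"""
loaded = yaml.safe_load(example_yaml)
# The function above separates "details" from the global configuration keys.
assert "details" in loaded and loaded["details"][0]["description"] == "Sample1"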
Exemple #39
0
def work_dir(data):
    return utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage",
                     dd.get_sample_name(data), "sambamba"))
Exemple #40
0
                        default=[],
                        action="append")
    parser.add_argument("-q", "--queue", help="Queue to submit jobs to.")
    parser.add_argument("-p",
                        "--tag",
                        help="Tag name to label jobs on the cluster",
                        default="bcb-prep")
    parser.add_argument("-t",
                        "--paralleltype",
                        choices=["local", "ipython"],
                        default="local",
                        help="Run with iptyhon")

    args = parser.parse_args()
    out_dir = os.path.abspath(args.out)
    utils.safe_makedir(out_dir)
    try:
        system_config = os.path.join(_get_data_dir(), "galaxy",
                                     "bcbio_system.yaml")
    except ValueError as err:
        print(err)
        print(
            "WARNING: Attempting to read bcbio_system.yaml in the current directory."
        )
        system_config = "bcbio_system.yaml"

    if utils.file_exists(system_config):
        with open(system_config) as in_handle:
            config = yaml.load(in_handle)
    else:
        print("WARNING: bcbio_system.yaml not found, creating own resources.")
Exemple #41
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, contamination, coverage, damage, fastqc,
                          kraken, peddy, qsignature, qualimap, samtools, picard,
                          srna, umi, variant, viral, preseq, chipseq)
    tools = {
        "fastqc": fastqc.run,
        "atropos": atropos.run,
        "small-rna": srna.run,
        "samtools": samtools.run,
        "qualimap": qualimap.run,
        "qualimap_rnaseq": qualimap.run_rnaseq,
        "qsignature": qsignature.run,
        "contamination": contamination.run,
        "coverage": coverage.run,
        "damage": damage.run,
        "variants": variant.run,
        "peddy": peddy.run_qc,
        "kraken": kraken.run,
        "picard": picard.run,
        "umi": umi.run,
        "viral": viral.run,
        "preseq": preseq.run,
        "chipqc": chipseq.run
    }
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemple #42
0
def chipseq_count(data):
    """
    count reads mapping to ChIP/ATAC consensus peaks with featureCounts
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            in_bam = tz.get_in(("atac", "align", "NF"), data)
        else:
            in_bam = tz.get_in(("atac", "align", "full"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    if not consensus_file:
        return [[data]]
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
            else:
                data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts"), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        else:
            data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts"), count_file)
    return [[data]]
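featureCounts is invoked here with -F SAF, so the consensus peaks must be in SAF format (GeneID, Chr, Start, End, Strand with 1-based coordinates). A minimal sketch of that conversion from a 3-column BED; the helper is hypothetical, not the bcbio implementation.

def bed_to_saf(bed_file, saf_file):
    # SAF uses 1-based, inclusive coordinates; BED starts are 0-based.
    with open(bed_file) as in_handle, open(saf_file, "w") as out_handle:
        out_handle.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(in_handle):
            chrom, start, end = line.split()[:3]
            out_handle.write("peak_%d\t%s\t%d\t%s\t.\n"
                             % (i + 1, chrom, int(start) + 1, end))
    return saf_file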
Exemple #43
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                if vcinfo["vrn_file"] and vcfutils.vcf_has_nonfiltered_variants(
                        vcinfo["vrn_file"]):
                    vcf_file = vcinfo["vrn_file"]
                    break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not vcfanno.is_human(data):
        if not peddy:
            reason = "peddy executable not found"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            cmd = (
                "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0
                             and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and
                             l.find("must be between 1 and n_features=") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
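peddy pairs the VCF with a PED file describing the family structure. A hedged sketch of the 6-column PED layout that create_ped_file is assumed to produce; the helper and values below are illustrative only.

def write_minimal_ped(rows, ped_file):
    # Columns: family, sample, father, mother, sex (1=male, 2=female, 0=unknown),
    # phenotype (-9 or 0 = missing); "0" marks unknown parents.
    with open(ped_file, "w") as out_handle:
        for row in rows:
            out_handle.write("\t".join(str(x) for x in row) + "\n")

write_minimal_ped([("fam1", "NA12878", "0", "0", 2, -9)], "batch1.ped")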
Exemple #44
0
def _get_program_file(dirs):
    if dirs.get("work"):
        base_dir = utils.safe_makedir(os.path.join(dirs["work"], "provenance"))
        return os.path.join(base_dir, "programs.txt")
Exemple #45
0
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "structural",
                     dd.get_sample_name(data), "cnvkit"))
Exemple #46
0
def _align(data, fastq_file, args):
    work_dir = os.path.join("align")
    work_dir = os.path.abspath(safe_makedir(work_dir))
    out_prefix = os.path.join(work_dir, "seqs_")
    bam_file = star_align(data, args, fastq_file, out_prefix, 1000)
    return bam_file
Exemple #47
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " %
           " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += (" --chimSegmentMin 15 --chimJunctionOverhangMin 15 "
                "--chimOutType WithinSAM ")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
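The command above streams STAR's SAM output (--outStd SAM) straight into a sorted BAM via postalign.sam_to_sortbam_cl. A standalone sketch of that piping pattern using plain samtools, assumed roughly equivalent rather than the actual bcbio helper.

import subprocess

def star_to_sorted_bam(star_cmd, out_bam, cores=1):
    # STAR writes SAM to stdout; samtools sorts it directly into the final BAM,
    # avoiding an intermediate SAM file on disk.
    sort_cmd = "samtools sort -@ %d -o %s -" % (cores, out_bam)
    subprocess.check_call("%s | %s" % (star_cmd, sort_cmd), shell=True)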
Exemple #48
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches and offers a
    hook for adding additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                 backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                       tz.get_in(["depth", "bins", "antitarget"], data),
                                       back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)

    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
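One detail worth noting about the grouping step above: itertools.groupby only merges adjacent items, so the inputs are assumed to arrive ordered by their bin group. A minimal sketch of grouping that sorts first; this is an illustration of the idiom, not bcbio code.

import itertools

def group_by_key(items, keyfn):
    # Sort so equal keys become adjacent, then group; comparing string forms
    # keeps None keys sortable alongside string group ids.
    ordered = sorted(items, key=lambda x: str(keyfn(x)))
    return {key: list(group) for key, group in itertools.groupby(ordered, key=keyfn)}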
Exemple #49
0
def setUp(self):
    self.out_dir = "test_picard"
    safe_makedir(self.out_dir)
Exemple #50
0
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input),
                                                 cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base, "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                        zip(["evaluate"] * len(inputs), inputs)
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"],
                                       inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_general_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add, [
                _get_original_coverage(cdata, itype)
                for itype, cdata in samples_to_run
            ])
        # Metrics are not currently calculated due to speed and the need for re-evaluation.
        # They could be re-enabled with larger truth sets to evaluate background noise,
        # but we want to reimplement this more generally as part of normalization.
        if False:
            coverage_cnns = reduce(operator.add, [
                _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                inputs + backgrounds)
                for cnns in tz.groupby("bam", raw_coverage_cnns).values()
            ])
            background_cnn = cnvkit_background(
                _select_background_cnns(coverage_cnns), background_cnn, inputs,
                target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([
                x["file"] for x in coverage_cnns if x["itype"] == "background"
            ], background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby(
                "bam", [x for x in coverage_cnns
                        if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        [
            _cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds)
            for cnr, data in fixed_cnrs
        ]
    return ckouts
Exemple #51
0
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info(
            "peddy is not installed, the sample is not human, or sample VCFs don't match; "
            "skipping correspondence checking for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if to_show[-1].find("IndexError") >= 0 and to_show[-1].find(
                    "is out of bounds for axis") >= 0:
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s"
                    % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping"
                    )
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Exemple #52
0
def main(config_file, view):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files, [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files
        logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config, *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
                            annotate_args = zip(*product(RPKM_count_fixed,
                                         ["gene_id"],
                                         ["ensembl_gene_id"],
                                         ["human"]))
            view.map(annotate.annotate_table_with_biomart,
                     *annotate_args)
                     """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs
Exemple #53
0
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(
        backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(
            os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input),
                                   cur_raw_work_dir)
        ckouts.append({
            "cnr": "%s.cnr" % out_base,
            "cns": "%s.cns" % out_base,
            "back_cnn": background_cnn
        })
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(
            cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {
            "type": "local",
            "cores": dd.get_cores(inputs[0]),
            "progs": ["cnvkit"]
        }
        pct_coverage = (
            pybedtools.BedTool(raw_target_bed).total_coverage() /
            float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed,
                                                     access_bed, cov_interval,
                                                     pct_coverage,
                                                     raw_work_dir, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        raw_coverage_cnns = [
            _cnvkit_coverage(cdata, bed, itype)
            for itype, cdata in samples_to_run
            for bed in [target_bed, antitarget_bed]
        ]
        coverage_cnns = reduce(operator.add, [
            _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                            inputs + backgrounds)
            for cnns in tz.groupby("bam", raw_coverage_cnns).values()
        ])
        background_cnn = _cnvkit_background(
            _select_background_cnns(coverage_cnns), background_cnn, target_bed,
            antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(
            _cnvkit_fix,
            [(cnns, background_cnn, inputs + backgrounds)
             for cnns in tz.groupby(
                 "bam", [x for x in coverage_cnns
                         if x["itype"] == "evaluate"]).values()],
            inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs]
    return ckouts
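Both CNVkit variants above rely on the same flattening idiom: reduce(operator.add, list_of_lists) concatenates per-sample coverage records into one list. A minimal sketch with an empty default so it also handles zero inputs; Python 3 needs the functools import.

import operator
from functools import reduce

def flatten_lists(list_of_lists):
    # [[a, b], [c]] -> [a, b, c]; the [] default avoids a TypeError on empty input.
    return reduce(operator.add, list_of_lists, [])

assert flatten_lists([[1, 2], [3]]) == [1, 2, 3]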
Exemple #54
0
def prep_cwl(samples,
             workflow_fn,
             out_dir,
             out_file,
             integrations=None,
             add_container_tag=None):
    """Output a CWL description with sub-workflows and steps.
    """
    if add_container_tag is None:
        container_tags = None
    elif add_container_tag.lower() == "quay_lookup":
        container_tags = {}
    else:
        container_tags = collections.defaultdict(lambda: add_container_tag)
    step_dir = utils.safe_makedir(os.path.join(out_dir, "steps"))
    get_retriever = GetRetriever(integrations, samples)
    variables, keyvals = _flatten_samples(samples, out_file, get_retriever)
    cur_remotes = _get_cur_remotes(keyvals)
    file_estimates = _calc_input_estimates(keyvals, get_retriever)
    out = _cwl_workflow_template(variables)
    parent_wfs = []
    step_parallelism = {}
    steps, wfoutputs = workflow_fn(samples)
    used_inputs = set([])
    for cur in workflow.generate(variables, steps, wfoutputs):
        if cur[0] == "step":
            _, name, parallel, inputs, outputs, image, programs, disk, cores, no_files = cur
            step_file = _write_tool(step_dir, name, inputs, outputs, parallel,
                                    image, programs, file_estimates, disk,
                                    cores, samples, cur_remotes, no_files,
                                    container_tags)
            out["steps"].append(
                _step_template(name, step_file, inputs, outputs, parallel,
                               step_parallelism))
            used_inputs |= set(x["id"] for x in inputs)
        elif cur[0] == "expressiontool":
            _, name, inputs, outputs, expression, parallel = cur
            step_file = _write_expressiontool(step_dir, name, inputs, outputs,
                                              expression, parallel)
            out["steps"].append(
                _step_template(name, step_file, inputs, outputs, parallel,
                               step_parallelism))
            used_inputs |= set(x["id"] for x in inputs)
        elif cur[0] == "upload":
            for output in cur[1]:
                wf_output = copy.deepcopy(output)
                if "outputSource" not in wf_output:
                    wf_output["outputSource"] = wf_output.pop("source")
                wf_output = _clean_record(wf_output)
                out["outputs"].append(wf_output)
        elif cur[0] == "wf_start":
            parent_wfs.append(out)
            out = _cwl_workflow_template(cur[1])
        elif cur[0] == "wf_finish":
            _, name, parallel, inputs, outputs, scatter = cur
            wf_out_file = "wf-%s.cwl" % name
            with open(os.path.join(out_dir, wf_out_file), "w") as out_handle:
                yaml.safe_dump(out,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
            out = parent_wfs.pop(-1)
            out["steps"].append(
                _step_template(name, wf_out_file, inputs, outputs, parallel,
                               step_parallelism, scatter))
            used_inputs |= set(x["id"] for x in inputs)
        else:
            raise ValueError("Unexpected workflow value %s" % str(cur))
        step_parallelism[name] = parallel

    with open(out_file, "w") as out_handle:
        out["inputs"] = [x for x in out["inputs"] if x["id"] in used_inputs]
        yaml.safe_dump(out,
                       out_handle,
                       default_flow_style=False,
                       allow_unicode=False)
    sample_json = "%s-samples.json" % utils.splitext_plus(out_file)[0]
    out_clean = _clean_final_outputs(
        copy.deepcopy({k: v
                       for k, v in keyvals.items()
                       if k in used_inputs}), get_retriever)
    with open(sample_json, "w") as out_handle:
        json.dump(out_clean,
                  out_handle,
                  sort_keys=True,
                  indent=4,
                  separators=(',', ': '))
    return out_file, sample_json
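Each sub-workflow above is serialized with yaml.safe_dump. A hedged sketch of that step with an illustrative, minimal CWL-shaped dict; this is not the real _cwl_workflow_template output.

import yaml

example_wf = {"class": "Workflow", "cwlVersion": "v1.0",
              "inputs": [], "outputs": [], "steps": []}
with open("wf-example.cwl", "w") as out_handle:
    # Same dump options as above: block style, ASCII-only output.
    yaml.safe_dump(example_wf, out_handle, default_flow_style=False, allow_unicode=False)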
Exemple #55
0
def run_mosdepth(data,
                 target_name,
                 bed_file,
                 per_base=False,
                 quantize=None,
                 thresholds=None):
    """Run mosdepth generating distribution, region depth and per-base depth.
    """
    MosdepthCov = collections.namedtuple(
        "MosdepthCov",
        ("dist", "per_base", "regions", "quantize", "thresholds"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage",
                     dd.get_sample_name(data)))
    prefix = os.path.join(work_dir,
                          "%s-%s" % (dd.get_sample_name(data), target_name))
    old_dist_file = "%s.mosdepth.dist.txt" % (prefix)
    out = MosdepthCov(
        (old_dist_file if utils.file_uptodate(
            old_dist_file, bam_file) else "%s.mosdepth.%s.dist.txt" %
         (prefix, "region" if bed_file else "global")),
        ("%s.per-base.bed.gz" % prefix) if per_base else None,
        ("%s.regions.bed.gz" % prefix) if bed_file else None,
        ("%s.quantized.bed.gz" % prefix) if quantize else None,
        ("%s.thresholds.bed.gz" % prefix) if thresholds else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_prefix = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join([
                    "export MOSDEPTH_Q%s=%s" % (i, x)
                    for (i, x) in enumerate(quantize[1])
                ])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""

            thresholds_cmdl = (
                "-T " +
                ",".join([str(t)
                          for t in thresholds])) if out.thresholds else ""
            cmd = (
                "{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} "
                "{tx_prefix} {bam_file} {thresholds_cmdl}")
            message = "Calculating coverage: %s %s" % (
                dd.get_sample_name(data), target_name)
            do.run(cmd.format(**locals()), message.format(**locals()))
            if out.per_base:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.per_base)), out.per_base)
            if out.regions:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.regions)), out.regions)
            if out.quantize:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.quantize)), out.quantize)
            if out.thresholds:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.thresholds)),
                    out.thresholds)
    return out
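The quantize handling above maps a (spec, labels) tuple onto mosdepth's --quantize argument plus MOSDEPTH_Q<i> environment exports that name each bin. A small standalone sketch of that mapping; the example values are illustrative.

def quantize_args(quantize):
    # e.g. quantize = ("0:1:4:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"])
    if not quantize:
        return "", ""
    quant_arg = "--quantize %s" % quantize[0]
    quant_export = " && ".join("export MOSDEPTH_Q%s=%s" % (i, label)
                               for i, label in enumerate(quantize[1]))
    return quant_arg, (quant_export + " && ") if quant_export else ""

assert quantize_args(("0:1:4:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE"]))[0] == "--quantize 0:1:4:"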
Exemple #56
0
def _make_outdir(config):
    """ make the output directory "fastqc" where the data files live """
    outdir = os.path.join(config["dir"]["results"], "fastqc")
    safe_makedir(outdir)
    return outdir
Exemple #57
0
def start(parallel,
          items,
          config,
          dirs=None,
          name=None,
          multiplier=1,
          max_multicore=None):
    """Start a parallel cluster or machines to be used for running remote
    functions.

    Returns a function used to process items, in parallel, with a given function.

    Allows sharing of a single cluster across multiple functions with
    identical resource requirements. Uses local execution for non-distributed
    clusters or completed jobs.

    A checkpoint directory keeps track of finished tasks, avoiding spinning up
    clusters for sections that have been previously processed.

    multiplier -- Number of expected jobs per initial input item. Used to avoid
    underscheduling cores when an item is split during processing.
    max_multicore -- The maximum number of cores to use for each process. Can be
    used to limit multicore usage when jobs run faster on single cores.
    """
    if name:
        checkpoint_dir = utils.safe_makedir(
            os.path.join(dirs["work"], "checkpoints_parallel"))
        checkpoint_file = os.path.join(checkpoint_dir, "%s.done" % name)
    else:
        checkpoint_file = None
    sysinfo = system.get_info(dirs, parallel)
    items = [x for x in items if x is not None] if items else []
    max_multicore = int(max_multicore or sysinfo.get("cores", 1))
    parallel = resources.calculate(parallel,
                                   items,
                                   sysinfo,
                                   config,
                                   multiplier=multiplier,
                                   max_multicore=max_multicore)
    try:
        view = None
        if checkpoint_file and os.path.exists(checkpoint_file):
            logger.info("run local -- checkpoint passed: %s" % name)
            parallel["cores_per_job"] = 1
            parallel["num_jobs"] = 1
            parallel["checkpointed"] = True
            yield multi.runner(parallel, config)
        elif parallel["type"] == "ipython":
            with ipython.create(parallel, dirs, config) as view:
                yield ipython.runner(view, parallel, dirs, config)
        elif parallel["type"] == "clusterk":
            with clusterk.create(parallel) as queue:
                yield clusterk.runner(queue, parallel)
        else:
            yield multi.runner(parallel, config)
    except:
        if view is not None:
            ipython.stop(view)
        raise
    else:
        for x in ["cores_per_job", "num_jobs", "mem"]:
            parallel.pop(x, None)
        if checkpoint_file:
            with open(checkpoint_file, "w") as out_handle:
                out_handle.write("done\n")
Exemple #58
0
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(
            os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"),
                                        data, []):
            logger.info("Full qualimap analysis for %s may be slow." %
                        bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                             num_cores)

        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)

            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem,
                tx_results_dir)
            cmd = (
                "unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                "--skip-duplicated --skip-dup-mode 0 "
                "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                    or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [
                None, False, "None"
            ] else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(
                    bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()),
                   "Qualimap: %s" % dd.get_sample_name(data),
                   env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir,
                                           "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, in order
    # to keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file)
    }
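Qualimap's -gff option accepts 6-column BED, which is what the _bed_to_bed6 helper above is assumed to provide. A hedged, standalone sketch of padding a 3- or 4-column BED to six columns; hypothetical, not the bcbio implementation.

def pad_to_bed6(bed_in, bed_out):
    with open(bed_in) as in_handle, open(bed_out, "w") as out_handle:
        for i, line in enumerate(in_handle):
            parts = line.rstrip("\n").split("\t")
            chrom, start, end = parts[:3]
            # Pad with a name, a placeholder score and an unknown strand.
            name = parts[3] if len(parts) > 3 else "region_%d" % (i + 1)
            out_handle.write("\t".join([chrom, start, end, name, "0", "."]) + "\n")
    return bed_out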
Exemple #59
0
def _merge_target_information(samples):
    metrics_dir = utils.safe_makedir("metrics")
    out_file = os.path.abspath(os.path.join(metrics_dir, "target_info.yaml"))
    if utils.file_exists(out_file):
        return samples

    genomes = set(dd.get_genome_build(data) for data in samples)
    coverage_beds = set(dd.get_coverage(data) for data in samples)
    original_variant_regions = set(dd.get_variant_regions_orig(data) for data in samples)

    data = samples[0]
    info = {}

    # Reporting in MultiQC only if the genome is the same across all samples
    if len(genomes) == 1:
        info["genome_info"] = {
            "name": dd.get_genome_build(data),
            "size": sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]),
        }

    # Reporting in MultiQC only if the target is the same across all samples
    vcr_orig = None
    if len(original_variant_regions) == 1 and list(original_variant_regions)[0] is not None:
        vcr_orig = list(original_variant_regions)[0]
        vcr_clean = bedutils.clean_file(vcr_orig, data)
        info["variants_regions_info"] = {
            "bed": vcr_orig,
            "size": sum(len(x) for x in pybedtools.BedTool(dd.get_variant_regions_merged(data))),
            "regions": pybedtools.BedTool(vcr_clean).count(),
        }
        gene_num = annotate.count_genes(vcr_clean, data)
        if gene_num is not None:
            info["variants_regions_info"]["genes"] = gene_num
    else:
        info["variants_regions_info"] = {
            "bed": "callable regions",
        }
    # Reporting in MultiQC only if the target is the same across samples
    if len(coverage_beds) == 1:
        cov_bed = list(coverage_beds)[0]
        if cov_bed not in [None, "None"]:
            if vcr_orig and vcr_orig == cov_bed:
                info["coverage_bed_info"] = info["variants_regions_info"]
            else:
                clean_bed = bedutils.clean_file(cov_bed, data, prefix="cov-", simple=True)
                info["coverage_bed_info"] = {
                    "bed": cov_bed,
                    "size": pybedtools.BedTool(cov_bed).total_coverage(),
                    "regions": pybedtools.BedTool(clean_bed).count(),
                }
                gene_num = annotate.count_genes(clean_bed, data)
                if gene_num is not None:
                    info["coverage_bed_info"]["genes"] = gene_num
        else:
            info["coverage_bed_info"] = info["variants_regions_info"]

    coverage_intervals = set(data["config"]["algorithm"]["coverage_interval"] for data in samples)
    if len(coverage_intervals) == 1:
        info["coverage_interval"] = list(coverage_intervals)[0]

    if info:
        with open(out_file, "w") as out_handle:
            yaml.safe_dump(info, out_handle)

    return samples
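For reference, a hedged sketch of the kind of target_info.yaml content this produces; the values below are illustrative only, not real output.

import yaml

example_info = {
    "genome_info": {"name": "GRCh37", "size": 3101804739},
    "variants_regions_info": {"bed": "panel.bed", "size": 1200000,
                              "regions": 560, "genes": 320},
    "coverage_interval": "regional",
}
print(yaml.safe_dump(example_info, default_flow_style=False))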
Exemple #60
0
                            write_seq = True
                        else:
                            write_seq = False
                    elif write_seq:
                        out_handle.write(line)
    if not os.path.exists(out_file + ".bwt"):
        subprocess.check_call(["bwa", "index", out_file])
    if not os.path.exists(out_file + ".ndx"):
        subprocess.check_call(["novoindex", out_file + ".ndx", out_file])
    hlas = []
    with open(out_file) as in_handle:
        for line in in_handle:
            if line.startswith(">"):
                hlas.append(line[1:].strip())
    return out_file, hlas

def samples_from_config(sample_yaml):
    with open(sample_yaml) as in_handle:
        config = yaml.safe_load(in_handle)
    by_batch = collections.defaultdict(dict)
    for s in config["details"]:
        by_batch[s["metadata"]["batch"]][s["metadata"]["phenotype"]] = s["description"]
    for bid in sorted(by_batch.keys()):
        yield by_batch[bid]["tumor"], by_batch[bid]["normal"]

if __name__ == "__main__":
    sample_config, hla_fa, cromwell_dir = sys.argv[1:]
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "work_lohhla"))
    for t, n in sorted(samples_from_config(sample_config)):
        run_sample(t, n, work_dir, cromwell_dir, hla_fa)
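
A hedged sketch of the sample YAML shape samples_from_config expects: each entry carries metadata.batch and metadata.phenotype so tumor/normal pairs can be matched per batch. The values are illustrative only.

import yaml

example = yaml.safe_load("""
details:
  - description: patient1-T
    metadata: {batch: patient1, phenotype: tumor}
  - description: patient1-N
    metadata: {batch: patient1, phenotype: normal}
""")
pairs = {}
for s in example["details"]:
    pairs.setdefault(s["metadata"]["batch"], {})[s["metadata"]["phenotype"]] = s["description"]
assert pairs["patient1"] == {"tumor": "patient1-T", "normal": "patient1-N"}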