Example #1
def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.
    """
    if isinstance(xs, dict):
        for k, v in xs.items():
            xs[k] = unpack_tarballs(v, data, use_subdir)
    elif isinstance(xs, (list, tuple)):
        xs = [unpack_tarballs(x, data, use_subdir) for x in xs]
    elif isinstance(xs, six.string_types):
        if os.path.isfile(xs.encode("utf-8", "ignore")) and xs.endswith("-wf.tar.gz"):
            if use_subdir:
                tarball_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "wf-inputs"))
            else:
                tarball_dir = dd.get_work_dir(data)
            out_dir = os.path.join(tarball_dir,
                                   os.path.basename(xs).replace("-wf.tar.gz", "").replace("--", os.path.sep))
            if not os.path.exists(out_dir):
                with utils.chdir(tarball_dir):
                    with tarfile.open(xs, "r:gz") as tar:
                        tar.extractall()
            assert os.path.exists(out_dir), out_dir
            # Default to representing output directory
            xs = out_dir
            # Look for aligner indices
            for fname in os.listdir(out_dir):
                if fname.endswith(DIR_TARGETS):
                    xs = os.path.join(out_dir, fname)
                    break
                elif fname.endswith(BASENAME_TARGETS):
                    base = os.path.join(out_dir, utils.splitext_plus(os.path.basename(fname))[0])
                    xs = glob.glob("%s*" % base)
                    break
    return xs
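
The "--" to os.path.sep replacement reverses a packing convention where a nested directory path is flattened into the tarball's file name. A minimal standalone sketch of that round trip, as I read it from the code above, without bcbio's utils/dd helpers and using a hypothetical "hg38--bwa" directory:

import os
import tarfile

def pack_dir_as_wf_tarball(src_dir, out_dir):
    # Flatten the relative path ("hg38/bwa" -> "hg38--bwa") into the tarball name,
    # matching the "-wf.tar.gz" naming convention unpack_tarballs reverses above.
    flat_name = src_dir.strip(os.path.sep).replace(os.path.sep, "--")
    tarball = os.path.join(out_dir, "%s-wf.tar.gz" % flat_name)
    with tarfile.open(tarball, "w:gz") as tar:
        tar.add(src_dir, arcname=src_dir)
    return tarball

def unpack_wf_tarball(tarball, work_dir):
    # Reverse the convention: strip the suffix and restore the path separators.
    base = os.path.basename(tarball).replace("-wf.tar.gz", "")
    out_dir = os.path.join(work_dir, base.replace("--", os.path.sep))
    if not os.path.exists(out_dir):
        with tarfile.open(tarball, "r:gz") as tar:
            tar.extractall(work_dir)
    return out_dir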
Example #2
def sample_annotation(data):
    """
    Annotate miRNAs using miRBase database with seqbuster tool
    """
    names = data["rgnames"]['sample']
    tools = dd.get_expression_caller(data)
    work_dir = os.path.join(dd.get_work_dir(data), "mirbase")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = op.join(out_dir, names)
    if dd.get_mirbase_hairpin(data):
        mirbase = op.abspath(op.dirname(dd.get_mirbase_hairpin(data)))
        if utils.file_exists(data["collapse"]):
            data['transcriptome_bam'] = _align(data["collapse"], dd.get_mirbase_hairpin(data), out_file, data)
            data['seqbuster'] = _miraligner(data["collapse"], out_file, dd.get_species(data), mirbase, data['config'])
        else:
            logger.debug("Trimmed collapsed file is empty for %s." % names)
    else:
        logger.debug("No annotation file from miRBase.")

    sps = dd.get_species(data) if dd.get_species(data) else "None"
    logger.debug("Looking for mirdeep2 database for %s" % names)
    if file_exists(op.join(dd.get_work_dir(data), "mirdeep2", "novel", "hairpin.fa")):
        data['seqbuster_novel'] = _miraligner(data["collapse"], "%s_novel" % out_file, sps,  op.join(dd.get_work_dir(data), "mirdeep2", "novel"), data['config'])

    if "trna" in tools:
        data['trna'] = _mint_trna_annotation(data)

    data = spikein.counts_spikein(data)
    return [[data]]
Example #3
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                    "variation", "rnaseq", "gatk-haplotype",
                                                                    "regions")),
                                    "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data), str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
Example #4
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    should_gzip = True

    # Bowtie does not accept gzipped fastq
    if 'bowtie' in data['reference'].keys():
        should_gzip = False
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not(dd.get_trim_reads(data)) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
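
get_fastq_files gzips plain-text fastqs when the downstream aligner accepts compressed input. The _gzip_fastq helper is not shown here; a simplified stand-in, assuming it just writes a gzipped copy into the output directory and leaves already-compressed files alone, might look like this:

import gzip
import os
import shutil

def gzip_fastq(in_file, out_dir):
    # Hypothetical stand-in for _gzip_fastq: return gzipped files unchanged,
    # otherwise write a compressed copy next to the other prepared fastqs.
    if in_file.endswith(".gz"):
        return in_file
    out_file = os.path.join(out_dir, os.path.basename(in_file) + ".gz")
    if not os.path.exists(out_file):
        with open(in_file, "rb") as in_handle, gzip.open(out_file, "wb") as out_handle:
            shutil.copyfileobj(in_handle, out_handle)
    return out_file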
Example #5
def tagcount(data):
    bam = dd.get_transcriptome_bam(data)
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    out_prefix = os.path.join(sample_dir, dd.get_sample_name(data))
    out_file = out_prefix + ".mtx"
    if file_exists(out_file):
        data = dd.set_count_file(data, out_file)
        return [[data]]
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    cutoff = dd.get_minimum_barcode_depth(data)
    cb_histogram = os.path.join(sample_dir, "cb-histogram.txt")
    positional = "--positional" if dd.get_positional_umi(data, False) else ""
    if use_installed_transcriptome(data):
        gtf_file = dd.get_gtf_file(data)
    else:
        gtf_file  = dd.get_transcriptome_gtf(data, None)

    if gtf_file:
        gene_map_file = os.path.join(dd.get_work_dir(data), "annotation",
                                     os.path.splitext(gtf_file)[0] + "-tx2gene.tsv")
        gene_map_file = gtf.tx2genefile(gtf_file, gene_map_file, tsv=True)
        gene_map_flag = " --genemap {0} ".format(gene_map_file)
    else:
        gene_map_flag = ""

    message = "Counting alignments of transcripts in %s." % bam
    cmd = ("{umis} fasttagcount --cb_cutoff {cutoff} "
           "{gene_map_flag} "
           "{positional} "
           "--cb_histogram {cb_histogram}")
    out_files = [out_file, out_file + ".rownames", out_file + ".colnames"]
    umi_matrix_file = out_prefix + "-dupes.mtx"
    out_files += [umi_matrix_file, umi_matrix_file + ".rownames",
                  umi_matrix_file + ".colnames"]
    if has_umi_matrix(data):
        umi_matrix_flag = " --umi_matrix {tx_umi_matrix_full} "
    else:
        umi_matrix_flag = ""
    cmd += umi_matrix_flag
    cmd += " {bam} {tx_out_file_full}"
    with file_transaction(out_files) as tx_out_files:
        tx_out_file = tx_out_files[0]
        tx_out_file_full = tx_out_file + ".full"
        tx_umi_matrix = tx_out_files[3]
        tx_umi_matrix_full = tx_out_files[3] + ".full"
        do.run(cmd.format(**locals()), message)
        cmd = ("{umis} sparse {tx_out_file_full} {tx_out_file}")
        message = "Converting %s to sparse format." % tx_out_file_full
        do.run(cmd.format(**locals()), message)
        if has_umi_matrix(data):
            cmd = ("{umis} sparse {tx_umi_matrix_full} {tx_umi_matrix}")
            message = "Converting %s to sparse format." % tx_umi_matrix_full
            do.run(cmd.format(**locals()), message)
    data = dd.set_count_file(data, out_file)
    return [[data]]
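
tagcount leaves behind the .mtx count file plus ".rownames" and ".colnames" sidecars. Assuming those sidecars hold one identifier per line, with genes as rows and cell barcodes as columns (an assumption, not something the snippet states), the result can be loaded downstream roughly like this:

import pandas as pd
from scipy.io import mmread

def load_umis_matrix(mtx_file):
    # Read the MatrixMarket counts and attach the labels from the sidecar
    # files written next to the matrix (assumed: one name per line).
    counts = mmread(mtx_file).tocsc()
    with open(mtx_file + ".rownames") as in_handle:
        genes = [line.strip() for line in in_handle]
    with open(mtx_file + ".colnames") as in_handle:
        barcodes = [line.strip() for line in in_handle]
    return pd.DataFrame.sparse.from_spmatrix(counts, index=genes, columns=barcodes)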
Example #6
def _symlink_to_workdir(data, key):
    """For CWL support, symlink files into a working directory if in read-only imports.
    """
    orig_file = tz.get_in(key, data)
    if orig_file and not orig_file.startswith(dd.get_work_dir(data)):
        variantcaller = genotype.get_variantcaller(data)
        out_file = os.path.join(dd.get_work_dir(data), variantcaller, os.path.basename(orig_file))
        utils.safe_makedir(os.path.dirname(out_file))
        utils.symlink_plus(orig_file, out_file)
        data = tz.update_in(data, key, lambda x: out_file)
    return data
Example #7
def run_cluster(data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    out_dir = os.path.join(dd.get_work_dir(data[0]), "seqcluster", "cluster")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(dd.get_work_dir(data[0]), "seqcluster", "prepare")
    bam_file = op.join(dd.get_work_dir(data[0]), "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(data[0]), dd.get_srna_gtf_file(data[0]))
    for sample in data:
        sample["seqcluster"] = out_dir
    return [data]
Example #8
def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #9
def __init__(self, data):
    self._db_location = self._get_ericscript_db(data)
    self._sample_name = dd.get_lane(data)
    self._work_dir = dd.get_work_dir(data)
    self._env = None
    self._output_dir = None
    self._sample_out_dir = None
Example #10
def run(data):
    """Quantitaive isoforms expression by eXpress"""
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    out_file = os.path.join(out_dir, name + ".xprs")
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                do.run(cmd.format(**locals()), "Run express on %s." % in_bam, {})
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_file.replace(".xprs", "_eff.counts"), 7)
    tpm_file = _get_column(out_file, out_file.replace("xprs", "tpm"), 14)
    fpkm_file = _get_column(out_file, out_file.replace("xprs", "fpkm"), 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
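
The _get_column calls pull single columns (effective counts, TPM, FPKM) out of the eXpress results table. The helper itself is not shown; a plausible stand-in, assuming a tab-delimited results.xprs with a header row and the target id in column 1, could be:

import csv

def extract_column(in_file, out_file, column):
    # Write "<target_id>\t<value>" pairs for one column of a tab-delimited
    # eXpress results table (0-based column indices; an assumption).
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        writer = csv.writer(out_handle, delimiter="\t")
        next(reader, None)  # skip the header line
        for row in reader:
            writer.writerow([row[1], row[column]])
    return out_file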
Example #11
def _make_isomir_counts(data, srna_type="seqbuster", out_dir=None, stem=""):
    """
    Parse miraligner files to create count matrix.
    """
    work_dir = dd.get_work_dir(data[0][0])
    if not out_dir:
        out_dir = op.join(work_dir, "mirbase")
    out_novel_isomir = append_stem(op.join(out_dir, "counts.tsv"), stem)
    out_novel_mirna = append_stem(op.join(out_dir, "counts_mirna.tsv"), stem)
    logger.debug("Create %s count data at %s." % (srna_type, out_dir))
    if file_exists(out_novel_mirna):
        return [out_novel_mirna, out_novel_isomir]
    out_dts = []
    for sample in data:
        if sample[0].get(srna_type):
            miraligner_fn = sample[0][srna_type]
            reads = _read_miraligner(miraligner_fn)
            if reads:
                out_file, dt, dt_pre = _tab_output(reads, miraligner_fn + ".back", dd.get_sample_name(sample[0]))
                out_dts.append(dt)
            else:
                logger.debug("WARNING::%s has NOT miRNA annotated for %s. Check if fasta files is small or species value." % (dd.get_sample_name(sample[0]), srna_type))
    if out_dts:
        out_files = _create_counts(out_dts, out_dir)
        out_files = [move_safe(out_files[0], out_novel_isomir), move_safe(out_files[1], out_novel_mirna)]
        return out_files
    else:
        logger.debug("WARNING::any samples have miRNA annotated for %s. Check if fasta files is small or species value." % srna_type)
Example #12
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
    sample["report"] = _report(sample, dd.get_ref_file(sample))
    sample["seqcluster"] = out_dir

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #13
def _maybe_add_salmon_files(algorithm, sample, out):
    salmon_dir = os.path.join(dd.get_work_dir(sample), "salmon", dd.get_sample_name(sample), "quant")
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir,
                    "type": "directory",
                    "ext": "salmon"})
    return out
Example #14
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. "
                   % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
Example #15
def run_prepare(*data):
    """
    Run seqcluster prepare to merge all samples in one file
    """
    out_dir = os.path.join(dd.get_work_dir(data[0][0]), "seqcluster", "prepare")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = os.path.join(out_dir, "prepare")
    tools = dd.get_expression_caller(data[0][0])
    if len(tools) == 0:
        logger.info("You didn't specify any other expression caller tool."
                       "You can add to the YAML file:"
                       "expression_caller:[trna, seqcluster, mirdeep2]")
    fn = []
    for sample in data:
        name = sample[0]["rgnames"]['sample']
        fn.append("%s\t%s" % (sample[0]['collapse'], name))
    args = namedtuple('args', 'debug print_debug minc minl maxl out')
    args = args(False, False, 2, 17, 40, out_dir)
    ma_out = op.join(out_dir, "seqs.ma")
    seq_out = op.join(out_dir, "seqs.fastq")
    min_shared = max(int(len(fn) / 10.0), 1)
    if not file_exists(ma_out):
        seq_l, sample_l = prepare._read_fastq_files(fn, args)
        with file_transaction(ma_out) as ma_tx:
            with open(ma_tx, 'w') as ma_handle:
                with open(seq_out, 'w') as seq_handle:
                    prepare._create_matrix_uniq_seq(sample_l, seq_l, ma_handle, seq_handle, min_shared)

    for sample in data:
        sample[0]["seqcluster_prepare_ma"] = ma_out
        sample[0]["seqcluster_prepare_fastq"] = seq_out
    return data
Example #16
def run_cluster(*data):
    """
    Run seqcluster cluster to detect smallRNA clusters
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["work_bam"]
    if "seqcluster" in tools:
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"], out_dir, dd.get_ref_file(sample), dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))

    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])

    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
Example #17
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #18
def _merge_fastqc(samples):
    """
    Merge all FastQC sample metrics into one table per module.
    """
    fastqc_list = collections.defaultdict(list)
    seen = set()
    for data in samples:
        name = dd.get_sample_name(data)
        if name in seen:
            continue
        seen.add(name)
        fns = glob.glob(os.path.join(dd.get_work_dir(data), "qc", dd.get_sample_name(data), "fastqc") + "/*")
        for fn in fns:
            if fn.endswith("tsv"):
                metric = os.path.basename(fn)
                fastqc_list[metric].append([name, fn])

    for metric in fastqc_list:
        dt_by_sample = []
        for fn in fastqc_list[metric]:
            dt = pd.read_csv(fn[1], sep="\t")
            dt['sample'] = fn[0]
            dt_by_sample.append(dt)
        dt = utils.rbind(dt_by_sample)
        dt.to_csv(metric, sep="\t", index=False, mode='w')
    return samples
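
_merge_fastqc relies on utils.rbind to stack the per-sample tables. Assuming that behaves like pandas.concat, the merge step on its own could be written as:

import pandas as pd

def merge_metric_tables(metric_files, out_file):
    # metric_files mirrors fastqc_list[metric] above: a list of (sample, tsv) pairs.
    tables = []
    for sample, fn in metric_files:
        dt = pd.read_csv(fn, sep="\t")
        dt["sample"] = sample
        tables.append(dt)
    merged = pd.concat(tables, ignore_index=True, sort=False)
    merged.to_csv(out_file, sep="\t", index=False)
    return out_file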
Example #19
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
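
bed.decomment, used here and in the coverage example further down, strips header lines that sambamba cannot parse from the BED file. A minimal sketch of that idea, assuming comment, track and browser lines are the ones to drop:

def decomment_bed(in_file, out_file):
    # Keep only plain interval lines; drop headers that confuse downstream tools.
    with open(in_file) as in_handle, open(out_file, "w") as out_handle:
        for line in in_handle:
            if line.startswith(("#", "track", "browser")) or not line.strip():
                continue
            out_handle.write(line)
    return out_file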
Example #20
def _mint_trna_annotation(data):
    """
    use MINTmap to quantify tRNAs
    """
    trna_lookup = op.join(dd.get_srna_mint_lookup(data))
    trna_space = op.join(dd.get_srna_mint_space(data))
    trna_other = op.join(dd.get_srna_mint_other(data))
    name = dd.get_sample_name(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "trna_mint", name))
    in_file = op.basename(data["clean_fastq"])
    mintmap = os.path.realpath(os.path.join(os.path.dirname(sys.executable), "MINTmap.pl"))
    perl_export = utils.get_perl_exports()
    if not file_exists(trna_lookup) or not file_exists(mintmap):
        logger.info("There is no tRNA annotation to run MINTmap.")
        return work_dir
    jar_folder = os.path.join(os.path.dirname(mintmap), "MINTplates")
    out_file = op.join(work_dir, name + "-MINTmap_v1-exclusive-tRFs.expression.txt")
    if not file_exists(out_file):
        with tx_tmpdir(data) as txdir:
            with utils.chdir(txdir):
                utils.symlink_plus(data["clean_fastq"], op.join(txdir, in_file))
                cmd = ("{perl_export} && {mintmap} -f {in_file} -p {name} "
                       "-l {trna_lookup} -s {trna_space} -j {jar_folder} "
                       "-o {trna_other}").format(**locals())
                do.run(cmd, "tRNA for %s" % name)
                for filename in glob.glob("*MINTmap*"):
                    shutil.move(filename, work_dir)
    return work_dir
Example #21
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd.format(**locals()), "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example #22
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = utils.to_single_data(data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(bam_file_ready, ref_file, data)
        sample_callable = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        offtarget_stats = callable.calculate_offtarget(bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed, "callable": callable_bed,
                           "sample_callable": sample_callable,
                           "offtarget_stats": offtarget_stats}
        data = coverage.assign_interval(data)
        highdepth_bed = highdepth.identify(data)
        data["regions"]["highdepth"] = highdepth_bed
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = bedutils.clean_inputs(data)
        data = _recal_no_markduplicates(data)
    return [[data]]
Example #23
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for the regions given in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed,  sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
Example #24
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses cutadapt, but with different parameters than the other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if trim_reads:
        adapter = dd.get_adapters(data)[0]
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        cutadapt = os.path.join(os.path.dirname(sys.executable), "cutadapt")
        cmd = _cmd_cutadapt()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter")
    else:
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #25
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.
    Prepares list of callable genome regions allowing subsequent parallelization.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        ref_file = dd.get_ref_file(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                  dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        callable_region_bed, nblock_bed, callable_bed = \
            callable.block_regions(covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {"nblock": nblock_bed,
                           "callable": callable_bed,
                           "sample_callable": covinfo.callable,
                           "mapped_stats": readstats.get_cache_file(data)}
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        if (os.path.exists(callable_region_bed) and
                not data["config"]["algorithm"].get("variant_regions")):
            data["config"]["algorithm"]["variant_regions"] = callable_region_bed
            data = clean_inputs(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    return [[data]]
Example #26
def report_summary(samples, run_parallel):
    """
    Run coverage report with bcbiocov package
    """
    work_dir = dd.get_work_dir(samples[0][0])

    parent_dir = utils.safe_makedir(os.path.join(work_dir, "report"))
    qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
    with utils.chdir(parent_dir):

        logger.info("copy qsignature")
        if qsignature_fn:
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        out_dir = utils.safe_makedir("coverage")
        out_dir = utils.safe_makedir("variants")
        samples = run_parallel("coverage_report", samples)

        try:
            import bcbreport.prepare as bcbreport
            bcbreport.report(parent_dir)
        except:
            logger.info("skipping report. No bcbreport installed.")
            pass

        logger.info("summarize metrics")
        samples = _merge_metrics(samples)

    return samples
Example #27
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd.format(**locals()), "Running mirdeep2.")
            except:
                logger.warning("mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example #28
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(sailfish_dir, "quant")
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    build_string = get_build_string(data)
    sailfish_idx = os.path.join(dd.get_work_dir(data), "sailfish", "index",
                                build_string)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, quant_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
        _sleuthify_sailfish(tx_out_dir)
    return out_file
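
Both this example and filter_barcodes above feed gzipped fastqs to tools through bash process substitution instead of decompressing to disk first. The pattern in isolation (it requires running the command under bash, not plain sh; the file name is made up):

import subprocess

def is_gzipped(fname):
    # Extension check, mirroring how the examples branch on gzipped input.
    return fname.endswith((".gz", ".gzip"))

def fastq_input_arg(fq):
    # Return either the path itself or a bash process substitution that
    # streams the decompressed reads to the consuming tool.
    return "<(gzip -cd {fq})".format(fq=fq) if is_gzipped(fq) else fq

# Example: count lines in a possibly gzipped fastq without an intermediate file.
cmd = "wc -l < {0}".format(fastq_input_arg("sample_1.fq.gz"))
subprocess.check_call(["/bin/bash", "-c", cmd])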
Example #29
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
Example #30
def _merge_hla_fastq_inputs(data):
    """Merge HLA inputs from a split initial alignment.
    """
    hla_key = ["hla", "fastq"]
    hla_sample_files = [x for x in tz.get_in(hla_key, data, []) if x and x != "None"]
    if hla_sample_files:
        out_files = collections.defaultdict(list)
        for hla_files in hla_sample_files:
            for hla_file in hla_files:
                rehla = re.search(".hla.(?P<hlatype>[\w-]+).fq", hla_file)
                if rehla:
                    hlatype = rehla.group("hlatype")
                    out_files[hlatype].append(hla_file)
        if len(out_files) > 0:
            hla_outdir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align",
                                                         dd.get_sample_name(data), "hla"))
            merged_hlas = []
            for hlatype, files in out_files.items():
                out_file = os.path.join(hla_outdir, "%s-%s.fq" % (dd.get_sample_name(data), hlatype))
                optitype.combine_hla_fqs([(hlatype, f) for f in files], out_file, data)
                merged_hlas.append(out_file)
        data = tz.update_in(data, hla_key, lambda x: merged_hlas)
    else:
        data = tz.update_in(data, hla_key, lambda x: None)
    return data
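
The grouping step in _merge_hla_fastq_inputs keys the split fastqs by the HLA locus embedded in the file name. The same pattern in isolation, with made-up file names:

import collections
import re

def group_hla_fastqs(fnames):
    # Group split HLA fastq files by the locus parsed out of names such as
    # "sample-1.hla.HLA-A.fq" (hypothetical examples of the naming scheme above).
    out = collections.defaultdict(list)
    for fname in fnames:
        match = re.search(r"\.hla\.(?P<hlatype>[\w-]+)\.fq", fname)
        if match:
            out[match.group("hlatype")].append(fname)
    return dict(out)

print(group_hla_fastqs(["s1-1.hla.HLA-A.fq", "s1-2.hla.HLA-A.fq", "s1-1.hla.HLA-B.fq"]))
# {'HLA-A': ['s1-1.hla.HLA-A.fq', 's1-2.hla.HLA-A.fq'], 'HLA-B': ['s1-1.hla.HLA-B.fq']}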
Example #31
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    locale_export = utils.locale_export()
    umis = _umis_cmd(data)
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} {demuxed_option} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #32
def get_kallisto_fusions(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "quant")
    return os.path.join(kallisto_dir, "fusion.txt")
Example #33
def trim_srna_sample(data):
    """
    Remove the 3' adapter for smallRNA-seq.
    Uses atropos, with different parameters than the other pipelines.
    """
    data = umi_transform(data)
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    log_out = os.path.join(out_dir, "%s.log" % names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["files"][0] = out_file
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        data["log_trimming"] = log_out
        return [[data]]

    adapter = dd.get_adapters(data)
    is_4n = any([a == "4N" for a in adapter])
    adapter = [a for a in adapter if re.compile("^([NATGC]+)$").match(a)]
    if adapter and not trim_reads:
        trim_reads = True
        logger.info(
            "Adapter is set up in the config file, but trim_reads is not true. "
            "If you want to skip trimming, remove the adapter option from the config.")
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    if trim_reads:
        adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if not trim_reads or len(
        adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        if any([a for a in adapters if re.compile("^N+$").match(a)]):
            adapter_cmd = "-N %s" % adapter_cmd
        out_noadapter_file = replace_directory(
            append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"),
                                           out_dir)
        # atropos = _get_atropos()
        atropos = config_utils.get_program("atropos", data, default="atropos")
        options = " ".join(
            data.get('resources', {}).get('atropos', {}).get("options", ""))
        if options.strip() == "-u 4 -u -4":
            options = ""
            is_4n = "4N"
        cores = ("--threads %s" %
                 dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(
                data.get('resources', {}).get('cutadapt',
                                              {}).get("options", "")):
            raise ValueError(
                "Atropos is now used, but cutadapt options found in YAML file."
                "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(
                        out_short_file, names)
                    open(log_out, 'w').write(content)
                if is_4n:
                    options = "-u 4 -u -4"
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(
                        cmd.format(**locals()),
                        "atropos with this parameters %s for %s" %
                        (options, names))
        data["log_trimming"] = log_out
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter founds in %s, this is an issue related"
                        " to no small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["files"][0] = out_file
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #34
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.
    """
    out = [{"path": sample["provenance"]["programs"]}]
    if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""):
        out.append({"path": sample["provenance"]["data"]})
    for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]:
        if os.path.exists(os.path.join(log.get_log_dir(sample["config"]), fname)):
            out.append({"path": os.path.join(log.get_log_dir(sample["config"]), fname),
                        "type": "external_command_log",
                        "ext": ""})

    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    if "summary" in sample and sample["summary"].get("metadata"):
        out.append({"path": sample["summary"]["metadata"]})
    mixup_check = tz.get_in(["summary", "mixup_check"], sample)
    if mixup_check:
        out.append({"path": sample["summary"]["mixup_check"],
                    "type": "directory", "ext": "mixup_check"})

    report = os.path.join(dd.get_work_dir(sample), "report")
    if utils.file_exists(report):
        out.append({"path": report,
                    "type": "directory", "ext": "report"})

    multiqc = tz.get_in(["summary", "multiqc"], sample)
    if multiqc:
        out.extend(_flatten_file_with_secondary(multiqc, "multiqc"))

    ataqv = tz.get_in(["ataqv_report"], sample)
    if ataqv:
        out.extend(_flatten_file_with_secondary(ataqv, "ataqv"))

    if sample.get("seqcluster", {}):
        out.append({"path": sample["seqcluster"].get("out_dir"),
                    "type": "directory", "ext": "seqcluster"})

    if sample.get("mirge", {}):
        for fn in sample["mirge"]:
            out.append({"path": fn,
                        "dir": "mirge"})

    if sample.get("report", None):
        out.append({"path": os.path.dirname(sample["report"]),
                    "type": "directory", "ext": "seqclusterViz"})

    for x in sample.get("variants", []):
        if "pop_db" in x:
            out.append({"path": x["pop_db"],
                        "type": "sqlite",
                        "variantcaller": x["variantcaller"]})
    for x in sample.get("variants", []):
        if "population" in x:
            pop_db = tz.get_in(["population", "db"], x)
            if pop_db:
                out.append({"path": pop_db,
                            "type": "sqlite",
                            "variantcaller": x["variantcaller"]})
            suffix = "-annotated-decomposed" if tz.get_in(("population", "decomposed"), x) else "-annotated"
            vcfs = _get_project_vcf(x, suffix)
            out.extend([_add_batch(f, sample) for f in vcfs])
    for x in sample.get("variants", []):
        if x.get("validate") and x["validate"].get("grading_summary"):
            out.append({"path": x["validate"]["grading_summary"]})
            break
    sv_project = set([])
    for svcall in sample.get("sv", []):
        if svcall.get("variantcaller") == "seq2c":
            if svcall.get("calls_all") and svcall["calls_all"] not in sv_project:
                out.append({"path": svcall["coverage_all"], "batch": "seq2c", "ext": "coverage", "type": "tsv"})
                out.append({"path": svcall["read_mapping"], "batch": "seq2c", "ext": "read_mapping", "type": "txt"})
                out.append({"path": svcall["calls_all"], "batch": "seq2c", "ext": "calls", "type": "tsv"})
                sv_project.add(svcall["calls_all"])
    if "coverage" in sample:
        cov_db = tz.get_in(["coverage", "summary"], sample)
        if cov_db:
            out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"})
        all_coverage = tz.get_in(["coverage", "all"], sample)
        if all_coverage:
            out.append({"path": all_coverage, "type": "bed", "ext": "coverage"})

    if dd.get_mirna_counts(sample):
        out.append({"path": dd.get_mirna_counts(sample)})
    if dd.get_isomir_counts(sample):
        out.append({"path": dd.get_isomir_counts(sample)})
    if dd.get_novel_mirna_counts(sample):
        out.append({"path": dd.get_novel_mirna_counts(sample)})
    if dd.get_novel_isomir_counts(sample):
        out.append({"path": dd.get_novel_isomir_counts(sample)})
    if dd.get_combined_counts(sample):
        count_file = dd.get_combined_counts(sample)
        if sample["analysis"].lower() == "scrna-seq":
            out.append({"path": count_file,
                        "type": "mtx"})
            out.append({"path": count_file + ".rownames",
                        "type": "rownames"})
            out.append({"path": count_file + ".colnames",
                        "type": "colnames"})
            out.append({"path": count_file + ".metadata",
                        "type": "metadata"})
            umi_file = os.path.splitext(count_file)[0] + "-dupes.mtx"
            if utils.file_exists(umi_file):
                out.append({"path": umi_file,
                            "type": "mtx"})
                out.append({"path": umi_file + ".rownames",
                            "type": "rownames"})
                out.append({"path": umi_file + ".colnames",
                            "type": "colnames"})
            if dd.get_combined_histogram(sample):
                out.append({"path": dd.get_combined_histogram(sample),
                            "type": "txt"})
            rda = os.path.join(os.path.dirname(count_file), "se.rda")
            if utils.file_exists(rda):
                out.append({"path": rda,
                            "type": "rda"})
        else:
            out.append({"path": dd.get_combined_counts(sample)})
    if dd.get_tximport(sample):
        out.append({"path": dd.get_tximport(sample)["gene_tpm"], "dir": "tpm"})
        out.append({"path": dd.get_tximport(sample)["gene_counts"], "dir": "counts"})
    if dd.get_annotated_combined_counts(sample):
        out.append({"path": dd.get_annotated_combined_counts(sample)})
    if dd.get_combined_fpkm(sample):
        out.append({"path": dd.get_combined_fpkm(sample)})
    if dd.get_combined_fpkm_isoform(sample):
        out.append({"path": dd.get_combined_fpkm_isoform(sample)})
    if dd.get_transcript_assembler(sample):
        out.append({"path": dd.get_merged_gtf(sample)})
    if dd.get_dexseq_counts(sample):
        out.append({"path": dd.get_dexseq_counts(sample)})
        out.append({"path": "%s.ann" % dd.get_dexseq_counts(sample)})
    if dd.get_express_counts(sample):
        out.append({"path": dd.get_express_counts(sample)})
    if dd.get_express_fpkm(sample):
        out.append({"path": dd.get_express_fpkm(sample)})
    if dd.get_express_tpm(sample):
        out.append({"path": dd.get_express_tpm(sample)})
    if dd.get_isoform_to_gene(sample):
        out.append({"path": dd.get_isoform_to_gene(sample)})
    if dd.get_square_vcf(sample):
        out.append({"path": dd.get_square_vcf(sample)})
    if dd.get_sailfish_transcript_tpm(sample):
        out.append({"path": dd.get_sailfish_transcript_tpm(sample)})
    if dd.get_sailfish_gene_tpm(sample):
        out.append({"path": dd.get_sailfish_gene_tpm(sample)})
    if dd.get_tx2gene(sample):
        out.append({"path": dd.get_tx2gene(sample)})
    if dd.get_spikein_counts(sample):
        out.append({"path": dd.get_spikein_counts(sample)})
    if tz.get_in(("peaks_files", "consensus", "main"), sample):
        out.append({"path": tz.get_in(("peaks_files", "consensus", "main"), sample), "dir": "consensus"})
    if tz.get_in(("peak_counts", "peaktable"), sample):
        out.append({"path": tz.get_in(("peak_counts", "peaktable"), sample), "dir": "consensus"})

    transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs",
                                     "transcriptome")
    if os.path.exists(transcriptome_dir):
        out.append({"path": transcriptome_dir, "type": "directory",
                    "ext": "transcriptome"})
    return _add_meta(out, config=upload_config)
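
Everything collected here is a list of dicts with at least a "path" key plus optional "type"/"ext"/"dir"/"batch" metadata, which _add_meta then decorates with the upload configuration. A hypothetical sanity check over that structure (not part of bcbio's upload code):

import os

def keep_existing(upload_items):
    # Drop entries whose "path" is missing on disk before handing the list to
    # an uploader; purely illustrative of the list-of-dicts structure above.
    kept = []
    for item in upload_items:
        path = item.get("path")
        if path and os.path.exists(path):
            kept.append(item)
    return kept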
Example #35
def _get_cache_file(data, target_name):
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    cache_file = prefix + "-" + target_name + "-stats.yaml"
    return cache_file
Example #36
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct or is empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
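
The branching in process_alignment reduces to a dispatch on the input type and configuration. Below is a rough, self-contained sketch of that decision order; the function and return labels are illustrative only and skip the remote-file, UMI and kraken handling of the real code.

def classify_alignment_input(fname, aligner):
    # Simplified decision order mirroring the branches above.
    if fname and aligner:
        return "align"                  # run the configured aligner, then post-alignment steps
    if fname and fname.endswith(".bam"):
        return "clean_sort_bam"         # bam_clean/bam_sort, then index and dedup
    if fname and fname.endswith(".cram"):
        return "use_cram_directly"      # passed through as the working BAM
    if fname is None and not aligner:
        return "skip_variant_calling"   # nothing to align, disable variant calling
    raise ValueError("Cannot process input: %s" % fname)

assert classify_alignment_input("sample_R1.fastq.gz", "bwa") == "align"
assert classify_alignment_input("sample.bam", None) == "clean_sort_bam"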
Beispiel #37
0
def combine_files(samples):
    """
    After quantitation, combine the counts/FPKM/TPM/etc. into a single table
    with all samples.
    """
    data = samples[0][0]
    # prefer the supplied transcriptome gtf file
    gtf_file = dd.get_transcriptome_gtf(data, None)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data, None)
    dexseq_gff = dd.get_dexseq_gff(data)

    # combine featureCount files
    count_files = filter_missing([dd.get_count_file(x[0]) for x in samples])
    combined = count.combine_count_files(count_files, ext=".counts")
    annotated = count.annotate_combined_count_file(combined, gtf_file)

    # add tx2gene file
    tx2gene_file = os.path.join(dd.get_work_dir(data), "annotation",
                                "tx2gene.csv")
    if gtf_file:
        tx2gene_file = sailfish.create_combined_tx2gene(data)

    # combine eXpress files
    express_counts_combined = combine_express(samples, combined)

    # combine Cufflinks files
    fpkm_files = filter_missing([dd.get_fpkm(x[0]) for x in samples])
    if fpkm_files and combined:
        fpkm_combined_file = os.path.splitext(combined)[0] + ".fpkm"
        fpkm_combined = count.combine_count_files(fpkm_files,
                                                  fpkm_combined_file)
    else:
        fpkm_combined = None
    isoform_files = filter_missing(
        [dd.get_fpkm_isoform(x[0]) for x in samples])
    if isoform_files and combined:
        fpkm_isoform_combined_file = os.path.splitext(
            combined)[0] + ".isoform.fpkm"
        fpkm_isoform_combined = count.combine_count_files(
            isoform_files, fpkm_isoform_combined_file, ".isoform.fpkm")
    else:
        fpkm_isoform_combined = None
    # combine DEXseq files
    to_combine_dexseq = list(
        filter_missing([dd.get_dexseq_counts(data[0]) for data in samples]))
    if to_combine_dexseq and combined:
        dexseq_combined_file = os.path.splitext(combined)[0] + ".dexseq"
        dexseq_combined = count.combine_count_files(to_combine_dexseq,
                                                    dexseq_combined_file,
                                                    ".dexseq")
        if dexseq_combined:
            dexseq.create_dexseq_annotation(dexseq_gff, dexseq_combined)
    else:
        dexseq_combined = None
    samples = spikein.combine_spikein(samples)
    tximport = load_tximport(data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        if combined:
            data = dd.set_combined_counts(data, combined)
        if annotated:
            data = dd.set_annotated_combined_counts(data, annotated)
        if fpkm_combined:
            data = dd.set_combined_fpkm(data, fpkm_combined)
        if fpkm_isoform_combined:
            data = dd.set_combined_fpkm_isoform(data, fpkm_isoform_combined)
        if express_counts_combined:
            data = dd.set_express_counts(data,
                                         express_counts_combined['counts'])
            data = dd.set_express_tpm(data, express_counts_combined['tpm'])
            data = dd.set_express_fpkm(data, express_counts_combined['fpkm'])
            data = dd.set_isoform_to_gene(
                data, express_counts_combined['isoform_to_gene'])
        if dexseq_combined:
            data = dd.set_dexseq_counts(data, dexseq_combined_file)
        if gtf_file:
            data = dd.set_tx2gene(data, tx2gene_file)
        data = dd.set_tximport(data, tximport)
        updated_samples.append([data])
    return updated_samples
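
Conceptually, combining per-sample count files is a column-wise join keyed on feature id. The sketch below shows that idea with pandas; the two-column file layout and the function name are assumptions for illustration, not bcbio's count.combine_count_files implementation.

import pandas as pd

def combine_counts(count_files, sample_names):
    """Join per-sample count files (feature id, count) into one table."""
    columns = []
    for fname, sample in zip(count_files, sample_names):
        # Each input is assumed to be tab-separated with no header:
        # <feature id>\t<count>
        df = pd.read_csv(fname, sep="\t", header=None, index_col=0,
                         names=["id", sample])
        columns.append(df)
    # Outer join on the feature id index, one column per sample.
    return pd.concat(columns, axis=1)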
Beispiel #38
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = list(utils.flatten(samples))
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    out_dir = utils.safe_makedir(os.path.join(work_dir, "qc", "multiqc"))
    out_data = os.path.join(out_dir, "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    file_list = os.path.join(out_dir, "list_files.txt")
    work_samples = cwlutils.unpack_tarballs(
        [utils.deepish_copy(x) for x in samples], samples[0])
    work_samples = _summarize_inputs(work_samples, out_dir)
    if not utils.file_exists(out_file):
        with tx_tmpdir(samples[0], work_dir) as tx_out:
            in_files = _get_input_files(work_samples, out_dir, tx_out)
            in_files += _merge_metrics(work_samples, out_dir)
            if _one_exists(in_files):
                with utils.chdir(out_dir):
                    config_file = _create_config_file(out_dir, work_samples)
                    input_list_file = _create_list_file(in_files, file_list)
                    if dd.get_tmp_dir(samples[0]):
                        export_tmp = "export TMPDIR=%s && " % dd.get_tmp_dir(
                            samples[0])
                    else:
                        export_tmp = ""
                    locale_export = utils.locale_export()
                    path_export = utils.local_path_export()
                    other_opts = config_utils.get_resources(
                        "multiqc", samples[0]["config"]).get("options", [])
                    other_opts = " ".join([str(x) for x in other_opts])
                    cmd = (
                        "{path_export}{export_tmp}{locale_export} "
                        "{multiqc} -c {config_file} -f -l {input_list_file} {other_opts} -o {tx_out}"
                    )
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    samples = _group_by_sample_and_batch(samples)
    if utils.file_exists(out_file) and samples:
        data_files = set()
        for i, data in enumerate(samples):
            data_files.add(
                os.path.join(out_dir, "report", "metrics",
                             dd.get_sample_name(data) + "_bcbio.txt"))
        data_files.add(
            os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
        data_files.add(os.path.join(out_dir, "multiqc_config.yaml"))
        for f in glob.glob(os.path.join(out_dir, "multiqc_data", "*")):
            data_files.add(f)
        data_files = [f for f in data_files if f and utils.file_exists(f)]
        if "summary" not in samples[0]:
            samples[0]["summary"] = {}
        samples[0]["summary"]["multiqc"] = {
            "base": out_file,
            "secondary": data_files
        }

        data_json = os.path.join(out_dir, "multiqc_data", "multiqc_data.json")
        data_json_final = _save_uploaded_data_json(
            samples, data_json, os.path.join(out_dir, "multiqc_data"))
        if data_json_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                data_json_final)

        # Prepare final file list and inputs for downstream usage
        file_list_final = _save_uploaded_file_list(samples, file_list, out_dir)
        if file_list_final:
            samples[0]["summary"]["multiqc"]["secondary"].append(
                file_list_final)
            if any([cwlutils.is_cwl_run(d) for d in samples]):
                for indir in ["inputs", "report"]:
                    tarball = os.path.join(out_dir,
                                           "multiqc-%s.tar.gz" % (indir))
                    if not utils.file_exists(tarball):
                        with utils.chdir(out_dir):
                            cmd = ["tar", "-czvpf", tarball, indir]
                            do.run(cmd, "Compress multiqc inputs: %s" % indir)
                    samples[0]["summary"]["multiqc"]["secondary"].append(
                        tarball)

    if any([cwlutils.is_cwl_run(d) for d in samples]):
        samples = _add_versions(samples)

    return [[data] for data in samples]
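
The command assembly follows a recurring bcbio pattern: write the inputs to a list file, build a shell template and fill it with str.format(**locals()). A stripped-down, standard-library sketch of that pattern (the helper name and arguments are illustrative):

def build_multiqc_command(multiqc, config_file, in_files, out_dir, file_list):
    """Write the input list file and return the multiqc command string."""
    with open(file_list, "w") as out_handle:
        for fname in in_files:
            out_handle.write(fname + "\n")
    input_list_file = file_list
    tx_out = out_dir  # the real code runs inside a transactional temp directory
    cmd = "{multiqc} -c {config_file} -f -l {input_list_file} -o {tx_out}"
    return cmd.format(**locals())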
Beispiel #39
0
def _maybe_add_salmon_files(algorithm, sample, out):
    salmon_dir = os.path.join(dd.get_work_dir(sample), "salmon",
                              dd.get_sample_name(sample))
    if os.path.exists(salmon_dir):
        out.append({"path": salmon_dir, "type": "directory", "ext": "salmon"})
    return out
Beispiel #40
0
def umi_transform(data):
    """
    Transform each read by identifying its barcode and UMI and putting the
    information in the read name.
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed. Assuming non-umi data." % fq1)
            return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data

    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
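
The pre-transformed check above works because umis fastqtransform embeds the barcode and UMI into the read name, so transformed headers contain a UMI_ tag. A small self-contained sketch of that check:

import gzip

def looks_umi_transformed(fastq_path):
    """Return True if the first read name already carries UMI_ information."""
    opener = gzip.open if fastq_path.endswith(".gz") else open
    with opener(fastq_path, "rt") as in_handle:
        first_header = next(in_handle)
    return "UMI_" in first_header

# Illustrative transformed read name (exact layout depends on the transform used):
assert "UMI_" in "@A00123:11:H7XXX:1:1101:1000:2000:CELL_ACGTACGT:UMI_TTGCAAGG\n"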
Beispiel #41
0
def get_kallisto_h5(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "quant")
    return os.path.join(kallisto_dir, "abundance.h5")
Beispiel #42
0
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.
    """
    out = [{"path": sample["provenance"]["programs"]}]
    if os.path.exists(tz.get_in(["provenance", "data"], sample) or ""):
        out.append({"path": sample["provenance"]["data"]})
    for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]:
        if os.path.exists(
                os.path.join(log.get_log_dir(sample["config"]), fname)):
            out.append({
                "path":
                os.path.join(log.get_log_dir(sample["config"]), fname),
                "type":
                "external_command_log",
                "ext":
                ""
            })

    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    mixup_check = tz.get_in(["summary", "mixup_check"], sample)
    if mixup_check:
        out.append({
            "path": sample["summary"]["mixup_check"],
            "type": "directory",
            "ext": "mixup_check"
        })

    report = os.path.join(dd.get_work_dir(sample), "report")
    if utils.file_exists(report):
        out.append({"path": report, "type": "directory", "ext": "report"})

    multiqc = tz.get_in(["summary", "multiqc"], sample)
    if multiqc:
        out.extend(_flatten_file_with_secondary(multiqc, "multiqc"))

    if sample.get("seqcluster", {}):
        out.append({
            "path": sample["seqcluster"].get("out_dir"),
            "type": "directory",
            "ext": "seqcluster"
        })

    if sample.get("report", None):
        out.append({
            "path": os.path.dirname(sample["report"]),
            "type": "directory",
            "ext": "seqclusterViz"
        })

    for x in sample.get("variants", []):
        if "pop_db" in x:
            out.append({
                "path": x["pop_db"],
                "type": "sqlite",
                "variantcaller": x["variantcaller"]
            })
    for x in sample.get("variants", []):
        if "population" in x:
            pop_db = tz.get_in(["population", "db"], x)
            if pop_db:
                out.append({
                    "path": pop_db,
                    "type": "sqlite",
                    "variantcaller": x["variantcaller"]
                })
            suffix = "-annotated-decomposed" if tz.get_in(
                ("population", "decomposed"), x) else "-annotated"
            out.extend([
                _add_batch(x, sample)
                for x in _get_variant_file(x, ("population", "vcf"),
                                           suffix=suffix)
            ])
    for x in sample.get("variants", []):
        if x.get("validate") and x["validate"].get("grading_summary"):
            out.append({"path": x["validate"]["grading_summary"]})
            break
    if "coverage" in sample:
        cov_db = tz.get_in(["coverage", "summary"], sample)
        if cov_db:
            out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"})
        all_coverage = tz.get_in(["coverage", "all"], sample)
        if all_coverage:
            out.append({
                "path": all_coverage,
                "type": "bed",
                "ext": "coverage"
            })

    if dd.get_mirna_counts(sample):
        out.append({"path": dd.get_mirna_counts(sample)})
    if dd.get_isomir_counts(sample):
        out.append({"path": dd.get_isomir_counts(sample)})
    if dd.get_novel_mirna_counts(sample):
        out.append({"path": dd.get_novel_mirna_counts(sample)})
    if dd.get_novel_isomir_counts(sample):
        out.append({"path": dd.get_novel_isomir_counts(sample)})
    if dd.get_combined_counts(sample):
        count_file = dd.get_combined_counts(sample)
        if sample["analysis"].lower() == "scrna-seq":
            out.append({"path": count_file, "type": "mtx"})
            out.append({"path": count_file + ".rownames", "type": "rownames"})
            out.append({"path": count_file + ".colnames", "type": "colnames"})
        else:
            out.append({"path": dd.get_combined_counts(sample)})
    if dd.get_annotated_combined_counts(sample):
        out.append({"path": dd.get_annotated_combined_counts(sample)})
    if dd.get_combined_fpkm(sample):
        out.append({"path": dd.get_combined_fpkm(sample)})
    if dd.get_combined_fpkm_isoform(sample):
        out.append({"path": dd.get_combined_fpkm_isoform(sample)})
    if dd.get_transcript_assembler(sample):
        out.append({"path": dd.get_merged_gtf(sample)})
    if dd.get_dexseq_counts(sample):
        out.append({"path": dd.get_dexseq_counts(sample)})
    if dd.get_express_counts(sample):
        out.append({"path": dd.get_express_counts(sample)})
    if dd.get_express_fpkm(sample):
        out.append({"path": dd.get_express_fpkm(sample)})
    if dd.get_express_tpm(sample):
        out.append({"path": dd.get_express_tpm(sample)})
    if dd.get_isoform_to_gene(sample):
        out.append({"path": dd.get_isoform_to_gene(sample)})
    if dd.get_square_vcf(sample):
        out.append({"path": dd.get_square_vcf(sample)})
    if dd.get_sailfish_transcript_tpm(sample):
        out.append({"path": dd.get_sailfish_transcript_tpm(sample)})
    if dd.get_sailfish_gene_tpm(sample):
        out.append({"path": dd.get_sailfish_gene_tpm(sample)})
    if dd.get_tx2gene(sample):
        out.append({"path": dd.get_tx2gene(sample)})
    if dd.get_spikein_counts(sample):
        out.append({"path": dd.get_spikein_counts(sample)})
    transcriptome_dir = os.path.join(dd.get_work_dir(sample), "inputs",
                                     "transcriptome")
    if os.path.exists(transcriptome_dir):
        out.append({
            "path": transcriptome_dir,
            "type": "directory",
            "ext": "transcriptome"
        })
    return _add_meta(out, config=upload_config)
Beispiel #43
0
def cpg_stats(sample):
    dtdepth = Counter()
    dtratio = Counter()
    work_dir = dd.get_work_dir(sample)
    sample_name = dd.get_sample_name(sample)
    depth_out = os.path.join(work_dir, "cpg_split", sample_name, "depth.tsv")
    ratio_out = os.path.join(work_dir, "cpg_split", sample_name, "ratio.tsv")
    hmc_out = os.path.join(work_dir, "cpg_split", sample_name, "%s_hmc.tsv.gz" % sample_name)
    hmc_files = " ".join(sample["hmc_split"])
    with file_transaction(hmc_out) as tx_out:
        header = " ".join(["chr", "pos", "strand", "context", "ratio", "eff_CT_counts",
                           "C_counts", "CT_counts", "rev_G_counts", "rev_GA_counts",
                           "CI_lover", "CI_upper", "ox_ratio", "ox_eff_CT_counts",
                           "ox_C_counts", "ox_CT_counts", "ox_rev_G_counts",
                           "ox_rev_GA_counts", "ox_CI_lower", "ox_CI_upper", "pvalue"])
        cmd = "cat <(echo {header} | sed 's/ /\t/g') {hmc_files} | gzip -c > {tx_out}"
        if not file_exists(hmc_out):
            do.run(cmd.format(**locals()), "Merging %s" % sample_name)
    work_bam = dd.get_work_bam(sample)
    if not file_exists(depth_out):
        for cpg_file in sample["cpg_split"]:
            logger.debug("Reading %s of sample %s" % (cpg_file, sample_name))
            if file_exists(cpg_file):
                with open(cpg_file) as in_handle:
                    for line in in_handle:
                        cols = line.strip().split("\t")
                        if cols[3] == "CG":
                            ratio = int(float(cols[4]) * 100)
                            dtratio[ratio] += 1
                            depth = int(math.ceil(float(cols[5]))) if float(cols[5]) < 50 else 50
                            dtdepth[depth] += 1
        pd.DataFrame(dtdepth, index=[1]).to_csv(depth_out, sep="\t")
        pd.DataFrame(dtratio, index=[1]).to_csv(ratio_out, sep="\t")
    # calculate mlml
    if not hmc_files:
        return None
    out_dir = safe_makedir(os.path.join(work_dir, "mlml", sample_name))
    mlml_out = os.path.join(out_dir, "%s_mlml.txt.gz" % sample_name)
    if not file_exists(mlml_out) and file_exists(hmc_out):
        with chdir(out_dir):
            with file_transaction(mlml_out) as tx_out:
                tx_out_1 = "%s_noheader" % tx_out
                tx_out_2 = "%s_alone" % tx_out
                cmd = " ".join(["zcat %s | sed -e '1d' | awk " % hmc_out, ''' '{rounded = sprintf("%d", $14);print $1"\\t"$2"\t"$3"\\tCpG\\t"$13"\\t"rounded}' ''', "> %s_ox.txt" % sample_name])
                do.run(cmd, "Creating OX input for %s" % sample_name)
                cmd = " ".join(["zcat %s | sed -e '1d' | awk " % hmc_out, ''' '{rounded = sprintf("%d", $6);print $1"\\t"$2"\\t"$3"\\tCpG\\t"$5"\\t"rounded}' ''', "> %s_bs.txt" % sample_name])
                do.run(cmd, "Creating BS input for %s" % sample_name)
                cmd = ("mlml -o {tx_out_1} -u {sample_name}_bs.txt -m {sample_name}_ox.txt -v").format(**locals())
                do.run(cmd, "Run MLML with %s" % sample_name)
                cmd = ("cat <(echo chrom start end mC hmC C conflicts | sed 's/ /\t/g') {tx_out_1} | gzip -c > {tx_out_2} ").format(**locals())
                do.run(cmd, "")
                tx_out_1 = "%s.woFDR.gz" % tx_out
                cmd = ("paste <(zcat {hmc_out}) <(zcat {tx_out_2}) | gzip -c > {tx_out}").format(**locals())
                do.run(cmd, "Merge data for %s" % sample_name)

    merge_out = [os.path.join(out_dir, "%s_merged.txt.gz" % sample_name), os.path.join(out_dir, "%s_merged_pass.txt.gz" % sample_name)]
    if not file_exists(merge_out[0]):
        with file_transaction(merge_out) as tx_outs:
            tx_out, tx_out_pass = tx_outs
            df = pd.read_csv(mlml_out, sep="\t")
            import statsmodels.sandbox.stats.multicomp
            df["fdr"] = statsmodels.sandbox.stats.multicomp.fdrcorrection0(df["pvalue"])[1]
            df_p_pass = df[df.fdr<0.05]
            logger.debug("Pass FDR 5 pct in %s:%s " % (sample_name, float(df_p_pass.shape[1])/float(df.shape[1])))
            df.to_csv(tx_out, sep="\t")
            df_p_pass.to_csv(tx_out_pass, sep="\t")
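
The final step applies a Benjamini-Hochberg FDR correction through the older statsmodels.sandbox API. The same correction is available via statsmodels.stats.multitest.multipletests; a minimal sketch with made-up p-values, assuming pandas and statsmodels are installed:

import pandas as pd
from statsmodels.stats.multitest import multipletests

df = pd.DataFrame({"pos": [100, 200, 300, 400],
                   "pvalue": [0.001, 0.02, 0.2, 0.8]})
reject, fdr, _, _ = multipletests(df["pvalue"], alpha=0.05, method="fdr_bh")
df["fdr"] = fdr
df_pass = df[df["fdr"] < 0.05]
# Fraction of sites passing a 5% FDR threshold
print(float(df_pass.shape[0]) / float(df.shape[0]))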
Beispiel #44
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches and a hook for
    adding additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    orig_items = items
    items = [
        utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)
    ]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return orig_items
    out_files = {}
    for group_id, gitems in itertools.groupby(
            items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[
            tz.get_in(["depth", "bins", "target"], x),
            tz.get_in(["depth", "bins", "antitarget"], x)
        ] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(
            dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(inputs[0]), "structural",
                         dd.get_sample_name(inputs[0]), "bins"))
        back_file = cnvkit.cnvkit_background(
            cnns,
            os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
            backgrounds or inputs, target_bed, antitarget_bed)
        for data in inputs:
            work_dir = utils.safe_makedir(
                os.path.join(dd.get_work_dir(data), "structural",
                             dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = cnvkit.run_fix(
                    tz.get_in(["depth", "bins", "target"], data),
                    tz.get_in(["depth", "bins", "antitarget"],
                              data), back_file,
                    os.path.join(
                        work_dir,
                        "%s-normalized.cnr" % (dd.get_sample_name(data))),
                    data)
                out_files[dd.get_sample_name(data)] = fix_file
    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(
                data)]
        out.append([data])
    return out
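
Grouping samples by their bin group uses itertools.groupby, which only merges consecutive items with the same key, so the usual pattern sorts by that key first. A small standard-library sketch (the sample dictionaries are illustrative):

import itertools
from operator import itemgetter

samples = [{"name": "s1", "group": "batchA"},
           {"name": "s3", "group": "batchB"},
           {"name": "s2", "group": "batchA"}]

keyfn = itemgetter("group")
for group_id, gitems in itertools.groupby(sorted(samples, key=keyfn), key=keyfn):
    print(group_id, [d["name"] for d in gitems])
# batchA ['s1', 's2']
# batchB ['s3']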
Beispiel #45
0
def chipseq_count(data):
    """
    Count reads mapping to ChIP/ATAC consensus peaks with featureCounts.
    """
    method = dd.get_chip_method(data)
    if method == "chip":
        in_bam = dd.get_work_bam(data)
    elif method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            in_bam = tz.get_in(("atac", "align", "NF"), data)
        else:
            in_bam = tz.get_in(("atac", "align", "full"), data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    consensus_file = tz.get_in(("peaks_files", "consensus", "main"), data)
    if not consensus_file:
        return [[data]]
    saf_file = os.path.splitext(consensus_file)[0] + ".saf"
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "consensus")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        if method == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
            else:
                data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
        elif method == "chip":
            data = tz.assoc_in(data, ("peak_counts"), count_file)
        return [[data]]
    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = (
        "{featureCounts} -F SAF -a {saf_file} -o {tx_count_file} -s {strand_flag} "
        "{paired_flag} {sorted_bam}")

    message = ("Count reads in {sorted_bam} overlapping {saf_file} using "
               "featureCounts.")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    if method == "atac":
        if bam.is_paired(dd.get_work_bam(data)):
            data = tz.assoc_in(data, ("peak_counts", "NF"), count_file)
        else:
            data = tz.assoc_in(data, ("peak_counts", "full"), count_file)
    elif method == "chip":
        data = tz.assoc_in(data, ("peak_counts"), count_file)
    return [[data]]
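
tz.assoc_in expects a sequence of keys describing the nested path (a bare string would be iterated character by character), which is why the peak_counts keys are written as tuples above. A minimal sketch, assuming toolz is installed:

from toolz import assoc_in

data = {}
data = assoc_in(data, ("peak_counts", "NF"), "/path/to/NF.counts")
data = assoc_in(data, ("peak_counts", "full"), "/path/to/full.counts")
print(data)
# {'peak_counts': {'NF': '/path/to/NF.counts', 'full': '/path/to/full.counts'}}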
Beispiel #46
0
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals, data):
    """Step 1 of GATK recalibration process, producing table of covariates.

    For GATK 4 we use local multicore spark runs:
    https://github.com/broadinstitute/gatk/issues/2345

    For GATK3, large whole genome BAM files take an excessively long time to recalibrate and
    the extra inputs don't help much beyond a certain point. See the 'Downsampling analysis'
    plots in the GATK documentation:

    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = os.path.join(
        dd.get_work_dir(data), "align", dd.get_sample_name(data),
        "%s-recal.grp" %
        utils.splitext_plus(os.path.basename(dup_align_bam))[0])
    if not utils.file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with file_transaction(data, out_file) as tx_out_file:
                gatk_type = broad_runner.gatk_type()
                assert gatk_type in ["restricted", "gatk4"], \
                    "Require full version of GATK 2.4+ or GATK4 for BQSR"
                params = ["-I", dup_align_bam]
                cores = dd.get_num_cores(data)
                if gatk_type == "gatk4":
                    params += [
                        "-T", "BaseRecalibratorSpark", "--sparkMaster",
                        "local[%s]" % cores, "--output", tx_out_file,
                        "--reference",
                        dd.get_ref_twobit(data)
                    ]
                else:
                    params += [
                        "-T", "BaseRecalibrator", "-o", tx_out_file, "-R",
                        ref_file
                    ]
                    downsample_pct = bam.get_downsample_pct(
                        dup_align_bam, target_counts, data)
                    if downsample_pct:
                        params += [
                            "--downsample_to_fraction",
                            str(downsample_pct), "--downsampling_type",
                            "ALL_READS"
                        ]
                    if platform.lower() == "solid":
                        params += [
                            "--solid_nocall_strategy", "PURGE_READ",
                            "--solid_recal_mode", "SET_Q_ZERO_BASE_N"
                        ]
                if dbsnp_file:
                    params += ["--knownSites", dbsnp_file]
                if intervals:
                    params += [
                        "-L", intervals, "--interval_set_rule", "INTERSECTION"
                    ]
                memscale = {
                    "magnitude": 0.9 * cores,
                    "direction": "increase"
                } if cores > 1 else None
                broad_runner.run_gatk(params,
                                      os.path.dirname(tx_out_file),
                                      memscale=memscale)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
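
The downsampling decision reduces to comparing the BAM's read count against the target and handing GATK3 a fraction when the file is too large. A minimal, illustrative sketch of that calculation (a hypothetical helper; the real bam.get_downsample_pct takes the BAM file, target count and pipeline data):

def downsample_fraction(total_reads, target_counts=1e8):
    """Return the fraction to downsample to, or None if the BAM is small enough."""
    if total_reads <= target_counts:
        return None
    return float(target_counts) / float(total_reads)

assert downsample_fraction(5e7) is None             # small BAM: no downsampling
assert abs(downsample_fraction(4e8) - 0.25) < 1e-9  # 400M reads -> keep 25%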
Beispiel #47
0
def _get_files_project(sample, upload_config):
    """Retrieve output files associated with an entire analysis project.
    """
    out = [{"path": sample["provenance"]["programs"]}]
    for fname in ["bcbio-nextgen.log", "bcbio-nextgen-commands.log"]:
        if os.path.exists(
                os.path.join(log.get_log_dir(sample["config"]), fname)):
            out.append({
                "path":
                os.path.join(log.get_log_dir(sample["config"]), fname),
                "type":
                "external_command_log",
                "ext":
                ""
            })

    if "summary" in sample and sample["summary"].get("project"):
        out.append({"path": sample["summary"]["project"]})
    mixup_check = tz.get_in(["summary", "mixup_check"], sample)
    if mixup_check:
        out.append({
            "path": sample["summary"]["mixup_check"],
            "type": "directory",
            "ext": "mixup_check"
        })

    report = os.path.join(dd.get_work_dir(sample), "report")
    if utils.file_exists(report):
        out.append({"path": report, "type": "directory", "ext": "report"})

    if sample.get("seqcluster", None):
        out.append({
            "path": sample["seqcluster"],
            "type": "directory",
            "ext": "seqcluster"
        })

    for x in sample.get("variants", []):
        if "pop_db" in x:
            out.append({
                "path": x["pop_db"],
                "type": "sqlite",
                "variantcaller": x["variantcaller"]
            })
    for x in sample.get("variants", []):
        if "population" in x:
            pop_db = tz.get_in(["population", "db"], x)
            if pop_db:
                out.append({
                    "path": pop_db,
                    "type": "sqlite",
                    "variantcaller": x["variantcaller"]
                })
            out.extend(_get_variant_file(x, ("population", "vcf")))
    for x in sample.get("variants", []):
        if x.get("validate") and x["validate"].get("grading_summary"):
            out.append({"path": x["validate"]["grading_summary"]})
            break
    if "coverage" in sample:
        cov_db = tz.get_in(["coverage", "summary"], sample)
        if cov_db:
            out.append({"path": cov_db, "type": "sqlite", "ext": "coverage"})
        all_coverage = tz.get_in(["coverage", "all"], sample)
        if all_coverage:
            out.append({
                "path": all_coverage,
                "type": "bed",
                "ext": "coverage"
            })

    if dd.get_mirna_counts(sample):
        out.append({"path": dd.get_mirna_counts(sample)})
    if dd.get_isomir_counts(sample):
        out.append({"path": dd.get_isomir_counts(sample)})
    if dd.get_novel_mirna_counts(sample):
        out.append({"path": dd.get_novel_mirna_counts(sample)})
    if dd.get_novel_isomir_counts(sample):
        out.append({"path": dd.get_novel_isomir_counts(sample)})
    if dd.get_combined_counts(sample):
        out.append({"path": dd.get_combined_counts(sample)})
    if dd.get_annotated_combined_counts(sample):
        out.append({"path": dd.get_annotated_combined_counts(sample)})
    if dd.get_combined_fpkm(sample):
        out.append({"path": dd.get_combined_fpkm(sample)})
    if dd.get_combined_fpkm_isoform(sample):
        out.append({"path": dd.get_combined_fpkm_isoform(sample)})
    if dd.get_transcript_assembler(sample):
        out.append({"path": dd.get_merged_gtf(sample)})
    if dd.get_dexseq_counts(sample):
        out.append({"path": dd.get_dexseq_counts(sample)})
    if dd.get_express_counts(sample):
        out.append({"path": dd.get_express_counts(sample)})
    if dd.get_express_fpkm(sample):
        out.append({"path": dd.get_express_fpkm(sample)})
    if dd.get_express_tpm(sample):
        out.append({"path": dd.get_express_tpm(sample)})
    if dd.get_isoform_to_gene(sample):
        out.append({"path": dd.get_isoform_to_gene(sample)})
    if dd.get_square_vcf(sample):
        out.append({"path": dd.get_square_vcf(sample)})
    if dd.get_sailfish_tidy(sample):
        out.append({"path": dd.get_sailfish_tidy(sample)})
    if dd.get_sailfish_transcript_tpm(sample):
        out.append({"path": dd.get_sailfish_transcript_tpm(sample)})
    if dd.get_sailfish_gene_tpm(sample):
        out.append({"path": dd.get_sailfish_gene_tpm(sample)})
    return _add_meta(out, config=upload_config)
Beispiel #48
0
def normalize_sv_coverage(*items):
    """Normalize CNV coverage depths by GC, repeats and background.

    Provides normalized output based on CNVkit approaches and a hook for
    adding additional methods in the future:

    - reference: calculates reference backgrounds from normals and pools
      including GC and repeat information
    - fix: Uses background to normalize coverage estimations
    http://cnvkit.readthedocs.io/en/stable/pipeline.html#fix
    """
    from bcbio.structural import cnvkit
    from bcbio.structural import shared as sshared
    items = [utils.to_single_data(x) for x in cwlutils.handle_combined_input(items)]
    if all(not cnvkit.use_general_sv_bins(x) for x in items):
        return [[d] for d in items]
    out_files = {}
    back_files = {}
    for group_id, gitems in itertools.groupby(items, lambda x: tz.get_in(["regions", "bins", "group"], x)):
        # No CNVkit calling for this particular set of samples
        if group_id is None:
            continue
        inputs, backgrounds = sshared.find_case_control(list(gitems))
        cnns = reduce(operator.add, [[tz.get_in(["depth", "bins", "target"], x),
                                      tz.get_in(["depth", "bins", "antitarget"], x)] for x in backgrounds], [])
        assert inputs, "Did not find inputs for sample batch: %s" % (" ".join(dd.get_sample_name(x) for x in items))
        for d in inputs:
            if tz.get_in(["depth", "bins", "target"], d):
                target_bed = tz.get_in(["depth", "bins", "target"], d)
                antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], d)
        work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(inputs[0]), "structural",
                                                   dd.get_sample_name(inputs[0]), "bins"))
        input_backs = set(filter(lambda x: x is not None,
                                 [dd.get_background_cnv_reference(d) for d in inputs]))
        if input_backs:
            assert len(input_backs) == 1, "Multiple backgrounds in group: %s" % list(input_backs)
            back_file = list(input_backs)[0]
        else:
            back_file = cnvkit.cnvkit_background(cnns,
                                                 os.path.join(work_dir, "background-%s-cnvkit.cnn" % (group_id)),
                                                backgrounds or inputs, target_bed, antitarget_bed)
        fix_cmd_inputs = []
        for data in inputs:
            work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                                       dd.get_sample_name(data), "bins"))
            if tz.get_in(["depth", "bins", "target"], data):
                fix_file = os.path.join(work_dir, "%s-normalized.cnr" % (dd.get_sample_name(data)))
                fix_cmd_inputs.append((tz.get_in(["depth", "bins", "target"], data),
                                       tz.get_in(["depth", "bins", "antitarget"], data),
                                       back_file, fix_file, data))
                out_files[dd.get_sample_name(data)] = fix_file
                back_files[dd.get_sample_name(data)] = back_file
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        run_multicore(cnvkit.run_fix_parallel, fix_cmd_inputs, inputs[0]["config"], parallel)

    out = []
    for data in items:
        if dd.get_sample_name(data) in out_files:
            data["depth"]["bins"]["background"] = back_files[dd.get_sample_name(data)]
            data["depth"]["bins"]["normalized"] = out_files[dd.get_sample_name(data)]
        out.append([data])
    return out
Beispiel #49
0
def work_dir(data):
    return utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage",
                     dd.get_sample_name(data), "sambamba"))
Beispiel #50
0
def run_mosdepth(data,
                 target_name,
                 bed_file,
                 per_base=False,
                 quantize=None,
                 thresholds=None):
    """Run mosdepth generating distribution, region depth and per-base depth.
    """
    MosdepthCov = collections.namedtuple(
        "MosdepthCov",
        ("dist", "per_base", "regions", "quantize", "thresholds"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "coverage",
                     dd.get_sample_name(data)))
    prefix = os.path.join(work_dir,
                          "%s-%s" % (dd.get_sample_name(data), target_name))
    old_dist_file = "%s.mosdepth.dist.txt" % (prefix)
    out = MosdepthCov(
        (old_dist_file if utils.file_uptodate(
            old_dist_file, bam_file) else "%s.mosdepth.%s.dist.txt" %
         (prefix, "region" if bed_file else "global")),
        ("%s.per-base.bed.gz" % prefix) if per_base else None,
        ("%s.regions.bed.gz" % prefix) if bed_file else None,
        ("%s.quantized.bed.gz" % prefix) if quantize else None,
        ("%s.thresholds.bed.gz" % prefix) if thresholds else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_prefix = os.path.join(os.path.dirname(tx_out_file),
                                     os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join([
                    "export MOSDEPTH_Q%s=%s" % (i, x)
                    for (i, x) in enumerate(quantize[1])
                ])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""

            thresholds_cmdl = (
                "-T " +
                ",".join([str(t)
                          for t in thresholds])) if out.thresholds else ""
            cmd = (
                "{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} "
                "{tx_prefix} {bam_file} {thresholds_cmdl}")
            message = "Calculating coverage: %s %s" % (
                dd.get_sample_name(data), target_name)
            do.run(cmd.format(**locals()), message.format(**locals()))
            if out.per_base:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.per_base)), out.per_base)
            if out.regions:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.regions)), out.regions)
            if out.quantize:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.quantize)), out.quantize)
            if out.thresholds:
                shutil.move(
                    os.path.join(os.path.dirname(tx_out_file),
                                 os.path.basename(out.thresholds)),
                    out.thresholds)
    return out
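
The MosdepthCov namedtuple simply maps the requested outputs onto mosdepth's fixed file suffixes for a given prefix. A small standard-library sketch of that mapping, using the same suffixes as the code above (function name and example arguments are illustrative):

import collections

MosdepthCov = collections.namedtuple(
    "MosdepthCov", ("dist", "per_base", "regions", "quantize", "thresholds"))

def expected_outputs(prefix, bed_file=None, per_base=False, quantize=None, thresholds=None):
    return MosdepthCov(
        "%s.mosdepth.%s.dist.txt" % (prefix, "region" if bed_file else "global"),
        ("%s.per-base.bed.gz" % prefix) if per_base else None,
        ("%s.regions.bed.gz" % prefix) if bed_file else None,
        ("%s.quantized.bed.gz" % prefix) if quantize else None,
        ("%s.thresholds.bed.gz" % prefix) if thresholds else None)

print(expected_outputs("NA12878-exome", bed_file="targets.bed", thresholds=[1, 5, 10]))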
Beispiel #51
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                if vcinfo["vrn_file"] and vcfutils.vcf_has_nonfiltered_variants(
                        vcinfo["vrn_file"]):
                    vcf_file = vcinfo["vrn_file"]
                    break
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not vcfanno.is_human(data):
        if not peddy:
            reason = "peddy executable not found"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            cmd = (
                "{peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return ((l.find("IndexError") >= 0
                             and l.find("is out of bounds for axis") >= 0) or
                            (l.find("n_components=") >= 0 and
                             l.find("must be between 1 and n_features=") >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
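
The error handling keeps only the tail of the peddy stderr log in a bounded collections.deque and scans it for known, ignorable messages. A minimal standard-library sketch of that pattern (function names are illustrative):

import collections

def tail_lines(path, n=100):
    """Return the last n lines of a (possibly large) log file."""
    to_show = collections.deque(maxlen=n)
    with open(path) as in_handle:
        for line in in_handle:
            to_show.append(line)
    return list(to_show)

def only_interval_warnings(lines):
    # Mirrors the all_line_errors check above: every line is cyvcf2 interval chatter.
    return all(line.find("no intervals found for") >= 0 for line in lines)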
Beispiel #52
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    file_fpaths = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data,
                                         {}).items():
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif isinstance(pfiles, six.string_types):
                pfiles = [pfiles]
            file_fpaths.extend(pfiles)
    file_fpaths.append(
        os.path.join(out_dir, "report", "metrics", "target_info.yaml"))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    file_fpaths = list(set(file_fpaths))
    # Back compatible -- to migrate to explicit specifications in input YAML
    file_fpaths += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            file_fpaths = [
                fpath for fpath in file_fpaths if _check_multiqc_input(fpath)
                and _is_good_file_for_multiqc(fpath)
            ]
            input_list_file = _create_list_file(file_fpaths)
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_list_file:
                cmd = "{export_tmp} {multiqc} -f -l {input_list_file} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report",
                                                     "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
        out.append(data)
    return [[fpath] for fpath in out]
Beispiel #53
0
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(
                vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(
                    d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy",
                                     data) if config_utils.program_installed(
                                         "peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info(
            "peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
            "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirect stderr because cyvcf2 is incredibly noisy with "no intervals found" messages
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            last_line = to_show[-1] if to_show else ""
            if "IndexError" in last_line and "is out of bounds for axis" in last_line:
                logger.info(
                    "Skipping peddy because no variants overlap with checks: %s"
                    % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write(
                        "peddy did not find overlaps with 1kg sites in VCF, skipping"
                    )
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
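
The command string built above maps onto a straightforward peddy invocation. A minimal standalone sketch (placeholder paths; assumes peddy is on PATH and the VCF/PED pair already exists):

import subprocess

def run_peddy_standalone(vcf_file, ped_file, prefix, cores=1):
    """Minimal sketch of the peddy call assembled above, outside of bcbio.

    vcf_file, ped_file and prefix are caller-supplied placeholders; stderr is
    captured to a log because cyvcf2 emits many "no intervals found" messages.
    """
    stderr_log = prefix + "-stderr.log"
    cmd = ["peddy", "-p", str(cores), "--plot", "--prefix", prefix,
           vcf_file, ped_file]
    with open(stderr_log, "w") as err:
        subprocess.check_call(cmd, stderr=err)
    return prefix + ".html"  # peddy writes its HTML report next to the prefix

# Usage:
# run_peddy_standalone("batch1.vcf.gz", "batch1.ped", "qc/batch1/peddy/batch1", cores=4)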
Beispiel #54
0
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug(
            "multiqc not found. Update bcbio_nextgen.py tools to fix this issue."
        )
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).items():
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning(
            "Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Backwards compatible -- to be migrated to explicit specifications in the input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_dir.strip():
                cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(
                            os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(
                            os.path.join(tx_out, "multiqc_report.html"),
                            out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"),
                                    out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(
                    os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(
                    os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report",
                                                     "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {
                    "base": out_file,
                    "secondary": data_files
                }
        out.append(data)
    return [[d] for d in out]
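
The folder de-duplication and the --flat fallback above can be reproduced in isolation. A small sketch with placeholder folder names (assumes the multiqc executable is on PATH):

import subprocess

def run_multiqc(folders, out_dir, max_folders=250):
    """Sketch of the MultiQC call assembled in summary() above.

    folders is a list of QC output directories; duplicates are dropped and,
    beyond max_folders entries, --flat is added so the report stays small.
    """
    folders = sorted(set(folders))
    opts = []
    if len(folders) > max_folders:
        folders = folders[:max_folders]
        opts.append("--flat")  # static plots instead of interactive ones
    cmd = ["multiqc", "-f"] + folders + ["-o", out_dir] + opts
    subprocess.check_call(cmd)

# Usage:
# run_multiqc(["qc/sample1/fastqc", "qc/sample1/samtools"], "multiqc")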
Beispiel #55
0
def _report_summary(samples, out_dir):
    """
    Run the coverage report with the bcbreport package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        if qsignature_fn:  # this needs to be inside the summary/qc dict
            if utils.file_exists(qsignature_fn) and not utils.file_exists("bcbio_qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")

        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)

        logger.info("summarize target information")
        if samples[0].get("analysis", "").lower() in ["variant", "variant2"]:
            samples = _merge_target_information(samples)

        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))

        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            # cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples
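
The runner script written above (library(rmarkdown); render(...)) is meant to be executed with Rscript, as the commented-out block suggests. A hedged sketch of that step, assuming Rscript and the rmarkdown package are installed:

import subprocess

def render_coverage_report(rmd_file, run_file):
    """Write and execute the small R runner produced by _report_summary above.

    rmd_file is the report-ready.Rmd emitted by bcbreport; run_file is the
    helper R script. Rscript availability is an assumption.
    """
    with open(run_file, "w") as out_handle:
        out_handle.write('library(rmarkdown)\nrender("%s")\n' % rmd_file)
    subprocess.check_call(["Rscript", run_file])

# Usage:
# render_coverage_report("report/report-ready.Rmd", "report/qc-coverage-report-run.R")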
Beispiel #56
0
def _sv_workdir(data):
    return utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "structural",
                     dd.get_sample_name(data), "purecn"))
Beispiel #57
0
def merge_split_alignments(samples, run_parallel):
    """Manage merging split alignments back into a final working BAM file.

    Perform de-duplication on the final merged file.
    """
    ready = []
    file_key = "work_bam"
    to_merge = collections.defaultdict(list)
    for data in (xs[0] for xs in samples):
        if data.get("combine"):
            out_key = tz.get_in(["combine", file_key, "out"], data)
            if not out_key:
                out_key = data["rgnames"]["lane"]
            to_merge[out_key].append(data)
        else:
            ready.append([data])
    ready_merge = []
    hla_merges = []
    for mgroup in to_merge.values():
        cur_data = mgroup[0]
        del cur_data["align_split"]
        for x in mgroup[1:]:
            cur_data["combine"][file_key]["extras"].append(x[file_key])
        ready_merge.append([cur_data])
        cur_hla = None
        for d in mgroup:
            hla_files = tz.get_in(["hla", "fastq"], d)
            if hla_files:
                if not cur_hla:
                    cur_hla = {
                        "rgnames": {
                            "sample": dd.get_sample_name(cur_data)
                        },
                        "config": cur_data["config"],
                        "dirs": cur_data["dirs"],
                        "hla": {
                            "fastq": []
                        }
                    }
                cur_hla["hla"]["fastq"].append(hla_files)
        if cur_hla:
            hla_merges.append([cur_hla])
    if not tz.get_in(["config", "algorithm", "kraken"], data):
        # kraken requires fasta filenames from data['files'] as input.
        # We don't want to remove those files if kraken qc is required.
        _save_fastq_space(samples)
    merged = run_parallel("delayed_bam_merge", ready_merge)
    hla_merge_raw = run_parallel("merge_split_alignments", hla_merges)
    hla_merges = {}
    for hla_merge in [x[0] for x in hla_merge_raw]:
        hla_merges[dd.get_sample_name(hla_merge)] = tz.get_in(["hla", "fastq"], hla_merge)

    # Add stable 'align_bam' target to use for retrieving raw alignment
    out = []
    for data in [x[0] for x in merged + ready]:
        if data.get("work_bam"):
            data["align_bam"] = data["work_bam"]
        if dd.get_sample_name(data) in hla_merges:
            data["hla"]["fastq"] = hla_merges[dd.get_sample_name(data)]
        else:
            hla_files = glob.glob(
                os.path.join(dd.get_work_dir(data), "align",
                             dd.get_sample_name(data), "hla", "*.fq"))
            if hla_files:
                data["hla"]["fastq"] = hla_files
        out.append([data])
    return out
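
To illustrate the grouping step at the top of merge_split_alignments, here is a toy version with plain dict lookups instead of tz.get_in; the sample dicts are made up and only carry the keys the grouping needs:

import collections

def group_for_merge(samples, file_key="work_bam"):
    """Toy version of the grouping logic in merge_split_alignments above.

    Samples carrying a "combine" entry are bucketed by their merge target
    (falling back to the lane name); everything else passes through as ready.
    """
    ready, to_merge = [], collections.defaultdict(list)
    for data in samples:
        if data.get("combine"):
            out_key = (data["combine"].get(file_key, {}).get("out")
                       or data["rgnames"]["lane"])
            to_merge[out_key].append(data)
        else:
            ready.append(data)
    return ready, to_merge

# Usage with toy inputs:
# ready, groups = group_for_merge([
#     {"combine": {"work_bam": {"out": "s1.bam"}}, "rgnames": {"lane": "L1"}},
#     {"rgnames": {"lane": "L2"}}])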