Example 1
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
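The coverage helpers `_normalized_bam_coverage` and `_bam_coverage` are not included in this excerpt. A minimal sketch of what the normalized step could look like, assuming it shells out to deepTools `bamCoverage` (the helper name, flag choices, and output path below are illustrative, not bcbio's actual implementation):

import subprocess

def normalized_bam_coverage_sketch(sample_name, work_bam, num_cores=1):
    """Hypothetical helper: build a CPM-normalized bigwig with deepTools bamCoverage."""
    out_bw = "%s.bw" % sample_name
    cmd = ["bamCoverage", "-b", work_bam, "-o", out_bw,
           "-p", str(num_cores), "--binSize", "20",
           "--normalizeUsing", "CPM"]
    # raises subprocess.CalledProcessError on failure, matching the fallback above
    subprocess.run(cmd, check=True)
    return out_bw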
Example 2
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                        "If BAM is not cleaned for peak calling, can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]
Example 3
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info(
                "Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                "If the BAM is not cleaned before peak calling, this can result in downstream errors."
            )
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data),
                                             dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed,
                                        data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                   dd.get_work_bam(data), data)
    return [[data]]
Example 4
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Applying BQSR recalibration with GATK: %s " %
                    str(dd.get_sample_name(data)))
        data["work_bam"] = _gatk_apply_bqsr(data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Applying BQSR recalibration with sentieon: %s " %
                    str(dd.get_sample_name(data)))
        data["work_bam"] = sentieon.apply_bqsr(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" %
                                  (dd.get_recalibrate(data)))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(
            data):
        utils.save_diskspace(orig_bam,
                             "BAM recalibrated to %s" % dd.get_work_bam(data),
                             data["config"])
    return data
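`_gatk_apply_bqsr` is shown later on this page (Example 21); outside of bcbio, the equivalent non-Spark step is a single GATK4 `ApplyBQSR` call. A hedged sketch of that command (all paths are placeholders):

import subprocess

def apply_bqsr_sketch(in_bam, ref_file, recal_table, out_bam):
    """Illustrative GATK4 ApplyBQSR invocation corresponding to the BQSR step above."""
    cmd = ["gatk", "ApplyBQSR",
           "-R", ref_file,
           "-I", in_bam,
           "--bqsr-recal-file", recal_table,
           "-O", out_bam]
    subprocess.run(cmd, check=True)
    return out_bam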
Example 5
def variants(data, out_dir):
    """Variants QC metrics"""
    if not "variants" in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
Example 6
def variants(data, out_dir):
    """Variants QC metrics"""
    if not "variants" in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                            "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                            "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
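`_summary_variants` is not shown. A minimal sketch of what summarizing the GC/depth parse file could look like, assuming a pandas-style aggregation over the CG/depth/sample table written above (function name and output layout are hypothetical):

import pandas as pd

def summary_variants_sketch(parse_file, qc_file):
    """Hypothetical summary of the CG/depth/sample TSV produced by bcftools query."""
    df = pd.read_csv(parse_file, sep="\t")  # columns: CG, depth, sample
    df["depth"] = pd.to_numeric(df["depth"], errors="coerce")
    summary = df.groupby("sample")["depth"].describe()
    summary.to_csv(qc_file, sep="\t")
    return qc_file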
Example 7
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
    if aligner == "bowtie2":
        data["raw_bam"] = dd.get_work_bam(data)
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
        return [[data]]
    return [[data]]
Example 8
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
    if aligner == "bowtie2":
        data["raw_bam"] = dd.get_work_bam(data)
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
        return [[data]]
    return [[data]]
Example 9
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T",
                        "VariantAnnotator",
                        "-R",
                        ref_file,
                        "-L",
                        bed_file,
                        "-I",
                        in_bam,
                        "-A",
                        "GCContent",
                        "-A",
                        "Coverage",
                        "--variant",
                        in_vcf,
                        "--out",
                        tx_out,
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}"
                    )
                    do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug("parsing coverage: %s" % sample)
        return data
Example 10
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["raw_bam"] = dd.get_work_bam(data)
    if aligner:
        assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
    else:
        logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                    "If BAM is not cleaned for peak calling, can result in downstream errors.")
    return [[data]]
Example 11
def _get_replicate_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    rep_bam = ""
    for origin in data:
        if (dd.get_batch(sample) in dd.get_batch(origin[0])
                and dd.get_phenotype(sample) in dd.get_phenotype(origin[0])
                and dd.get_work_bam(sample) != dd.get_work_bam(origin[0])
                and dd.get_phenotype(origin[0]) != "control"):
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
    sample["work_bam_rep"] = dd.get_work_bam(origin[0])
    return [sample]
Example 12
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    input_bam = ""
    for origin in data:
        if dd.get_phenotype(origin[0]) == "control":
            if input_bam != "":
                input_bam = input_bam + "," + dd.get_work_bam(origin[0])
            else:
                input_bam = dd.get_work_bam(origin[0])
    sample["work_bam_input"] = input_bam
    return [sample]
Example 13
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        bed_file, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
        return data
Example 14
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
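`_check_umi_type` is not included; it decides between duplex ("paired") and single-strand UMI grouping from the input BAM. A rough sketch with pysam, reading the standard RX tag from the first alignments (the paired/single heuristic and the "adjacency" fallback strategy are assumptions, not bcbio's exact logic):

import pysam

def check_umi_type_sketch(align_bam, n=100):
    """Guess the UMI layout from the RX tag of the first n alignments."""
    umi_tag = "RX"
    with pysam.AlignmentFile(align_bam, "rb", check_sq=False) as bam_in:
        for i, read in enumerate(bam_in):
            if i >= n:
                break
            if read.has_tag(umi_tag):
                umi = read.get_tag(umi_tag)
                # duplex UMIs are conventionally stored as "AAA-TTT"
                return ("paired" if "-" in umi else "adjacency"), umi_tag
    raise ValueError("No %s UMI tags found in %s" % (umi_tag, align_bam))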
Example 15
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in an CNVkit ready format using mosdepth.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    out_target_file = os.path.join(
        work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(
        work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file)
         or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        target_cov = coverage.run_mosdepth(
            data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(
            data, "antitarget",
            tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions,
                                              data,
                                              max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions,
                                            data,
                                            max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file,
                                          data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
    return out_target_file, out_anti_file
Example 16
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    config = data["config"]
    input_file = datadict.get_work_bam(data)
    # don't sort even by read names
    # input_file = bam.sort(input_file, config, order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup',
                              sample_name)
    output_dir = utils.safe_makedir(output_dir)

    input_file_name, input_file_extension = os.path.splitext(os.path.basename(
        input_file
    ))
    output_file = os.path.join(
        output_dir, f'{input_file_name}.deduplicated{input_file_extension}'
    )

    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]

    deduplicate_bismark = config_utils.get_program('deduplicate_bismark', config)
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'remove duplicate alignments')

    data = datadict.set_work_bam(data, output_file)
    data["deduplication_report"] = output_file.replace("deduplicated.bam", "deduplication_report.txt")
    return [[data]]
Example 17
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
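`sambamba.make_command` is a bcbio wrapper around `sambamba depth region`; judging from the other sambamba examples on this page, the command it builds is roughly the following. A hedged sketch returning an argument list suitable for subprocess (flag set inferred from Examples 18, 30 and 37):

def sambamba_depth_region_sketch(in_bam, bed_file, depth_thresholds, nthreads=1, max_cov=None):
    """Build a sambamba 'depth region' argument list like the commands used above."""
    cmd = ["sambamba", "depth", "region",
           "-t", str(nthreads),
           "-L", bed_file,
           "-F", "not unmapped"]
    for threshold in depth_thresholds:
        cmd += ["-T", str(threshold)]
    if max_cov:
        cmd += ["-C", str(max_cov)]  # cap per-base coverage, as in Example 37
    cmd.append(in_bam)
    return cmd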
Example 18
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example 19
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, " "converting %s and %s to phred+33.")
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example 20
def extract_NF_regions(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if file_exists(out_file):
        data["NF_bam"] = out_file
        return data

    with file_transaction(out_file) as tx_out_file, \
         file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (
            f"{sieve} --bam ${work_bam} --outFile {tx_unsorted_bam} --ATACshift "
            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
            f"--minMappingQuality 10 "
            f"--filterMetrics {tx_log_file} ")
        do.run(cmd, "Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        tx_out_file = bam.sort(tx_unsorted_bam)

    data["NF_bam"] = out_file
    return data
Example 21
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(
        dd.get_work_dir(data), "align", dd.get_sample_name(data),
        "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            if gatk_type == "gatk4":
                params = [
                    "-T", "ApplyBQSRSpark", "--sparkMaster",
                    "local[%s]" % dd.get_num_cores(data), "--input", in_file,
                    "--output", tx_out_file, "--bqsr_recal_file",
                    data["prep_recal"]
                ]
            else:
                params = [
                    "-T", "PrintReads", "-R",
                    dd.get_ref_file(data), "-I", in_file, "-BQSR",
                    data["prep_recal"], "-o", tx_out_file
                ]
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
    bam.index(out_file, data["config"])
    return out_file
Example 22
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)

    return count_file
Example 23
def _prep_data(data, items):
    for r in ["callable_regions", "variant_regions"]:
        data[r] = list(set(filter(lambda x: x is not None,
                                  [tz.get_in(("config", "algorithm", r), d) for d in items])))
    data["work_bams"] = [dd.get_align_bam(x) or dd.get_work_bam(x) for x in items]
    data["vrn_files"] = [x["vrn_file"] for x in items]
    return data
def priority_total_coverage(data):
    """
    calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data

    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example 25
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)

    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam):
            pass
        else:
            with file_transaction(data, parse_file) as out_tx:
                depth_thresholds = sorted(list(cutoffs | extra_cutoffs))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
Example 26
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplicon-based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
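`ref.file_contigs` and the threshold constants are bcbio internals not shown here. A small self-contained sketch of the genome-coverage fraction computed above, assuming a FASTA index is available via pysam (the threshold value is illustrative):

import pybedtools
import pysam

GENOME_COV_THRESH = 0.40  # assumed value for illustration

def genome_coverage_fraction_sketch(regions_bed, ref_fasta):
    """Fraction of the genome covered by the callable/variant regions BED."""
    callable_size = pybedtools.BedTool(regions_bed).total_coverage()
    with pysam.FastaFile(ref_fasta) as fasta:
        total_size = sum(fasta.lengths)
    return callable_size / float(total_size)

# usage sketch:
# cov_interval = "genome" if genome_coverage_fraction_sketch(vrs, ref_fasta) > GENOME_COV_THRESH else "regional"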
Example 27
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]
    def split_fn(data):
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out
    parallel = {"type": "local", "num_jobs": dd.get_num_cores(items[0]), "cores_per_job": 1}
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
Example 28
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = (
        "{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 --no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} "
    )
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file,
                                                                pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(
            data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Example 29
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    if data["work_bam_rep"] != "":
        rep_bam = data["work_bam_rep"]
    else:
        rep_bam = ""
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["rmats_fn"]]
    name = dd.get_sample_name(data)
    fastq_file = fastq.get_fastq_files(data)
    read_len = bam.fastq.estimate_read_length(fastq_file[0])
    if read_len < 50:
        read_len = 50
    elif read_len > 50 and read_len < 75:
        read_len = 75
    else:
        read_len = 100
    if len(fastq_file) > 1:
        read_pair = "paired"
    else:
        read_pair = "single"
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["rmats_fn"], name ))
    out_file = caller_fn(name, chip_bam, rep_bam, input_bam, dd.get_gtf_file(data), out_dir, read_len, read_pair, data["config"])
    data["rmats_file"] = out_file
    return [[data]]
Example 30
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example 31
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve disambiguation between two organisms"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "hisat2", "tophat", "star"], "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                                    os.pardir, os.pardir,
                                                                    "disambiguate_%s" % aligner)))
        out_dir = os.path.join(base_dir, "_".join([str(x) for x in data_a["align_split"].split("-")]))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            _run_cplusplus(work_bam_a, work_bam_b, tx_out_dir, aligner, os.path.basename(base_name), items)
    data_a["disambiguate"] = \
      {data_b["genome_build"]: bam.sort("%s.disambiguatedSpeciesB.bam" % base_name, config),
       "%s-ambiguous" % data_a["genome_build"]: bam.sort("%s.ambiguousSpeciesA.bam" % base_name, config),
       "%s-ambiguous" % data_b["genome_build"]: bam.sort("%s.ambiguousSpeciesB.bam" % base_name, config),
       "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    bam.index(dd.get_work_bam(data_a), data_a["config"])
    return [[data_a]]
Example 32
def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        exon_file, transcript_file = _stringtie_expression(bam, gtf_file, num_cores, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    return data
Example 33
def run_stringtie_expression(data):
    """
    estimate expression from Stringtie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf"))
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_stringtie_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        if "stringtie" in dd.get_transcript_assembler(data):
            assembled_gtfs = dd.get_assembled_gtf(data)
            assembled_gtfs.append(assembly)
            data = dd.set_assembled_gtf(data, assembled_gtfs)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        transcript_file = _stringtie_expression(bam, data, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_stringtie_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    if "stringtie" in dd.get_transcript_assembler(data):
        assembled_gtfs = dd.get_assembled_gtf(data)
        assembled_gtfs.append(assembly)
        data = dd.set_assembled_gtf(data, assembled_gtfs)
    return data
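`_stringtie_expression` is not shown; in essence it runs StringTie in expression-only mode against the reference GTF and returns the Ballgown tables. A hedged sketch of that invocation (output names follow StringTie's -B convention; everything else is a placeholder):

import os
import subprocess

def stringtie_expression_sketch(bam_file, gtf_file, num_cores, out_dir):
    """Illustrative expression-only StringTie run producing Ballgown .ctab tables."""
    os.makedirs(out_dir, exist_ok=True)
    out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf")
    cmd = ["stringtie", bam_file,
           "-e",              # abundance estimation only, no novel assembly
           "-B",              # write Ballgown tables alongside the -o output
           "-p", str(num_cores),
           "-G", gtf_file,
           "-o", out_gtf]
    subprocess.run(cmd, check=True)
    # with -B, StringTie writes t_data.ctab (transcripts) next to the output GTF
    return os.path.join(out_dir, "t_data.ctab")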
Example 34
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data) or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example 35
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
          and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
Example 36
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(
            bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(
            out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(
        data,
        "depth region",
        in_bam,
        cleaned_bed,
        depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
Example 37
def coverage(data):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
Example 38
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && "
                % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
                "-N {sample} -b {bamfile} {opts} {bed_file} "
                "| {strandbias}"
                "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
                "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
Example 39
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
    on the estimated insert sizes.
    Uses the current working BAM file if no BAM file is supplied.
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd +\
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +\
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
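`ATACRanges` is referenced but not defined in this excerpt. A minimal sketch of the structure the loop expects, with a label and template-length bounds per nucleosome fraction (the exact cutoffs below are illustrative, not necessarily bcbio's values):

import collections

ATACRange = collections.namedtuple("ATACRange", ["label", "min", "max"])

# hypothetical fragment-length windows for nucleosome-free / mono / di / tri fractions
ATACRanges = collections.OrderedDict([
    ("NF", ATACRange("NF", 0, 100)),
    ("MN", ATACRange("MN", 180, 247)),
    ("DN", ATACRange("DN", 315, 473)),
    ("TN", ATACRange("TN", 558, 615)),
])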
Example 40
def correct_umis(data):
    """Correct umis against the whitelist in correct_umi_file

    http://fulcrumgenomics.github.io/fgbio/tools/latest/CorrectUmis.html
    """
    input_bam = dd.get_work_bam(data)
    output_bam = os.path.join(
        utils.safe_makedir(
            os.path.join(os.getcwd(), "align",
                         dd.get_sample_name(data))), "%s-umis_corrected%s" %
        utils.splitext_plus(os.path.basename(input_bam)))
    jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(output_bam), 2)
    # Improve speeds by avoiding compression read/write bottlenecks
    io_opts = "--async-io=true --compression=0"
    umis_whitelist = tz.get_in(["config", "algorithm", "correct_umis"], data)

    fgbio = config_utils.get_program("fgbio", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])

    if not utils.file_exists(output_bam):
        umi_method, umi_tag = _check_umi_type(input_bam)
        cmd = (
            "unset JAVA_HOME && "
            "{fgbio} {jvm_opts} {io_opts} CorrectUmis "
            "-t {umi_tag} -m 3 -d 1 -x "
            "-U {umis_whitelist} "
            "-i {input_bam} -o /dev/stdout | {samtools} view -bh > {output_bam}"
        )
        do.run(cmd.format(**locals()), "Correcting UMIs")
        bam.index(output_bam, data["config"])
    return output_bam
Example 41
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller,
                                    chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out

    parallel = {
        "type": "local",
        "num_jobs": dd.get_num_cores(items[0]),
        "cores_per_job": 1
    }
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
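The _region_to_coords helper called in split_fn is not shown in this excerpt; a minimal sketch of a plausible implementation, assuming regions arrive either as (chrom, start, end) sequences or as "chrom:start-end" strings:

def _region_to_coords(region):
    """Normalize a region to a (chrom, start, end) tuple.

    Sketch only; may not cover every region form bcbio passes through here.
    """
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return chrom, int(start), int(end)
    chrom, coords = region.split(":")
    start, end = coords.split("-")
    return chrom, int(start), int(end)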
Example no. 42
0
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = "{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}"

    message = "Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts"
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
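The _paired_flag and _strand_flag helpers above are not shown; a minimal sketch of plausible implementations, assuming pysam for pair detection and bcbio-style strandedness labels, with _strand_flag simplified here to take the strandedness string rather than the data dict:

# Sketch only: bcbio's real helpers may pass extra featureCounts flags.
import pysam

def _paired_flag(bam_file):
    """Return the featureCounts paired-end flag if the BAM contains paired reads."""
    with pysam.AlignmentFile(bam_file, "rb") as handle:
        for read in handle:
            return "-p" if read.is_paired else ""
    return ""

def _strand_flag(strandedness):
    """Map library strandedness to the featureCounts -s argument."""
    return {"unstranded": "0", "firststrand": "2", "secondstrand": "1"}.get(
        strandedness, "0")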
Example no. 43
0
def run_stringtie_expression(data):
    """
    estimate expression with StringTie, using the bcbio datadict;
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf"))
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        if "stringtie" in dd.get_transcript_assembler(data):
            dd.get_assembled_gtf(data).append(assembly)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        transcript_file = _stringtie_expression(bam, gtf_file, num_cores,
                                                tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    if "stringtie" in dd.get_transcript_assembler(data):
        dd.get_assembled_gtf(data).append(assembly)
    return data
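The _stringtie_expression helper above is not shown; a hedged sketch of the StringTie call it plausibly wraps, running in expression-only mode with Ballgown output (the output naming and return value are assumptions):

import os
import subprocess

def _stringtie_expression(bam, gtf_file, num_cores, out_dir):
    """Quantify expression with StringTie against a reference annotation."""
    out_gtf = os.path.join(out_dir, "stringtie-assembly.gtf")
    cmd = (f"stringtie -e -B "      # expression-only mode with Ballgown tables
           f"-G {gtf_file} "        # reference annotation to quantify against
           f"-p {num_cores} "
           f"-o {out_gtf} {bam}")
    subprocess.check_call(cmd, shell=True)
    # Ballgown writes transcript-level estimates next to the output GTF
    return os.path.join(out_dir, "t_data.ctab")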
Example no. 44
0
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    for origin in data:
        if  dd.get_batch(sample) in dd.get_batch(origin[0]) and dd.get_phenotype(origin[0]) == "input":
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
            return [sample]
Example no. 45
0
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
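The _get_region_size helper used above is not shown; a minimal sketch, assuming only whole-contig regions (plain chromosome names) have a size that can be read from the FASTA index:

import pysam

def _get_region_size(ref_file, data, region):
    """Return the length of a whole-chromosome region, or None otherwise.

    Sketch only; data carries the bcbio sample dict and is unused here.
    """
    if isinstance(region, str) and ":" not in region:
        with pysam.FastaFile(ref_file) as fasta:
            if region in fasta.references:
                return fasta.get_reference_length(region)
    return None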
Example no. 46
0
def parallel_calling(data, run_parallel):
    """This is needed only if running methylated veruss hidroxy-methulated"""
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            chroms = pysam_work_bam.references
            for chrom in chroms:
                new_sample = copy.deepcopy(sample)
                if chrom.find("_") > -1:
                    continue
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)
    for sample in out:
        phenotype = dd.get_phenotype(sample[0])
        batch = dd.get_batch(sample[0])
        if phenotype == "mC":
            for sample2 in out:
                if batch in dd.get_batch(sample2[0]) and dd.get_phenotype(
                        sample2[0]) == "hmC":
                    if sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]:
                        sample[0]["control"] = sample2[0]["cpg_file"]
                        break
    out = run_parallel("cpg_processing", out)
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunck in out:
            if name == dd.get_sample_name(chunck[0]):
                sample[0]["cpg_split"].append(chunck[0]["cpg_file"])
                if "hmc_file" in chunck[0]:
                    sample[0]["hmc_split"].append(chunck[0]["hmc_file"])
Example no. 47
0
def _get_original_coverage(data, itype="target"):
    """Back compatible: get existing coverage files if they exist
    """
    work_dir = os.path.join(_sv_workdir(data), "raw")
    work_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out = []
    base, _ = _bam_to_outbase(work_bam, work_dir, data)
    target_cnn = "%s.targetcoverage.cnn" % base
    anti_cnn = "%s.antitargetcoverage.cnn" % base
    if os.path.exists(target_cnn) and os.path.exists(anti_cnn):
        out.append({
            "bam": work_bam,
            "file": target_cnn,
            "cnntype": "target",
            "itype": itype,
            "sample": dd.get_sample_name(data)
        })
        out.append({
            "bam": work_bam,
            "file": anti_cnn,
            "cnntype": "antitarget",
            "itype": itype,
            "sample": dd.get_sample_name(data)
        })
    return out
Example no. 48
0
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"],
                                  data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(
                dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data),
                                                 data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1
                    and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(
                    dd.get_algorithm_qc(data)[0])
    return [[data]]
Example no. 49
0
def _get_files(data):
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "align", dd.get_sample_name(data)))
    out_file = "%s-highdepth.bed" % os.path.join(out_dir, utils.splitext_plus(os.path.basename(work_bam))[0])
    stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0]
    return work_bam, out_file, stats_file
Example no. 50
0
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts, filter_opts = _get_fgbio_options(
                data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = (
                "unset JAVA_HOME && "
                "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                "-i {align_bam} | "
                "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                "-i /dev/stdin -o /dev/stdout | "
                "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                "-i /dev/stdin -o /dev/stdout | "
                "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1"
            )
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
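The _get_fgbio_options helper above is not shown; a hedged sketch of the shape of its return value, using illustrative fgbio option values rather than bcbio's configurable defaults:

def _get_fgbio_options(data, umi_method):
    """Return (grouping, consensus, filtering) option strings for fgbio.

    Sketch only; the values below are illustrative and data (the bcbio
    sample dict, which normally supplies configured thresholds) is unused.
    """
    group_opts = "--edits=1"            # UMI mismatches tolerated when grouping
    cons_opts = "--min-reads=1"         # minimum reads required per consensus
    if umi_method == "paired":
        cons_opts += " --min-input-base-quality=2"
    filter_opts = "--min-reads=1 --min-base-quality=13"
    return group_opts, cons_opts, filter_opts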
Example no. 51
0
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data)
                or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
Example no. 52
0
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts,
                                               out_file,
                                               ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
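count.combine_count_files is not shown in this excerpt; a plausible sketch using pandas, assuming each input is a tab-separated (feature, count) file whose name encodes the sample:

import os
import pandas as pd

def combine_count_files(count_files, out_file, ext=".counts"):
    """Join per-sample count files into a single feature-by-sample table.

    Sketch only; bcbio's real implementation may handle naming differently.
    """
    merged = None
    for count_file in count_files:
        sample = os.path.basename(count_file).replace(ext, "")
        df = pd.read_csv(count_file, sep="\t", header=None,
                         names=["feature", sample], index_col="feature")
        merged = df if merged is None else merged.join(df, how="outer")
    merged.to_csv(out_file, sep="\t")
    return out_file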
Example no. 53
0
def priority_total_coverage(data):
    """
    calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data

    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example no. 54
0
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                "-F \"not unmapped\" "
                "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
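Both priority_total_coverage variants wrap the sambamba depth region subcommand; a minimal standalone sketch of that call, with illustrative thresholds and placeholder paths:

import subprocess

def depth_at_thresholds(bam_file, bed_file, out_file, nthreads=1,
                        thresholds=(10, 20, 30)):
    """Report, per BED region, the fraction of bases covered at each depth."""
    t_args = " ".join(f"-T {t}" for t in thresholds)
    cmd = (f'sambamba depth region -t {nthreads} -L {bed_file} '
           f'-F "not unmapped" {t_args} {bam_file} -o {out_file}')
    subprocess.check_call(cmd, shell=True)
    return out_file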
Example no. 55
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33.")
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")

    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file,
                                                                pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
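The index_transcriptome helper above is not shown; a hedged sketch that extracts transcript sequences from the GTF and builds a bwa index over them, where using gffread for the extraction is an assumption about the implementation:

import os
import subprocess

def index_transcriptome(gtf_file, ref_file, data):
    """Build a bwa index over transcript sequences extracted from the GTF.

    Sketch only; data (the bcbio sample dict) is unused here.
    """
    out_fasta = os.path.splitext(gtf_file)[0] + ".fa"
    if not os.path.exists(out_fasta):
        # pull per-transcript sequences from the genome FASTA
        subprocess.check_call(
            f"gffread -w {out_fasta} -g {ref_file} {gtf_file}", shell=True)
    if not os.path.exists(out_fasta + ".bwt"):
        # bwa writes its index files alongside the FASTA (.bwt, .pac, .sa, ...)
        subprocess.check_call(f"bwa index {out_fasta}", shell=True)
    return out_fasta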
Example no. 56
0
def cufflinks_assemble(data):
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    assembled_gtf = cufflinks.assemble(bam_file, ref_file, num_cores, out_dir, data)
    data = dd.set_assembled_gtf(data, assembled_gtf)
    return [[data]]