def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align",
                           dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(
            out_dir,
            "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam,
                          dd.get_config(data),
                          order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir,
                                dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts",
                                             dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file,
                                             dd.get_sample_name(data),
                                             data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
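
For orientation, a minimal sketch of the two flag helpers this function leans on; the real _paired_flag and _strand_flag live elsewhere in bcbio's featureCounts module, so the names and the exact strandedness mapping below are assumptions:

def _paired_flag_sketch(is_paired):
    # featureCounts needs -p to count read pairs (fragments) rather than reads
    return "-p" if is_paired else ""

def _strand_flag_sketch(strandedness):
    # featureCounts -s: 0 = unstranded, 1 = stranded, 2 = reverse-stranded
    return {"unstranded": 0, "secondstrand": 1, "firststrand": 2}.get(strandedness, 0)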
Example #2
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d))) and dd.get_batch(d)
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    return checkpoints
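
A hedged usage sketch: downstream code can branch on these booleans to decide which pipeline stages to schedule. The step names below are illustrative, not bcbio's actual parallel task names:

checkpoints = _variant_checkpoints(samples)
if checkpoints["align"]:
    samples = run_parallel("process_alignment", samples)   # hypothetical step name
if checkpoints["vc"]:
    samples = run_parallel("variantcall_sample", samples)  # hypothetical step name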
Example #3
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                        "If BAM is not cleaned for peak calling, can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]
Example #4
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info(
                "Warning: when a BAM file is given as input, bcbio skips multimapper removal. "
                "If the BAM is not cleaned before peak calling, this can result in downstream errors."
            )
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data),
                                             dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed,
                                        data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                   dd.get_work_bam(data), data)
    return [[data]]
Example #5
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #6
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
    if aligner == "bowtie2":
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
        return [[data]]
    return [[data]]
Example #7
def _nosort_tobam_cmd(data):
    """Handle converting to BAM for queryname sorted inputs, correcting HD headers.
    """
    if dd.get_aligner(data).startswith("bwa"):
        fix_hd = "(echo '@HD	VN:1.3	SO:queryname' && cat) | "
    else:
        fix_hd = "sed 's/SO:unsorted/SO:queryname/g' | "
    return fix_hd + "{samtools} view -b - -o {out_file}"
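
The return value is a shell-pipeline fragment with {samtools} and {out_file} placeholders left for the caller to fill; a sketch of how it might be completed, assuming the aligner's SAM stream is piped in from upstream:

# Hypothetical completion of the template returned above.
tobam = _nosort_tobam_cmd(data).format(samtools="samtools", out_file="out.bam")
# bwa inputs get a corrected @HD line prepended; other aligners have
# SO:unsorted rewritten to SO:queryname via sed.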
Example #9
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
Example #10
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["raw_bam"] = dd.get_work_bam(data)
    if aligner:
        assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
    else:
        logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                    "If BAM is not cleaned for peak calling, can result in downstream errors.")
    return [[data]]
Example #11
def _setup_reference_files(data, tx_out_dir):
    """Create a reference directory with fasta and bwa indices.

    GRIDSS requires all files in a single directory, so setup with symlinks.
    """
    aligner = dd.get_aligner(data)
    out_dir = utils.safe_makedir(os.path.join(tx_out_dir, aligner))
    ref_fasta = dd.get_ref_file(data)
    ref_files = ["%s%s" % (utils.splitext_plus(ref_fasta)[0], ext) for ext in [".fa", ".fa.fai", ".dict"]]
    for orig_file in ref_files + tz.get_in(("reference", aligner, "indexes"), data):
        utils.symlink_plus(orig_file, os.path.join(out_dir, os.path.basename(orig_file)))
    return os.path.join(out_dir, os.path.basename(ref_fasta))
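
A hedged usage sketch; the paths and reference name are placeholders, and the exact index files symlinked depend on which aligner's indexes are registered in data:

# Hypothetical call during GRIDSS setup.
ref_for_gridss = _setup_reference_files(data, "/work/gridss/txtmp")
# -> "/work/gridss/txtmp/bwa/GRCh38.fa", with the .fa.fai, .dict and
#    aligner index files symlinked into the same directory.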
Example #12
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname", out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file

    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)

    filtered_bam = bam.filter_primary(sorted_bam, data)

    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")

    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(
        summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)

    return count_file
Example #13
def _get_bwa_mem_cmd(data,
                     out_file,
                     ref_file,
                     fastq1,
                     fastq2="",
                     with_hla=False):
    """Perform piped bwa mem mapping potentially with alternative alleles in GRCh38 + HLA typing.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if with_hla:
        bwakit_dir = os.path.dirname(
            os.path.realpath(utils.which("run-bwamem")))
        hla_base = os.path.join(
            utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla")),
            os.path.basename(out_file) + ".hla")
        alt_cmd = (
            " | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}"
        )
    else:
        alt_cmd = ""
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe = "sentieon-bwa"
        exports = sentieon.license_export(data)
    else:
        bwa_exe = "bwa"
        exports = ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = (" ".join([str(x) for x in bwa_resources.get("options", [])])
                  if "options" in bwa_resources else "")
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # For UMI runs, pass along consensus tags
    c_tags = "-C" if "umi_bam" in data else ""
    pairing = "-p" if not fastq2 else ""
    # Restrict seed occurrences to 1/2 of the default; manages memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = (
        "{exports}{bwa} mem {pairing} {c_tags} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' "
        "-v 1 {ref_file} {fastq1} {fastq2} ")
    return (bwa_cmd + alt_cmd).format(**locals())
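
For reference, a sketch of what the rendered command can look like for a paired-end run without HLA typing; every path and the read group below are placeholders:

# Hypothetical rendering with with_hla=False, two fastqs (so pairing == ""),
# no UMI BAM (so c_tags == "") and 8 cores:
#
#   bwa mem   -c 250 -M -t 8  -R '@RG\tID:lane1\tSM:sample1' \
#       -v 1 GRCh38.fa r1.fq.gz r2.fq.gz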
Example #14
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data["oncofuse_file"] = oncofuse.run(data)
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
Example #16
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]),
                                             "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"],
                                            "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"],
                                             "abundance.h5")
    if (os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt"))):
        data["quant"]["fusion"] = os.path.join(data["kallisto_quant"],
                                               "fusion.txt")
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Since this is hg38 we will fall "
                        "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Falling back to the "
                        "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]),
                                             "abundance.h5")
    return [[data]]
Example #17
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info(
            "No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None

    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None

    if dd.get_aligner(data) == "bwa":
        logger.info(
            "Can't use DEXSeq with bwa alignments, skipping exon-level counting."
        )
        return None

    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"

    anaconda = os.path.dirname(os.path.realpath(sys.executable))
    r36_python = os.path.join(anaconda, "..", "envs", "r36", "bin", "python")

    if file_exists(out_file):
        return out_file
    cmd = (
        "{r36_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
        "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file,
                                                              dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Example #18
def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
        elif aligner == "bwa":
            filterer = bwa.filter_multimappers
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn(
            "When a BAM file is given as input, bcbio skips removal of "
            "multimappers.")
    return unique_bam
Example #19
def _setup_reference_files(data, tx_out_dir):
    """Create a reference directory with fasta and bwa indices.

    GRIDSS requires all files in a single directory, so setup with symlinks.
    """
    aligner = dd.get_aligner(data)
    out_dir = utils.safe_makedir(os.path.join(tx_out_dir, aligner))
    ref_fasta = dd.get_ref_file(data)
    ref_files = [
        "%s%s" % (utils.splitext_plus(ref_fasta)[0], ext)
        for ext in [".fa", ".fa.fai", ".dict"]
    ]
    for orig_file in ref_files + tz.get_in(
        ("reference", aligner, "indexes"), data):
        utils.symlink_plus(orig_file,
                           os.path.join(out_dir, os.path.basename(orig_file)))
    return os.path.join(out_dir, os.path.basename(ref_fasta))
Example #20
def _check_dedup(data):
    """Check configuration for de-duplication.

    Defaults to no de-duplication for RNA-seq and small RNA, the
    back compatible default. Allow overwriting with explicit
    `mark_duplicates: true` setting.
    Also defaults to false for no alignment inputs.
    """
    if dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"] or not dd.get_aligner(data):
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), False)
    else:
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if dup_param and isinstance(dup_param, six.string_types):
        logger.info("Warning: bcbio no longer support explicit setting of mark_duplicate algorithm. "
                    "Using best-practice choice based on input data.")
        dup_param = True
    return dup_param
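
A short summary of the defaults this encodes, as a hedged usage sketch:

# RNA-seq / smallRNA-seq, or alignment-free inputs -> defaults to no duplicate
# marking; everything else -> marks duplicates unless mark_duplicates: false
# is set explicitly. Old-style string values are coerced to True.
dup_param = _check_dedup(data)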
Example #21
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data)
            or dd.get_fusion_mode(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Since this is hg38 we will fall "
                    "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Falling back to the "
                    "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
Example #22
def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, "
                       "skipping. " % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    # handle cases where the fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except Exception:
                do.run(
                    "touch %s && echo '# failed' >> %s" %
                    (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example #23
def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, "
                       "skipping. " % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    # handle cases where the fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except Exception:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example #24
def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Perform piped bwa mem mapping potentially with alternative alleles in GRCh38 + HLA typing.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if utils.file_exists(alt_file) and dd.get_hlacaller(data):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_base = os.path.join(utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla")),
                                os.path.basename(out_file) + ".hla")
        alt_cmd = (" | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}")
    else:
        alt_cmd = ""
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe = "sentieon-bwa"
        exports = sentieon.license_export(data)
    else:
        bwa_exe = "bwa"
        exports = ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = (" ".join([str(x) for x in bwa_resources.get("options", [])])
                  if "options" in bwa_resources else "")
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # For UMI runs, pass along consensus tags
    c_tags = "-C" if "umi_bam" in data else ""
    pairing = "-p" if not fastq2 else ""
    # Restrict seed occurrences to 1/2 of the default; manages memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = ("{exports}{bwa} mem {pairing} {c_tags} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' "
               "-v 1 {ref_file} {fastq1} {fastq2} ")
    return (bwa_cmd + alt_cmd).format(**locals())
Example #25
def get_maxcov_downsample_cl(data, in_pipe=None):
    """Retrieve command line for max coverage downsampling, fitting into bamsormadup output.
    """
    max_cov = _get_maxcov_downsample(data) if dd.get_aligner(data) not in ["snap"] else None
    if max_cov:
        if in_pipe == "bamsormadup":
            prefix = "level=0"
        elif in_pipe == "samtools":
            prefix = "-l 0"
        else:
            prefix = ""
        # Hold off on swapping over to multiple cores until after testing
        #core_arg = "-t %s" % dd.get_num_cores(data)
        core_arg = ""
        return ("%s | variant - -b %s --mark-as-qc-fail --max-coverage %s"
                % (prefix, core_arg, max_cov))
    else:
        if in_pipe == "bamsormadup":
            prefix = "indexfilename={tx_out_file}.bai"
        else:
            prefix = ""
        return prefix
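
Worked outputs, assuming a hypothetical _get_maxcov_downsample(data) value of 8000 (core_arg is empty until multicore is enabled, hence the double space after -b):

# get_maxcov_downsample_cl(data, in_pipe="bamsormadup")
#   -> "level=0 | variant - -b  --mark-as-qc-fail --max-coverage 8000"
# get_maxcov_downsample_cl(data, in_pipe="samtools")
#   -> "-l 0 | variant - -b  --mark-as-qc-fail --max-coverage 8000"
# With no downsampling configured:
#   get_maxcov_downsample_cl(data, in_pipe="bamsormadup")
#   -> "indexfilename={tx_out_file}.bai"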
Example #26
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None

    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None

    if dd.get_aligner(data) == "bwa":
        logger.info("Can't use DEXSeq with bwa alignments, skipping exon-level counting.")
        return None

    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable

    if file_exists(out_file):
        return out_file
    cmd = ("{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
Example #28
def aligner_supports_fusion(data):
    SUPPORTED_ALIGNERS = ["tophat2", "tophat", "star"]
    aligner = dd.get_aligner(data)
    return aligner and aligner.lower() in SUPPORTED_ALIGNERS
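
An illustrative truth table for this predicate (aligner values are examples):

# dd.get_aligner(data) == "star"     -> True
# dd.get_aligner(data) == "tophat2"  -> True
# dd.get_aligner(data) == "bwa"      -> False
# dd.get_aligner(data) is None       -> None (falsy; the `and` short-circuits)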
Example #29
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(
                items[0]),
                                                   region,
                                                   out_file,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(
                    _vardict_options_from_config(items, config, out_file,
                                                 target))
                coverage_interval = utils.get_in(
                    config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads' worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(
                    items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                        os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(
                    utils.Rscript_cmd())
                cmd = (
                    "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| {strandbias} "
                    "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "{freq_filter} "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
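
One worked detail from the command construction above: min_allele_fraction is configured in percent, so the default of 10 becomes the VarDict -f value 0.1:

min_allele_fraction = 10                   # configured as a percentage
freq = float(min_allele_fraction) / 100.0
assert freq == 0.1                         # passed to vardict/var2vcf as -f 0.1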
Example #30
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s" %
                                 dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
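
A minimal sketch of the sample dictionary this expects for the fastq-plus-aligner branch; the keys match what the function reads above, the values are placeholders:

# Hypothetical minimal `data` for aligning a paired-end fastq sample.
data = {
    "config": {"algorithm": {"aligner": "bwa"}},
    "rgnames": {"lane": "lane1", "sample": "sample1"},
    "dirs": {"work": "/path/to/work"},
    "files": ["r1.fq.gz", "r2.fq.gz"],  # resolved by dd.get_input_sequence_files
}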
Example #31
def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))
Example #32
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct, or is the file empty?\n" +
            "If it is a fastq file (not a pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example #33
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]), region,
                                                   out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vcffilter = config_utils.get_program("vcffilter", config)
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 reads' worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                      os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" %
                                   (os.path.join(os.path.dirname(sys.executable), "py"),
                                     0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "{freq_filter} "
                       "{somatic_filter} | {fix_ambig} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file
Example #34
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs,
                                                   region,
                                                   out_file,
                                                   items=items,
                                                   do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region,
                                                   out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program(
                    "vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(
                    utils.get_in(config, ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(
                    items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in((
                        "config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = (
                        "| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                        "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                        """| %s -c 'from bcbio.variation import freebayes; """
                        """freebayes.call_somatic("%s", "%s")' """ %
                        (sys.executable, paired.tumor_name,
                         paired.normal_name))
                    freq_filter = (
                        "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                        "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                        "| %s "
                        "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                        % (os.path.join(os.path.dirname(sys.executable),
                                        "py"), _lowfreq_linear_filter(0, True),
                           os.path.join(os.path.dirname(sys.executable), "py"),
                           0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(
                    ref_file, tx_out_file)
                cmd = (
                    "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                    "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                    "| awk 'NF>=48' | testsomatic.R "
                    "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                    "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                    "| {contig_cl} {freq_filter} "
                    "| bcftools filter -i 'QUAL >= 0' "
                    "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Genotyping with VarDict: Inference", {})
    return out_file
Example #35
def aligner_supports_fusion(data):
    SUPPORTED_ALIGNERS = ["tophat2", "tophat", "star"]
    aligner = dd.get_aligner(data).lower()
    return aligner in SUPPORTED_ALIGNERS
Example #36
def calling(data):
    if dd.get_aligner(data) == "bismark":
        data = _bismark_calling(data)
    if dd.get_aligner(data) == "bsmap":
        data = _bsmap_calling(data)
    return [[data]]
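
Both branches follow bcbio's convention of returning a nested sample list; a hedged usage sketch:

# Hypothetical dispatch for a methylation run; dd.get_aligner(data) is
# "bismark" or "bsmap", so exactly one calling backend fires.
[[data]] = calling(data)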