예제 #1
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #2
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
예제 #3
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1 --experimental-gls"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
예제 #4
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # caling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #5
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - {compress_cmd} >  {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
예제 #6
0
파일: freebayes.py 프로젝트: zeneofa/bcbio
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError(
                    "Require both tumor and normal BAM files for FreeBayes cancer calling"
                )

            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config, out_file,
                                               region))
            opts += " -f {}".format(ref_file)
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(
                    utils.get_in(paired.tumor_config,
                                 ("algorithm", "min_allele_fraction"),
                                 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = (
                "{freebayes} --pooled-discrete --genotype-qualities "
                "{opts} {paired.tumor_bam} {paired.normal_bam} "
                "| {vcffilter} -f 'QUAL > 1' -s "
                "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                "{compress_cmd} >  {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()),
                   "Genotyping paired variants with FreeBayes", {})
    fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
예제 #7
0
def _run_qsnp_paired(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Detect somatic mutations with qSNP.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        out_file = out_file.replace(".gz", "")
        with file_transaction(config, out_file) as tx_out_file:
            with tx_tmpdir() as tmpdir:
                with utils.chdir(tmpdir):
                    paired = get_paired_bams(align_bams, items)
                    qsnp = config_utils.get_program("qsnp", config)
                    resources = config_utils.get_resources("qsnp", config)
                    mem = " ".join(resources.get("jvm_opts", ["-Xms750m -Xmx4g"]))
                    qsnp_log = os.path.join(tmpdir, "qsnp.log")
                    qsnp_init = os.path.join(tmpdir, "qsnp.ini")
                    if region:
                        paired = _create_bam_region(paired, region, tmpdir)
                    _create_input(paired, tx_out_file, ref_file, assoc_files['dbsnp'], qsnp_init)
                    cl = ("{qsnp} {mem} -i {qsnp_init} -log {qsnp_log}")
                    do.run(cl.format(**locals()), "Genotyping paired variants with Qsnp", {})
        out_file = _filter_vcf(out_file)
        out_file = bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #8
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config["algorithm"],
                                                           out_file, region))
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {vcfallelicprimitives} | {vcfstreamsort} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
예제 #9
0
def shared_variantcall(call_fn, name, align_bams, ref_file, config,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    broad_runner = broad.runner_from_config(config)
    for x in align_bams:
        broad_runner.run_fn("picard_index", x)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(name=name,
            region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if ((variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions))
              or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, config, target_regions,
                        tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp,
                                               ref_file, config)
    return ann_file
예제 #10
0
def run_freebayes(align_bams,
                  items,
                  ref_file,
                  assoc_files,
                  region=None,
                  out_file=None):
    """Detect SNPs and indels with FreeBayes.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [
                config_utils.get_program("freebayes",
                                         config), "-v", tx_out_file, "-f",
                ref_file, "--use-mapping-quality", "--pvar", "0.7"
            ]
            for align_bam in align_bams:
                broad_runner.run_fn("picard_index", align_bam)
                cl += ["-b", align_bam]
            cl += _freebayes_options_from_config(config["algorithm"], out_file,
                                                 region)
            do.run(cl, "Genotyping with FreeBayes", {})
        _clean_freebayes_output(out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
예제 #11
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.info("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        for x in align_bams:
            bam.index(x, config)
        variant_regions = config["algorithm"].get("variant_regions", None)
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if ((variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions))
              or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
        else:
            with file_transaction(out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"],
                                               ref_file, config)
    return ann_file
예제 #12
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1 --experimental-gls"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{freebayes} -f {ref_file} {input_bams} {opts} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #13
0
def postcall_annotate(in_file, bam_file, ref_file, vrn_files, config):
    """Perform post-call annotation of FreeBayes calls in preparation for filtering.
    """
    #out_file = _check_file_gatk_merge(in_file)
    out_file = annotation.annotate_nongatk_vcf(in_file, bam_file, vrn_files.dbsnp,
                                               ref_file, config)
    return out_file
예제 #14
0
def shared_variantcall(call_fn, name, align_bams, ref_file, items,
                       assoc_files, region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metdata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
              name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file, items=items)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions,
                        tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #15
0
def _run_freebayes_paired(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[
                        x for x in [paired.tumor_name, paired.normal_name] if x
                    ])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = (
                    "{freebayes} -f {ref_file} {opts} "
                    "{paired.tumor_bam} {paired.normal_bam} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                    "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()),
                       "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #16
0
파일: freebayes.py 프로젝트: jme9/wabio
def postcall_annotate(in_file, bam_file, ref_file, vrn_files, config):
    """Perform post-call annotation of FreeBayes calls in preparation for filtering.
    """
    #out_file = _check_file_gatk_merge(in_file)
    out_file = annotation.annotate_nongatk_vcf(in_file, bam_file,
                                               vrn_files.dbsnp, ref_file,
                                               config)
    return out_file
예제 #17
0
def _run_freebayes_caller(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None,
                          somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(
                items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(
                    tx_out_file,
                    config,
                    samples=[dd.get_sample_name(d) for d in items])
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                opts += " --no-partial-observations"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                # For multi-sample outputs, ensure consistent order
                samples = (
                    "-s" +
                    ",".join([dd.get_sample_name(d)
                              for d in items])) if len(items) > 1 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cmd = (
                    "{freebayes} -f {ref_file} {opts} {input_bams} "
                    """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                    "| {fix_ambig} | bcftools view {samples} -a - | "
                    "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                    "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                    "vt normalize -n -r {ref_file} -q - | vcfuniqalleles | vt uniq - 2> /dev/null "
                    "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #18
0
파일: freebayes.py 프로젝트: rwness/bcbb
def postcall_annotate(in_file, ref_file, vrn_files, config):
    """Perform post-call annotation of FreeBayes calls in preparation for filtering.
    """
    #out_file = _check_file_gatk_merge(in_file)
    out_file = annotation.annotate_nongatk_vcf(in_file, vrn_files.dbsnp,
                                               ref_file, config)
    #filters = ["QUAL < 20.0", "DP < 5"]
    #out_file = genotype.variant_filtration_with_exp(broad.runner_from_config(config),
    #                                                out_file, ref_file, "", filters)
    return out_file
예제 #19
0
파일: freebayes.py 프로젝트: gturco/bcbb
def postcall_annotate(in_file, ref_file, vrn_files, config):
    """Perform post-call annotation of FreeBayes calls in preparation for filtering.
    """
    #out_file = _check_file_gatk_merge(in_file)
    out_file = annotation.annotate_nongatk_vcf(in_file, vrn_files.dbsnp,
                                               ref_file, config)
    #filters = ["QUAL < 20.0", "DP < 5"]
    #out_file = genotype.variant_filtration_with_exp(broad.runner_from_config(config),
    #                                                out_file, ref_file, "", filters)
    return out_file
예제 #20
0
def finalize_genotyper(call_file, bam_file, ref_file, config):
    """Perform SNP genotyping and analysis.
    """
    vrn_files = configured_vrn_files(config, ref_file)
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if variantcaller in ["freebayes", "cortex", "samtools", "gatk-haplotype", "varscan"]:
        call_file = annotation.annotate_nongatk_vcf(call_file, bam_file, vrn_files.dbsnp,
                                                    ref_file, config)
    filter_snp = variant_filtration(call_file, ref_file, vrn_files, config)
    return filter_snp
예제 #21
0
def _run_scalpel_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(
                tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region,
                                             tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = (
                "{perl_exports} && "
                "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #22
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #23
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
                #raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                  "min_allele_fraction"), 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            # https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
            opts += " --pooled-discrete --genotype-qualities --report-genotype-likelihood-max"
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 5' -s "
                  "| {vcfallelicprimitives} | {vcfstreamsort} "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    fix_somatic_calls(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #24
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        raw_file = os.path.join(work_dir, "results", "variants",
                                "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        out_file = annotation.annotate_nongatk_vcf(raw_file, align_bams, assoc_files.get("dbsnp"),
                                                   ref_file, items[0], out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
예제 #25
0
def _run_scalpel_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            scalpel = config_utils.get_program("scalpel", config)
            vcfallelicprimitives = config_utils.get_program(
                "vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region,
                                             tmp_path))
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            cmd = (
                "{scalpel} --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            # first run into temp folder
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "variants." + min_cov + "x.indel.vcf"),
                config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            sample_name_str = items[0]["name"][1]
            cl2 = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' | "
                "{vcfallelicprimitives} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #26
0
def _run_vardict_paired(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(
            align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program(
                "vcfallelicprimitives", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_somatic.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            freq = float(
                utils.get_in(config,
                             ("algorithm", "min_allele_fraction"), 10)) / 100.0
            opts = " ".join(
                _vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(
                config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = (
                "{vardict} -G {ref_file} -f {freq} "
                "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                "| {strandbias} "
                "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()),
                   "Genotyping with VarDict: Inference", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #27
0
def _run_vardict_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with VarDict.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            num_bams = len(align_bams)
            sample_vcf_names = []  # for individual sample names, given batch calling may be required
            for bamfile, item in itertools.izip(align_bams, items):
                # prepare commands
                vardict = config_utils.get_program("vardict", config)
                strandbias = "teststrandbias.R"
                var2vcf = "var2vcf_valid.pl"
                opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                sample = item["name"][1]
                cmd = ("{vardict} -G {ref_file} -f {freq} "
                       "-N {sample} -b {bamfile} {opts} "
                       "| {strandbias}"
                       "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
                       "| {fix_ambig} | {vcfstreamsort} {compress_cmd}")
                if num_bams > 1:
                    temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1]
                    tmp_out = temp_file_prefix + ".temp.vcf"
                    tmp_out += ".gz" if out_file.endswith("gz") else ""
                    sample_vcf_names.append(tmp_out)
                    with file_transaction(item, tmp_out) as tx_tmp_file:
                        cmd += " > {tx_tmp_file}"
                        do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
                else:
                    cmd += " > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
            if num_bams > 1:
                # N.B. merge_variant_files wants region in 1-based end-inclusive
                # coordinates. Thus use bamprep.region_to_gatk
                vcfutils.merge_variant_files(orig_files=sample_vcf_names,
                                             out_file=tx_out_file, ref_file=ref_file,
                                             config=config, region=bamprep.region_to_gatk(region))
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #28
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples. Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                return _run_freebayes_caller(align_bams, items, ref_file,
                                             assoc_files, region, out_file)
                #raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency
                # FreeBayes defaults to 20%, but use 10% by default for the
                # tumor case
                min_af = float(utils.get_in(paired.tumor_config, ("algorithm",
                                                                  "min_allele_fraction"), 10)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            opts += " --min-repeat-entropy 1 --experimental-gls"
            # Recommended settings for cancer calling
            opts += (" --pooled-discrete --pooled-continuous --genotype-qualities "
                     "--report-genotype-likelihood-max --allele-balance-priors-off")
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cl = ("{freebayes} -f {ref_file} {opts} "
                  "{paired.tumor_bam} {paired.normal_bam} "
                  "| vcffilter -f 'QUAL > 5' -s "
                  "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                  "| {fix_ambig} | vcfallelicprimitives --keep-info --keep-geno "
                  "| vt normalize -q -r {ref_file} - "
                  "{compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #29
0
def finalize_genotyper(call_file, bam_file, ref_file, config):
    """Perform SNP genotyping and analysis.
    """
    vrn_files = configured_vrn_files(config, ref_file)
    variantcaller = config["algorithm"].get("variantcaller", "gatk")
    if variantcaller in [
            "freebayes", "cortex", "samtools", "gatk-haplotype", "varscan"
    ]:
        call_file = annotation.annotate_nongatk_vcf(call_file, bam_file,
                                                    vrn_files.dbsnp, ref_file,
                                                    config)
    filter_snp = variant_filtration(call_file, ref_file, vrn_files, config)
    return filter_snp
예제 #30
0
def call_variations(data, args):
    """
    Run BisSNP tool
    """
    safe_makedir("bissnp")
    sample = data['name']
    workdir = op.abspath(safe_makedir(op.join("bissnp", sample)))
    counts_file = _count_covars(data['final_bam'], sample, workdir, args.snp, args.reference, data['config'])
    recal_bam = _recal_BQ_score(data['final_bam'], sample, workdir, counts_file, args.reference, data['config'])
    cpg, snp = _call_vcf(recal_bam, sample, workdir, args.reference, data['config'])
    sort_snp = _correct_vcf(snp)
    snp = annotation.annotate_nongatk_vcf(sort_snp, [data['final_bam']],
                                          args.snp,
                                          args.reference, data['config'])
    return data
예제 #31
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file, assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            scalpel = config_utils.get_program("scalpel", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            cl = "{scalpel} --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}"
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # somatic
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "main/somatic." + min_cov + "x.indel.vcf"), config
            )
            # common
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common." + min_cov + "x.indel.vcf"), config
            )
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            cl2 = (
                "vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                " sed 's/sample_name/{paired.tumor_name}/g' | "
                "{vcfstreamsort} {compress_cmd} > {tx_out_file}"
            )
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
예제 #32
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_paired.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            # merge bed file regions as amplicon VarDict is only supported in single sample mode
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region, do_merge=True))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items):
                somatic_filter = ""
            else:
                somatic_filter = ("| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" %
                                  os.path.join(os.path.dirname(sys.executable), "py"))
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -M -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "{somatic_filter} | {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #33
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None):

    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """

    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]

    paired = get_paired_bams(align_bams, items)

    vcfsamplediff = config_utils.get_program("vcfsamplediff", config)

    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]

    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:

            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config["algorithm"], out_file, region))
            opts += " -f {}".format(ref_file)

            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            cl = (
                "{freebayes} --pooled-discrete --pvar 0.7"
                " --genotype-qualities {opts} {paired.tumor_bam}"
                " {paired.normal_bam} | {vcfsamplediff} -s VT"
                " {paired.normal_sample_name} {paired.tumor_sample_name}"
                " - >  {tx_out_file}"
            )

            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)

            cl = cl.format(**locals())

            do.run(cl, "Genotyping paired variants with FreeBayes", {})

        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"], ref_file, config)
    return ann_file
예제 #34
0
def _run_tumor_pindel_caller(align_bams,
                             items,
                             ref_file,
                             assoc_files,
                             region=None,
                             out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.
    Only attempts to detect small insertion/deletions and not larger structural events.
    :param align_bam: (list) bam files
    :param items: (dict) information from yaml
    :param ref_file: (str) genome in fasta format
    :param assoc_file: (dict) files for annotation
    :param region: (str or tupple) region to analyze
    :param out_file: (str) final vcf file
    :returns: (str) final vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam.append(paired.normal_bam)
        paired_name.append(paired.normal_name)
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            root_pindel = os.path.join(tmp_path, "pindelroot")
            pindel = config_utils.get_program("pindel", config)
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path,
                                          config)
            cmd = (
                "{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} " +
                "{opts} --report_inversions false --report_duplications false "
                "--report_long_insertions false --report_breakpoints false "
                "--report_interchromosomal_events false "
                "--max_range_index 2")
            do.run(cmd.format(**locals()), "Genotyping with pindel", {})
            out_file = _create_vcf(root_pindel, out_file, ref_file, items,
                                   paired)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #35
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes for paired tumor/normal samples.

    Sources of options for FreeBayes:
    mailing list: https://groups.google.com/d/msg/freebayes/dTWBtLyM4Vs/HAK_ZhJHguMJ
    mailing list: https://groups.google.com/forum/#!msg/freebayes/LLH7ZfZlVNs/63FdD31rrfEJ
    speedseq: https://github.com/cc2qe/speedseq/blob/e6729aa2589eca4e3a946f398c1a2bdc15a7300d/bin/speedseq#L916
    sga/freebayes: https://github.com/jts/sga-extra/blob/7e28caf71e8107b697f9be7162050e4fa259694b/
                   sga_generate_varcall_makefile.pl#L299
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for FreeBayes paired calling and filtering"

            freebayes = config_utils.get_program("freebayes", config)
            opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                opts = " ".join(opts)
                opts += " --min-repeat-entropy 1"
                opts += " --no-partial-observations"
                opts = _add_somatic_opts(opts, paired)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cl = ("{freebayes} -f {ref_file} {opts} "
                      "{paired.tumor_bam} {paired.normal_bam} "
                      """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                      "| {py_cl} -x 'bcbio.variation.freebayes.call_somatic(x)' "
                      "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                      "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                      "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                      "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                      "{compress_cmd} > {tx_out_file}")
                do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #36
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [config_utils.get_program("freebayes", config), "-v", tx_out_file, "-f", ref_file, "--pvar", "0.7"]
            for align_bam in align_bams:
                bam.index(align_bam, config)
                cl += ["-b", align_bam]
            cl += _freebayes_options_from_config(items, config["algorithm"], out_file, region)
            do.run(cl, "Genotyping with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files["dbsnp"], ref_file, config)
    return ann_file
예제 #37
0
def _run_freebayes_caller(align_bams,
                          items,
                          ref_file,
                          assoc_files,
                          region=None,
                          out_file=None,
                          somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(
                _freebayes_options_from_config(items, config, out_file,
                                               region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1"
            if somatic:
                opts = _add_somatic_opts(opts, somatic)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = (
                "{freebayes} -f {ref_file} {opts} {input_bams} | "
                "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                "bcftools view -a - 2> /dev/null | "
                "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #38
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")

            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            opts += " -f {}".format(ref_file)
            if "--min-alternate-fraction" not in opts and "-F" not in opts:
                # add minimum reportable allele frequency, for which FreeBayes defaults to 20
                min_af = float(utils.get_in(paired.tumor_config, ("algorithm", 
                                                                  "min_allele_fraction"),20)) / 100.0
                opts += " --min-alternate-fraction %s" % min_af
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one

            # NOTE: -s in vcfsamplediff (strict checking: i.e., require no
            # reads in the germline to call somatic) is not used as it is
            # too stringent
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl = ("{freebayes} --pooled-discrete --genotype-qualities "
                  "{opts} {paired.tumor_bam} {paired.normal_bam} "
                  "| {vcffilter} -f 'QUAL > 1' -s "
                  "| {vcfsamplediff} VT {paired.normal_name} {paired.tumor_name} - "
                  "{compress_cmd} >  {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
예제 #39
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts, no_target_regions = _freebayes_options_from_config(items, config, out_file, region)
            if no_target_regions:
                vcfutils.write_empty_vcf(tx_out_file, config, samples=[dd.get_sample_name(d) for d in items])
            else:
                opts = " ".join(opts)
                # Recommended options from 1000 genomes low-complexity evaluation
                # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
                opts += " --min-repeat-entropy 1"
                # Remove partial observations, which cause a preference for heterozygote calls
                # https://github.com/ekg/freebayes/issues/234#issuecomment-205331765
                opts += " --no-partial-observations"
                if somatic:
                    opts = _add_somatic_opts(opts, somatic)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                clean_fmt_cmd = _clean_freebayes_fmt_cl()
                py_cl = os.path.join(os.path.dirname(sys.executable), "py")
                cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} "
                       """| bcftools filter -i 'ALT="<*>" || QUAL > 5' """
                       "| {fix_ambig} | {clean_fmt_cmd} bcftools view -a - | "
                       "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                       "vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort | "
                       "vt normalize -n -r {ref_file} -q - | vcfuniqalleles "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #40
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #41
0
def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            vardict = config_utils.get_program("vardict", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            strandbias = "testsomatic.R"
            var2vcf = "var2vcf_somatic.pl"
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
            opts = " ".join(_vardict_options_from_config(items, config, out_file, region))
            coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
            # for deep targeted panels, require 50 worth of coverage
            var2vcf_opts = " -v 50 " if coverage_interval == "regional" else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("{vardict} -G {ref_file} -f {freq} "
                   "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                   "| {strandbias} "
                   "| {var2vcf} -N \"{paired.tumor_name}|{paired.normal_name}\" -f {freq} {var2vcf_opts} "
                   "| {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
예제 #42
0
def _run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                             region=None, out_file=None):
    """Detect indels with pindel in tumor/[normal] analysis.
    Only attempts to detect small insertion/deletions and not larger structural events.
    :param align_bam: (list) bam files
    :param items: (dict) information from yaml
    :param ref_file: (str) genome in fasta format
    :param assoc_file: (dict) files for annotation
    :param region: (str or tupple) region to analyze
    :param out_file: (str) final vcf file
    :returns: (str) final vcf file
    """
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if out_file is None:
        out_file = "%s-indels.vcf" % os.path.splitext(align_bams[0])[0]
    paired_bam = [paired.tumor_bam]
    paired_name = [paired.tumor_name]
    if paired.normal_bam:
        paired_bam.append(paired.normal_bam)
        paired_name.append(paired.normal_name)
    if not utils.file_exists(out_file):
        with tx_tmpdir(config) as tmp_path:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            root_pindel = os.path.join(tmp_path, "pindelroot")
            pindel = config_utils.get_program("pindel", config)
            opts = _pindel_options(items, config, out_file, region, tmp_path)
            tmp_input = _create_tmp_input(paired_bam, paired_name, tmp_path, config)
            cmd = ("{pindel} -f {ref_file} -i {tmp_input} -o {root_pindel} " +
                   "{opts} --report_inversions false --report_duplications false "
                   "--report_long_insertions false --report_breakpoints false "
                   "--report_interchromosomal_events false "
                   "--max_range_index 2")
            do.run(cmd.format(**locals()), "Genotyping with pindel", {})
            out_file = _create_vcf(root_pindel, out_file, ref_file,
                                   items, paired)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #43
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcffilter = config_utils.get_program("vcffilter", config)
            scalpel = config_utils.get_program("scalpel", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --ref {}".format(ref_file)
            opts += " --dir %s" % tmp_path
            min_cov = "5"  # minimum coverage (default 5)
            opts += " --mincov %s" % min_cov
            cl = ("{scalpel} --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            scalpel_tmp_file = os.path.join(tmp_path, "main/somatic." + min_cov + "x.indel.vcf")
            scalpel_tmp_file_common = os.path.join(tmp_path, "main/common." + min_cov + "x.indel.vcf")
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            cl2 = ("cat {scalpel_tmp_file} <(grep -vE '^#' {scalpel_tmp_file_common} | "
                   "sed 's/PASS/REJECT/g') | sed 's/sample_name/{paired.tumor_name}/g' | "
                   "{vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file
예제 #44
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, config)
            scalpel = config_utils.get_program("scalpel", config)
            vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config)
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = os.path.dirname(tx_out_file)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            cmd = ("{scalpel} --single {opts} --ref {ref_file} --bam {input_bams} ")
            # first run into temp folder
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants." + min_cov + "x.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | sed 's/sample_name/{sample_name_str}/g' | "
                   "{vcfallelicprimitives} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #45
0
def run_freebayes(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Detect SNPs and indels with FreeBayes.
    """
    config = items[0]["config"]
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cl = [config_utils.get_program("freebayes", config),
                  "-v", tx_out_file, "-f", ref_file,
                  "--use-mapping-quality", "--pvar", "0.7"]
            for align_bam in align_bams:
                broad_runner.run_fn("picard_index", align_bam)
                cl += ["-b", align_bam]
            cl += _freebayes_options_from_config(config["algorithm"], out_file, region)
            do.run(cl, "Genotyping with FreeBayes", {})
        _clean_freebayes_output(out_file)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.dbsnp,
                                               ref_file, config)
    return ann_file
예제 #46
0
def _run_freebayes_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None, somatic=None):
    """Detect SNPs and indels with FreeBayes.

    Performs post-filtering to remove very low quality variants which
    can cause issues feeding into GATK. Breaks variants into individual
    allelic primitives for analysis and evaluation.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            freebayes = config_utils.get_program("freebayes", config)
            vcffilter = config_utils.get_program("vcffilter", config)
            input_bams = " ".join("-b %s" % x for x in align_bams)
            opts = " ".join(_freebayes_options_from_config(items, config, out_file, region))
            # Recommended options from 1000 genomes low-complexity evaluation
            # https://groups.google.com/d/msg/freebayes/GvxIzjcpbas/1G6e3ArxQ4cJ
            opts += " --min-repeat-entropy 1"
            if somatic:
                opts = _add_somatic_opts(opts, somatic)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            fix_ambig = vcfutils.fix_ambiguous_cl()
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("{freebayes} -f {ref_file} {opts} {input_bams} | "
                   "{vcffilter} -f 'QUAL > 5' -s | {fix_ambig} | "
                   "bcftools view -a - 2> /dev/null | "
                   "{py_cl} -x 'bcbio.variation.freebayes.remove_missingalt(x)' | "
                   "vcfallelicprimitives --keep-geno | vcffixup - | vcfstreamsort | "
                   "vt normalize -n -r {ref_file} -q - 2> /dev/null | vcfuniqalleles "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "Genotyping with FreeBayes", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
예제 #47
0
def _run_freebayes_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect SNPs and indels with FreeBayes.

    This is used for paired tumor / normal samples.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                raise ValueError("Require both tumor and normal BAM files for FreeBayes cancer calling")
            config = items[0]["config"]
            vcfsamplediff = config_utils.get_program("vcfsamplediff", config)
            freebayes = config_utils.get_program("freebayes", config)
            opts = " ".join(
                _freebayes_options_from_config(items, config["algorithm"],
                                               out_file, region))
            opts += " -f {}".format(ref_file)
            # NOTE: The first sample name in the vcfsamplediff call is
            # the one supposed to be the *germline* one
            cl = ("{freebayes} --pooled-discrete --pvar 0.7"
                  " --genotype-qualities {opts} {paired.tumor_bam}"
                  " {paired.normal_bam} | {vcfsamplediff} -s VT"
                  " {paired.normal_name} {paired.tumor_name}"
                  " - >  {tx_out_file}")
            bam.index(paired.tumor_bam, config)
            bam.index(paired.normal_bam, config)
            do.run(cl.format(**locals()), "Genotyping paired variants with FreeBayes", {})
        clean_vcf_output(out_file, _clean_freebayes_output, "nodups")

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files["dbsnp"], ref_file,
                                               config)
    return ann_file