Example #1
0
def annotate_nongatk_vcf(orig_file,
                         bam_files,
                         dbsnp_file,
                         ref_file,
                         data,
                         out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.

    The input is bgzipped and indexed first. When no GATK runner is
    available, or the runner reports GATK4, only dbSNP annotation is applied
    through ``add_dbsnp`` (or the bgzipped input is returned untouched when
    no dbSNP file was given). Otherwise GATK's VariantAnnotator is run over
    the variants using the supplied BAM files.

    Args:
        orig_file: VCF file to annotate.
        bam_files: iterable of BAM paths, each passed to GATK with ``-I``.
        dbsnp_file: dbSNP VCF, or a false value to skip dbSNP annotation.
        ref_file: reference genome passed to GATK with ``-R``.
        data: sample dictionary; ``data["config"]`` configures the tools.
        out_file: optional output path; defaults to the bgzipped input name
            with ``-gatkann`` placed between its base name and extension.

    Returns:
        Path to the annotated VCF, or the bgzipped input when nothing could
        be annotated.
    """
    config = data["config"]
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    runner = broad.runner_from_config_safe(config)
    can_use_gatk = (runner and runner.has_gatk()
                    and runner.gatk_type() != "gatk4")
    if not can_use_gatk:
        # Without a pre-GATK4 runner the only annotation left is dbSNP.
        if not dbsnp_file:
            return orig_file
        return add_dbsnp(orig_file, dbsnp_file, data, out_file)

    if out_file is None:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            stale_idx = orig_file + ".idx"
            if os.path.exists(stale_idx) and not utils.file_exists(stale_idx):
                os.remove(stale_idx)
            annotations = get_gatk_annotations(config, include_depth=False)
            params = ["-T", "VariantAnnotator",
                      "-R", ref_file,
                      "--variant", orig_file,
                      "--out", tx_out_file,
                      "-L", orig_file]
            if dbsnp_file:
                params.extend(["--dbsnp", dbsnp_file])
            for bam_file in bam_files:
                params.extend(["-I", bam_file])
            for annotation in annotations:
                params.extend(["-A", annotation])
            misencoded_flags = ("--allow_potentially_misencoded_quality_scores",
                                "-allowPotentiallyMisencodedQuals")
            if not any(flag in params for flag in misencoded_flags):
                params.append("--allow_potentially_misencoded_quality_scores")
            # be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq)
            # start by removing existing -U or --unsafe opts
            # (if another option is added to Gatk that starts with -U... this may create a bug)
            existing_unsafe = [p for p in params
                               if p.startswith(("-U", "--unsafe"))]
            for opt in existing_unsafe:
                pos = params.index(opt)
                # A bare flag carries its value in the following element;
                # drop that value before dropping the flag itself.
                if opt.strip() in ("-U", "--unsafe"):
                    params.pop(pos + 1)
                params.pop(pos)
            params += ["-U", "ALL"]
            gatk_runner = broad.runner_from_config(config)
            gatk_runner.run_gatk(params)
    vcfutils.bgzip_and_index(out_file, config)
    return out_file
Example #2
0
def variants(data, out_dir):
    """Variants QC metrics.

    Dumps the result of ``_run_bcftools`` as YAML inside ``out_dir`` and,
    when an alignment BAM and a GATK runner are available, annotates the
    sample VCF with GC content and coverage, extracts those values with
    ``bcftools query`` and summarizes them through ``_summary_variants``.

    Args:
        data: sample dictionary. Must contain ``"variants"``; ``"vrn_file"``
            and ``"config"`` are read for the GC/depth step.
        out_dir: output directory, passed through ``safe_makedir``. All
            output file names are relative to it.

    Returns:
        ``None`` when ``data`` lacks ``"variants"`` or ``"vrn_file"`` or has
        no coverage BED file; the summary file name (relative to
        ``out_dir``) when that file already exists. When the summary is
        produced during this call the function falls through and returns
        ``None``.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Bare file names: they resolve against work_dir inside the chdir below.
    bcf_out = sample + "_bcbio_variants_stats.txt"
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats,
                               out_handle,
                               default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # From here on bed_file is the variant regions BED (used by bcftools);
        # GATK is restricted to the cleaned coverage BED instead.
        bed_file = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            # The annotated VCF is an intermediate: it is only rebuilt when
            # the table parsed from it is missing.
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() instead of the Python 2-only
                        # ``print >> handle`` statement; same bytes on disk.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit keywords rather than **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=bed_file,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
Example #3
0
def variants(data, out_dir):
    """Variants QC metrics.

    Dumps the result of ``_run_bcftools`` as YAML inside ``out_dir`` and,
    when an alignment BAM and a GATK runner are available, annotates the
    sample VCF with GC content and coverage, extracts those values with
    ``bcftools query`` and summarizes them through ``_summary_variants``.

    Args:
        data: sample dictionary. Must contain ``"variants"``; ``"vrn_file"``
            and ``"config"`` are read for the GC/depth step.
        out_dir: output directory, passed through ``safe_makedir``. All
            output file names are relative to it.

    Returns:
        ``None`` when ``data`` lacks ``"variants"`` or ``"vrn_file"`` or has
        no coverage BED file; the summary file name (relative to
        ``out_dir``) when that file already exists. When the summary is
        produced during this call the function falls through and returns
        ``None``.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Bare file names: they resolve against work_dir inside the chdir below.
    bcf_out = sample + "_bcbio_variants_stats.txt"
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None

        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # From here on bed_file is the variant regions BED (used by bcftools);
        # GATK is restricted to the cleaned coverage BED instead.
        bed_file = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            # The annotated VCF is an intermediate: it is only rebuilt when
            # the table parsed from it is missing.
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() instead of the Python 2-only
                        # ``print >> handle`` statement; same bytes on disk.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit keywords rather than **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=bed_file,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                remove_plus(cg_file)
Example #4
0
def variants(data):
    """Collect GC content and depth at called variants for a sample.

    Working inside ``<work_dir>/report/variants``, annotates the sample VCF
    with GATK VariantAnnotator (GCContent and Coverage) and writes a
    tab-separated ``CG / depth / sample`` table extracted with
    ``bcftools query``. Existing outputs are reused.

    Args:
        data: sample dictionary; ``"vrn_file"`` and ``"config"`` are read.

    Returns:
        ``data``, unchanged. Nothing is computed when ``data`` has no
        ``"vrn_file"``, no coverage file, no BAM or no usable GATK runner.
    """
    if "vrn_file" not in data or not dd.get_coverage(data):
        return data

    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        # Bare file names: they resolve against work_dir because of chdir.
        cg_file = sample + "_with-gc.vcf.gz"
        parse_file = sample + "_gc-depth-parse.tsv"
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator",
                        "-R", ref_file,
                        "-L", bed_file,
                        "-I", in_bam,
                        "-A", "GCContent",
                        "-A", "Coverage",
                        "--variant", in_vcf,
                        "--out", tx_out,
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        # write() instead of the Python 2-only
                        # ``print >> handle`` statement; same bytes on disk.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}"
                    )
                    # Explicit keywords rather than **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=bed_file,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug("parsing coverage: %s" % sample)
        return data
Example #5
0
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data,
                         out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.

    The input is bgzipped and indexed first. When no GATK runner is
    available, or the runner reports GATK4, only dbSNP annotation is applied
    through ``add_dbsnp`` (or the bgzipped input is returned untouched when
    no dbSNP file was given). Otherwise GATK's VariantAnnotator is run over
    the variants using the supplied BAM files.

    Args:
        orig_file: VCF file to annotate.
        bam_files: iterable of BAM paths, each passed to GATK with ``-I``.
        dbsnp_file: dbSNP VCF, or a false value to skip dbSNP annotation.
        ref_file: reference genome passed to GATK with ``-R``.
        data: sample dictionary; ``data["config"]`` configures the tools.
        out_file: optional output path; defaults to the bgzipped input name
            with ``-gatkann`` placed between its base name and extension.

    Returns:
        Path to the annotated VCF, or the bgzipped input when nothing could
        be annotated.
    """
    config = data["config"]
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    runner = broad.runner_from_config_safe(config)
    can_use_gatk = (runner and runner.has_gatk()
                    and runner.gatk_type() != "gatk4")
    if not can_use_gatk:
        # Without a pre-GATK4 runner the only annotation left is dbSNP.
        if not dbsnp_file:
            return orig_file
        return add_dbsnp(orig_file, dbsnp_file, data, out_file)

    if out_file is None:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            stale_idx = orig_file + ".idx"
            if os.path.exists(stale_idx) and not utils.file_exists(stale_idx):
                os.remove(stale_idx)
            annotations = get_gatk_annotations(config, include_depth=False)
            params = [
                "-T", "VariantAnnotator", "-R", ref_file, "--variant",
                orig_file, "--out", tx_out_file, "-L", orig_file
            ]
            if dbsnp_file:
                params.extend(["--dbsnp", dbsnp_file])
            for bam_file in bam_files:
                params.extend(["-I", bam_file])
            for annotation in annotations:
                params.extend(["-A", annotation])
            misencoded_flags = ("--allow_potentially_misencoded_quality_scores",
                                "-allowPotentiallyMisencodedQuals")
            if not any(flag in params for flag in misencoded_flags):
                params.append("--allow_potentially_misencoded_quality_scores")
            # be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq)
            # start by removing existing -U or --unsafe opts
            # (if another option is added to Gatk that starts with -U... this may create a bug)
            existing_unsafe = [p for p in params
                               if p.startswith(("-U", "--unsafe"))]
            for opt in existing_unsafe:
                pos = params.index(opt)
                # A bare flag carries its value in the following element;
                # drop that value before dropping the flag itself.
                if opt.strip() in ("-U", "--unsafe"):
                    params.pop(pos + 1)
                params.pop(pos)
            params += ["-U", "ALL"]
            gatk_runner = broad.runner_from_config(config)
            gatk_runner.run_gatk(params)
    vcfutils.bgzip_and_index(out_file, config)
    return out_file
Example #6
0
def variants(data):
    """Summarize GC content and depth at called variants for a sample.

    Working inside ``<work_dir>/report/variants``, annotates the sample VCF
    with GATK VariantAnnotator (GCContent and Coverage), extracts a
    tab-separated ``CG / depth / sample`` table with ``bcftools query`` and
    passes it to ``_summary_variants``. The annotated VCF is deleted once
    both the table and the summary exist.

    Args:
        data: sample dictionary; ``"vrn_file"`` and ``"config"`` are read.

    Returns:
        ``data``, unchanged. Nothing is computed when ``data`` has no
        ``"vrn_file"``, no coverage file, no BAM or no usable GATK runner,
        or when the summary file already exists.
    """
    if "vrn_file" not in data or not dd.get_coverage(data):
        return data

    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    # Bare file names: they resolve against work_dir inside the chdir below.
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator",
                        "-R", ref_file,
                        "-L", bed_file,
                        "-I", in_bam,
                        "-A", "GCContent",
                        "-A", "Coverage",
                        "--variant", in_vcf,
                        "--out", tx_out,
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() instead of the Python 2-only
                        # ``print >> handle`` statement; same bytes on disk.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit keywords rather than **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=bed_file,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
        return data