Example #1
def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.
    """
    from bcbio.variation import bedutils
    # list() so the ENCODE BED can be appended below (filter returns an iterator on Python 3)
    highdepth_beds = list(filter(lambda x: x is not None,
                                 set(tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items)))
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if len(highdepth_beds) > 0:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(**locals()), "Remove high depth regions")
                else:
                    utils.symlink_plus(in_file, out_file)
    return out_file
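
Every example on this page hinges on utils.splitext_plus, which behaves like os.path.splitext but keeps compressed double extensions together. A minimal sketch of the assumed behavior (not necessarily the project's exact source):

import os

def splitext_plus(f):
    """Split off the file extension, keeping zipped double extensions together."""
    base, ext = os.path.splitext(f)
    if ext in (".gz", ".bz2", ".zip"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

# splitext_plus("sample.vcf.gz") -> ("sample", ".vcf.gz")
# so "%s-glimit%s" % splitext_plus("regions.bed") -> "regions-glimit.bed"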
Example #2
def _combine_sample_regions_batch(batch, items):
    """Combine sample regions within a group of batched samples.
    """
    config = items[0]["config"]
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "regions"))
    analysis_file = os.path.join(work_dir, "%s-analysis_blocks.bed" % batch)
    no_analysis_file = os.path.join(work_dir, "%s-noanalysis_blocks.bed" % batch)
    if not utils.file_exists(analysis_file) or _needs_region_update(analysis_file, items):
        # Combine all nblocks into a final set of intersecting regions
        # without callable bases. HT @brentp for intersection approach
        # https://groups.google.com/forum/?fromgroups#!topic/bedtools-discuss/qA9wK4zN8do
        bed_regions = [pybedtools.BedTool(x["regions"]["nblock"])
                       for x in items if "regions" in x]
        if len(bed_regions) == 0:
            analysis_file, no_analysis_file = None, None
        else:
            with file_transaction(items[0], analysis_file, no_analysis_file) as (tx_afile, tx_noafile):
                nblock_regions = reduce(operator.add, bed_regions).saveas(
                    "%s-nblock%s" % utils.splitext_plus(tx_afile))
                ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                ref_regions = get_ref_bedtool(ref_file, config)
                min_n_size = int(config["algorithm"].get("nomap_split_size", 100))
                ec_regions = _combine_excessive_coverage(items, ref_regions, min_n_size,
                                                         tx_afile)
                if len(ec_regions) > 0:
                    nblock_regions = nblock_regions.cat(ec_regions, d=min_n_size)
                block_filter = NBlockRegionPicker(ref_regions, config)
                final_nblock_regions = nblock_regions.filter(
                    block_filter.include_block).each(block_filter.expand_block).saveas(
                        "%s-nblockfinal%s" % utils.splitext_plus(tx_afile))
                final_regions = ref_regions.subtract(final_nblock_regions).merge(d=min_n_size)
                _write_bed_regions(items[0], final_regions, tx_afile, tx_noafile)
    return analysis_file, no_analysis_file
Example #3
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = ("gunzip -c {tx_orig_file} | "
                   "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                   "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                   "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                   "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                   "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                   "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
Example #4
def _evaluate_multi(callers, truth_svtypes, ensemble, call_beds, data):
    out_file = "%s-validate.csv" % utils.splitext_plus(ensemble)[0]
    df_file = "%s-validate-df.csv" % utils.splitext_plus(ensemble)[0]
    if not utils.file_uptodate(out_file, ensemble) or not utils.file_uptodate(df_file, ensemble):
        with open(out_file, "w") as out_handle:
            with open(df_file, "w") as df_out_handle:
                writer = csv.writer(out_handle)
                dfwriter = csv.writer(df_out_handle)
                total_callers = callers_by_event(ensemble, data)
                writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
                dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
                for svtype, truth in truth_svtypes.items():
                    for size in EVENT_SIZES:
                        str_size = "%s-%s" % size
                        for caller in (x for x in callers if x in total_callers[svtype] or x == "sv-ensemble"):
                            try:
                                call_bed = call_beds[caller]
                            except KeyError:
                                assert caller == "sv-ensemble", caller
                                call_bed = ensemble
                            evalout = _evaluate_one(caller, svtype, size, call_bed, truth, data)
                            writer.writerow([svtype, str_size, caller,
                                             evalout["sensitivity"]["label"], evalout["precision"]["label"]])
                            for metric in ["sensitivity", "precision"]:
                                dfwriter.writerow([svtype, str_size, caller, metric,
                                                   evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
Example #5
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by re-headering the first file.
    chrM is called as haploid, so its records lack the PID phasing key/value pair
    in FORMAT; initial chrM samples then cause errors during concatenation because
    headers are not merged. We work around this by updating the first file's header.
    """
    from bcbio.variation import ploidy
    c, base_file = exist_files[0]
    replace_file = base_file
    items = [{"config": config}]
    if ploidy.get_ploidy(items, region=(c, 1, 2)) == 1:
        for c, x in exist_files[1:]:
            if ploidy.get_ploidy(items, (c, 1, 2)) > 1:
                replace_file = x
                break
    base_fix_file = os.path.join(os.path.dirname(out_file),
                                 "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file)))
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        do.run("zgrep ^# %s > %s"
                % (replace_file, header_file), "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = []
        if "options" in resources:
            ropts += [str(x) for x in resources.get("options", [])]
        do.run("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
               (utils.get_java_clprep(), header_file, base_file, base_fix_file, " ".join(ropts)),
               "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    return [base_fix_file] + [x for (c, x) in exist_files[1:]]
Example #6
def add_genome_context(orig_file, data):
    """Annotate a file with annotations of genome context using vcfanno.
    """
    out_file = "%s-context.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            config_file = "%s.toml" % (utils.splitext_plus(tx_out_file)[0])
            with open(config_file, "w") as out_handle:
                all_names = []
                for fname in dd.get_genome_context_files(data):
                    bt = pybedtools.BedTool(fname)
                    if bt.field_count() >= 4:
                        d, base = os.path.split(fname)
                        _, prefix = os.path.split(d)
                        name = "%s_%s" % (prefix, utils.splitext_plus(base)[0])
                        out_handle.write("[[annotation]]\n")
                        out_handle.write('file = "%s"\n' % fname)
                        out_handle.write("columns = [4]\n")
                        out_handle.write('names = ["%s"]\n' % name)
                        out_handle.write('ops = ["uniq"]\n')
                        all_names.append(name)
                out_handle.write("[[postannotation]]\n")
                out_handle.write("fields = [%s]\n" % (", ".join(['"%s"' % n for n in all_names])))
                out_handle.write('name = "genome_context"\n')
                out_handle.write('op = "concat"\n')
                out_handle.write('type = "String"\n')
            cmd = "vcfanno {config_file} {orig_file} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Annotate with problem annotations", data)
    return vcfutils.bgzip_and_index(out_file, data["config"])
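
For a single genome context file, say a hypothetical /ref/problem_regions/repeats/LCR.bed.gz, the TOML written above would look roughly like:

[[annotation]]
file = "/ref/problem_regions/repeats/LCR.bed.gz"
columns = [4]
names = ["repeats_LCR"]
ops = ["uniq"]

[[postannotation]]
fields = ["repeats_LCR"]
name = "genome_context"
op = "concat"
type = "String"

The annotation name joins the parent directory ("repeats") with the file stem ("LCR"), matching the prefix/base logic in the loop.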
Example #7
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += ["--json", report_file, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #8
def _run_snpeff(snp_in, out_format, data):
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), "Did not find %s snpEff genome data in %s" % (
        snpeff_db,
        datadir,
    )
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(out_file) as tx_out_file:
            cmd = (
                "{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example #9
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, skipping if snpEff database not present.
    """
    snpeff_db, datadir = get_db(data)
    if not snpeff_db:
        return None

    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    ext = utils.splitext_plus(snp_in)[1] if out_format == "vcf" else ".tsv"
    out_file = "%s-effects%s" % (utils.splitext_plus(snp_in)[0], ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        if ext.endswith(".gz"):
            bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"])
        else:
            bgzip_cmd = ""
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(**locals()), "snpEff effects", data)
    if ext.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example #10
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data
    else:
        raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
        out_plot_files = ["%s%s" % (utils.splitext_plus(raw_file)[0], ext)
                          for ext in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
        if not utils.file_uptodate(raw_file, vrn_file) and not utils.file_uptodate(raw_file + ".gz", vrn_file):
            with file_transaction(items[0], raw_file) as tx_out_file:
                # Does not apply --qcSummary plotting due to slow runtimes
                cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                       "--tempFolder", os.path.dirname(tx_out_file),
                       vrn_file, align_bam, ref_file, tx_out_file]
                do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
                for out_plot in out_plot_files:
                    tx_plot_file = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0], "plots",
                                                os.path.basename(out_plot))
                    if utils.file_exists(tx_plot_file):
                        shutil.move(tx_plot_file, out_plot)
        raw_file = vcfutils.bgzip_and_index(raw_file, items[0]["config"])
        data["vrn_file"] = _filter_to_info(raw_file, items[0])
        out_plot_files = [x for x in out_plot_files if utils.file_exists(x)]
        data["damage_plots"] = out_plot_files
        return data
Example #11
def select_regions(args):
    """
    select regions and create coverage plots
    """
    assert args.files, "Need a set of fastq files"
    assert args.out, "Need --out"
    region = os.path.abspath(args.region)
    workdir = 'select'
    safe_makedir(workdir)
    out_file = os.path.join(workdir, splitext_plus(args.out)[0] + "_cpg.bed")
    out_snp_file = os.path.join(workdir, splitext_plus(args.out)[0] + '_snp.bed')
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            with open(tx_out, 'w') as out_handle:
                # print >> out_handle, "chrom\tstart\tend\tcu\tcm\tstrand\tgene\tsample"
                for in_vcf in args.files:
                    snp_file = in_vcf.replace("rawcpg", "rawsnp")
                    sample = splitext_plus(os.path.basename(in_vcf))[0].split("_")[0]
                    get_het(snp_file, region, sample, out_snp_file)
                    res = pybedtools.BedTool(in_vcf).intersect(b=region, wo=True)
                    # cmd = ("bedtools intersect -u -a {in_vcf} -b {region} > {tx_tmp}")
                    # do.run(cmd.format(**locals()), "selecting %s" % in_vcf)

                    for record in res:
                        gene = record[-2]
                        chrom, pos, info, header, frmt = record[0], int(record[1]), record[7], record[8], record[9]
                        cs = info.split(';')[0].split('=')[1]
                        frmt = dict(zip(header.split(":"), frmt.split(':')))
                        if is_good_cpg(frmt):
                            tag = "%s-%s-%s-%s" % (frmt['CU'], frmt['CM'], gene, sample)
                            out_handle.write("%s\t%s\t%s\t%s\t.\t%s\n" % (chrom, pos, pos + 1, tag, cs))
Example #12
def _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval, items):
    """Estimate noise of a sample using a flat background.

    Only used for panel/targeted data due to memory issues with whole genome
    samples.

    """
    if cov_interval == "genome":
        return cnns
    target_cnn = [x["file"] for x in cnns if x["cnntype"] == "target"][0]
    background_file = "%s-flatbackground.cnn" % utils.splitext_plus(target_cnn)[0]
    background_file = cnvkit_background([], background_file, items, target_bed, antitarget_bed)
    cnr_file, data = _cnvkit_fix_base(cnns, background_file, items, "-flatbackground")
    cns_file = _cnvkit_segment(cnr_file, cov_interval, data)
    metrics_file = "%s-metrics.txt" % utils.splitext_plus(target_cnn)[0]
    if not utils.file_exists(metrics_file):
        with file_transaction(data, metrics_file) as tx_metrics_file:
            cmd = [_get_cmd(), "metrics", "-o", tx_metrics_file, "-s", cns_file, "--", cnr_file]
            do.run(_prep_cmd(cmd, tx_metrics_file), "CNVkit metrics")
    metrics = _read_metrics_file(metrics_file)
    out = []
    for cnn in cnns:
        cnn["metrics"] = metrics
        out.append(cnn)
    return out
Example #13
def _get_files(data):
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "align", dd.get_sample_name(data)))
    out_file = "%s-highdepth.bed" % os.path.join(out_dir, utils.splitext_plus(os.path.basename(work_bam))[0])
    stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0]
    return work_bam, out_file, stats_file
Example #14
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
Example #15
def run(vcf, conf_fns, lua_fns, data, basepath=None, decomposed=False):
    """Annotate a VCF file using vcfanno (https://github.com/brentp/vcfanno)

    decomposed -- if set to true we'll convert allele-based output into single values
      to match alleles and make it compatible with vcf2db
      (https://github.com/quinlan-lab/vcf2db/issues/14)
    """
    conf_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    lua_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    ext = "-annotated-%s" % utils.splitext_plus(os.path.basename(conf_fns[0]))[0]
    if vcf.find(ext) > 0:
        out_file = vcf
    else:
        out_file = "%s%s.vcf.gz" % (utils.splitext_plus(vcf)[0], ext)
    if not utils.file_exists(out_file):
        vcfanno = config_utils.get_program("vcfanno", data)
        with file_transaction(out_file) as tx_out_file:
            conffn = _combine_files(conf_fns, out_file, data, basepath is None)
            luafn = _combine_files(lua_fns, out_file, data, False)
            luaflag = "-lua {0}".format(luafn) if luafn and utils.file_exists(luafn) else ""
            basepathflag = "-base-path {0}".format(basepath) if basepath else ""
            cores = dd.get_num_cores(data)
            post_ann = "sed -e 's/Number=A/Number=1/g' |" if decomposed else ""
            cmd = ("{vcfanno} -p {cores} {luaflag} {basepathflag} {conffn} {vcf} "
                   "| {post_ann} bgzip -c > {tx_out_file}")
            message = "Annotating {vcf} with vcfanno, using {conffn}".format(**locals())
            do.run(cmd.format(**locals()), message)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #16
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #17
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
Example #18
def _shared_variant_filtration(filter_type, snp_file, ref_file, vrn_files, variantcaller):
    """Share functionality for filtering variants.
    """
    recal_file = "{base}.recal".format(base=utils.splitext_plus(snp_file)[0])
    tranches_file = "{base}.tranches".format(base=utils.splitext_plus(snp_file)[0])
    params = ["-T", "VariantRecalibrator",
              "-R", ref_file,
              "--input", snp_file,
              "--mode", filter_type,
              "-an", "DP",
              "-an", "FS",
              "-an", "ReadPosRankSum",
              "-an", "MQRankSum"]
    if filter_type in ["SNP", "BOTH"]:
        # Haplotype Score no longer calculated for indels as of GATK 2.4
        # and only used for GATK Unified Genotyper calls
        if variantcaller == "gatk":
            params.extend(["-an", "HaplotypeScore"])
        for name, train_info in [("train_hapmap", "known=false,training=true,truth=true,prior=15.0"),
                                 ("train_1000g_omni", "known=false,training=true,truth=false,prior=12.0"),
                                 ("dbsnp", "known=true,training=false,truth=false,prior=8.0")]:
            if name in vrn_files:
                params.extend(["-resource:%s,VCF,%s" % (name.replace("train_", ""), train_info),
                               vrn_files[name]])
    if filter_type in ["INDEL", "BOTH"]:
        params.extend(
            ["-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0",
             vrn_files["train_indels"]])
    return params, recal_file, tranches_file
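
As a concrete rendering, when vrn_files contains a train_hapmap entry the SNP branch extends params with (file path hypothetical):

-resource:hapmap,VCF,known=false,training=true,truth=true,prior=15.0 /resources/hapmap_3.3.vcf

since name.replace("train_", "") turns train_hapmap into the hapmap resource label.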
Example #19
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(raw_file) as tx_raw_file:
                    ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                    cmd = [
                        sys.executable,
                        tovcf_script,
                        "-c",
                        sconfig_file,
                        "-f",
                        ref_file,
                        "-b",
                        bedpe_file,
                        "-o",
                        tx_raw_file,
                    ]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            prep_file = vcfutils.sort_by_ref(raw_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file
Example #20
def _filter_by_normal(tumor_counts, normal_counts, data):
    """Filter count files based on normal frequency and median depth, avoiding high depth regions.

    For frequency, restricts normal positions to allele frequencies between 0.4 and 0.65.

    For depth, matches approach used in AMBER to try and avoid problematic genomic regions
    with high count in the normal:
    https://github.com/hartwigmedical/hmftools/tree/master/amber#usage
    """
    from bcbio.heterogeneity import bubbletree
    fparams = bubbletree.NORMAL_FILTER_PARAMS
    tumor_out = "%s-normfilter%s" % utils.splitext_plus(tumor_counts)
    normal_out = "%s-normfilter%s" % utils.splitext_plus(normal_counts)
    if not utils.file_uptodate(tumor_out, tumor_counts):
        with file_transaction(data, tumor_out, normal_out) as (tx_tumor_out, tx_normal_out):
            median_depth = _get_normal_median_depth(normal_counts)
            min_normal_depth = median_depth * fparams["min_depth_percent"]
            max_normal_depth = median_depth * fparams["max_depth_percent"]
            with open(tumor_counts) as tumor_handle:
                with open(normal_counts) as normal_handle:
                    with open(tx_tumor_out, "w") as tumor_out_handle:
                        with open(tx_normal_out, "w") as normal_out_handle:
                            header = None
                            for t, n in zip(tumor_handle, normal_handle):
                                if header is None:
                                    if not n.startswith("@"):
                                        header = n.strip().split()
                                    tumor_out_handle.write(t)
                                    normal_out_handle.write(n)
                                elif (_normal_passes_depth(header, n, min_normal_depth, max_normal_depth) and
                                      _normal_passes_freq(header, n, fparams)):
                                    tumor_out_handle.write(t)
                                    normal_out_handle.write(n)
    return tumor_out, normal_out
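
A minimal sketch of the heterozygous-frequency test the docstring describes, assuming NORMAL_FILTER_PARAMS supplies the 0.4/0.65 bounds (names below are illustrative, not bubbletree's actual fields):

def passes_het_freq(ref_count, alt_count, min_freq=0.4, max_freq=0.65):
    # keep normal positions whose B-allele frequency sits in the heterozygous band
    depth = ref_count + alt_count
    freq = alt_count / float(depth) if depth else 0.0
    return min_freq <= freq <= max_freq

# passes_het_freq(55, 45) -> True  (freq 0.45)
# passes_het_freq(90, 10) -> False (freq 0.10)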
Example #21
def subset_by_supported(input_file, get_coords, calls_by_name, work_dir, data,
                        headers=("#",)):
    """Limit CNVkit input to calls with support from another caller.

    get_coords is a function that returns chrom, start, end from a line of the
    input_file, allowing handling of multiple input file types.
    """
    support_files = [(c, tz.get_in([c, "vrn_file"], calls_by_name))
                     for c in ensemble.SUBSET_BY_SUPPORT["cnvkit"]]
    support_files = [(c, f) for (c, f) in support_files if f and vcfutils.vcf_has_variants(f)]
    if len(support_files) == 0:
        return input_file
    else:
        out_file = os.path.join(work_dir, "%s-havesupport%s" %
                                utils.splitext_plus(os.path.basename(input_file)))
        if not utils.file_uptodate(out_file, input_file):
            input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
            pass_coords = set([])
            with file_transaction(data, out_file) as tx_out_file:
                support_beds = " ".join([_sv_vcf_to_bed(f, c, out_file) for c, f in support_files])
                tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
                cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
                do.run(cmd.format(**locals()), "Intersect CNVs with support files")
                for r in pybedtools.BedTool(tmp_cmp_bed):
                    pass_coords.add((str(r.chrom), str(r.start), str(r.stop)))
                with open(input_file) as in_handle:
                    with open(tx_out_file, "w") as out_handle:
                        for line in in_handle:
                            passes = True
                            if not line.startswith(headers):
                                passes = get_coords(line) in pass_coords
                            if passes:
                                out_handle.write(line)
        return out_file
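
For a plain BED input, a suitable get_coords can simply echo the first three columns as strings, matching the string tuples stored in pass_coords (a sketch):

def bed_coords(line):
    chrom, start, end = line.rstrip("\n").split("\t")[:3]
    return (chrom, start, end)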
Example #22
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff  --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) \
                                    else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    # keep a bare filename here so the os.path.join below does not double the directory
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
                        else out_gff_fn
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
Example #23
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    fnames = []
    is_paired = False
    for region in regions:
        rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
        out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" %
                                              (base_name, rext, fext))
                                 for fext in ["s1", "p1", "p2"]]
        if not utils.file_exists(out_p1):
            with file_transaction(out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
                sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
                cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                       "gz=1 collate=1 colsbs={max_mem} "
                       "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                       "reference={ref_file}")
                if region:
                    cmd += " ranges='{region}'"
                do.run(cmd.format(**locals()), "CRAM to fastq %s" % region if region else "")
        if is_paired or not _is_gzip_empty(out_p1):
            fnames.append((out_p1, out_p2))
            is_paired = True
        else:
            fnames.append((out_s,))
    return fnames
Example #24
def _prioritize_vcf(caller, vcf_file, prioritize_by, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                cmd = "simple_sv_annotation.py -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(simple_vcf, data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = (
                "zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                """'{{if (($7 == "PASS" || $7 == ".")) """
                "print CALLER,SNAME,$1,$2,I$END,I$SVTYPE,I$KNOWN,I$LOF,I$SIMPLE_ANN,"
                "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}"
            )
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example #25
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF
    """
    if remove_oldeffects:
        out_file = "%s-noeff-nomultiallelic%s" % utils.splitext_plus(in_file)
    else:
        out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _normalize(in_file, data, passonly=passonly,
                                       normalize_indels=normalize_indels,
                                       split_biallelic=split_biallelic,
                                       remove_oldeffects=remove_oldeffects,
                                       nonrefonly=nonrefonly,
                                       work_dir=work_dir)
            if rerun_effects:
                ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
                if ann_ma_file:
                    ready_ma_file = ann_ma_file
            utils.symlink_plus(ready_ma_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #26
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                shutil.copy(in_file, out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(**locals()), "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
Example #27
def combine_pairs(input_files):
    """ calls files pairs if they are completely the same except
    for one has _1 and the other has _2 returns a list of tuples
    of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3"])

    pairs = []
    used = set([])
    for in_file in input_files:
        if in_file in used:
            continue
        for comp_file in input_files:
            if comp_file in used or comp_file == in_file:
                continue
            a = rstrip_extra(utils.splitext_plus(os.path.basename(in_file))[0])
            b = rstrip_extra(utils.splitext_plus(os.path.basename(comp_file))[0])
            if len(a) != len(b):
                continue
            s = dif(a, b)
            # no differences, so it's the same file stem
            if len(s) == 0:
                logger.error("%s and %s have the same stem, so we don't know "
                             "how to assign it to the sample data in the CSV. To "
                             "get around this you can rename one of the files. "
                             "If they are meant to be the same sample run in two "
                             "lanes, combine them first with the "
                             "bcbio_prepare_samples.py script."
                             "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                             % (in_file, comp_file))
                sys.exit(1)
            if len(s) > 1:
                continue  # skip if there is more than 1 difference
            if (a[s[0]] in PAIR_FILE_IDENTIFIERS and
                  b[s[0]] in PAIR_FILE_IDENTIFIERS):
                # if the 1/2 isn't the last digit before a separator, skip
                # this skips stuff like 2P 2A, often denoting replicates, not
                # read pairings
                if len(b) > (s[0] + 1):
                    if (b[s[0]+1] not in ("_", "-", ".")):
                        continue
                # only pair if the 1/2 is prefaced with R or a separator
                if b[s[0] - 1] in ("R", "_", "-", "."):
                    used.add(in_file)
                    used.add(comp_file)
                    if b[s[0]] > a[s[0]]:
                        pairs.append([in_file, comp_file])
                    else:
                        pairs.append([comp_file, in_file])
                    break
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
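
The expected pairing behavior on typical Illumina-style names, assuming rstrip_extra and dif strip trailing extras and return the differing character positions (file names hypothetical):

files = ["S1_R1.fastq.gz", "S1_R2.fastq.gz", "S2.fastq.gz"]
combine_pairs(files)
# -> [["S1_R1.fastq.gz", "S1_R2.fastq.gz"], ["S2.fastq.gz"]]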
Example #28
def _collapse(in_file):
    cmd = "seqcluster collapse -f {in_file} -o {out_dir}"
    basename = splitext_plus(op.basename(in_file))[0]
    out_file = splitext_plus(in_file)[0] + "_trimmed.fastq"
    if not utils.file_exists(out_file):
        with tx_tmpdir() as out_dir:
            tx_out_file = op.join(out_dir, basename + "_trimmed.fastq")
            do.run(cmd.format(**locals()), "collapse")
            shutil.move(tx_out_file, out_file)
    return out_file
Example #29
def _gene_closest(orig_bed, gene_gtf):
    """Calculate the closest transcript to events in the input BED file.
    """
    sorted_gtf = "%s-sort.gtf" % utils.splitext_plus(gene_gtf)[0]
    if not utils.file_exists(sorted_gtf):
        cmd = ("zcat {gene_gtf} | grep -v ^# "
               "| sort -k1,1 -k4,4n -k 5,5n > {sorted_gtf}")
        do.run(cmd.format(**locals()), "Sort input GTF file")
    out_file = "%s-ann%s" % utils.splitext_plus(orig_bed)
    cmd = "bedtools closest -d -t first -a {orig_bed} -b {sorted_gtf} > {out_file}"
    do.run(cmd.format(**locals()), "Identify closest gene")
Example #30
def _combine_excessive_coverage(samples, ref_regions, min_n_size, tmp_outfile):
    """Provide a global set of regions with excessive coverage to avoid.
    """
    flag = "EXCESSIVE_COVERAGE"
    ecs = (pybedtools.BedTool(x["regions"]["callable"]).filter(lambda x: x.name == flag)
           for x in samples if "regions" in x)
    merge_ecs = _combine_regions(ecs, ref_regions).saveas("%s-ecmergeorig%s" % utils.splitext_plus(tmp_outfile))
    if len(merge_ecs) > 0:
        return merge_ecs.merge(d=min_n_size).filter(lambda x: x.stop - x.start > min_n_size).saveas(
            "%s-ecmerge%s" % utils.splitext_plus(tmp_outfile))
    else:
        return merge_ecs
Example #31
def _flatten_samples(samples, base_file):
    """Create a flattened JSON representation of data from the bcbio world map.
    """
    out_file = "%s-samples.json" % utils.splitext_plus(base_file)[0]
    flat_data = []
    for data in samples:
        cur_flat = {}
        for key_path in [["analysis"], ["description"], ["rgnames"], ["config", "algorithm"],
                         ["metadata"], ["genome_build"],
                         ["files"], ["reference"], ["genome_resources"], ["vrn_file"]]:
            cur_key = "__".join(key_path)
            for flat_key, flat_val in _to_cwldata(cur_key, tz.get_in(key_path, data)):
                cur_flat[flat_key] = flat_val
        flat_data.append(cur_flat)
    out = {}
    # list(d.keys()) lets the per-sample key views concatenate with operator.add on Python 3
    for key in sorted(list(set(reduce(operator.add, [list(d.keys()) for d in flat_data])))):
        out[key] = []
        for cur_flat in flat_data:
            out[key].append(cur_flat.get(key))
    with open(out_file, "w") as out_handle:
        json.dump(out, out_handle, sort_keys=True, indent=4, separators=(',', ': '))
        return out_file, _samplejson_to_inputs(out)
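
The JSON written out is column-oriented: one entry per flattened key path (joined with "__"), each holding a per-sample list. Roughly, for two samples (keys and values illustrative, the exact flattening depends on _to_cwldata):

{
    "description": ["sample1", "sample2"],
    "genome_build": ["hg38", "hg38"],
    "config__algorithm__aligner": ["bwa", "bwa"]
}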
Example #32
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data):
        if c not in conf_files:
            conf_files.append(c)
    out = []
    annodir = os.path.normpath(
        os.path.abspath(
            os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir,
                         "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
Example #33
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.
    """
    variant_regions = bedutils.merge_overlaps(utils.get_in(data, ("config", "algorithm", "variant_regions")),
                                              data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        with file_transaction(data, custom_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
        return custom_file, variant_regions is None
Example #34
def pon_to_bed(pon_file, out_dir, data):
    """Extract BED intervals from a GATK4 hdf5 panel of normal file.
    """
    out_file = os.path.join(
        out_dir, "%s-intervals.bed" %
        (utils.splitext_plus(os.path.basename(pon_file))[0]))
    if not utils.file_uptodate(out_file, pon_file):
        import h5py
        with file_transaction(data, out_file) as tx_out_file:
            with h5py.File(pon_file, "r") as f:
                with open(tx_out_file, "w") as out_handle:
                    intervals = f["original_data"]["intervals"]
                    for i in range(
                            len(intervals["transposed_index_start_end"][0])):
                        chrom = intervals["indexed_contig_names"][
                            intervals["transposed_index_start_end"][0][i]]
                        start = int(
                            intervals["transposed_index_start_end"][1][i]) - 1
                        end = int(
                            intervals["transposed_index_start_end"][2][i])
                        out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return out_file
Example #35
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not utils.file_exists(gemini_db):
        if not vcfutils.vcf_has_variants(gemini_vcf):
            return None
        with file_transaction(data, gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            load_opts = ""
            if "gemini_allvariants" not in dd.get_tools_on(data):
                load_opts += " --passonly"
            # For small test files, skip gene table loading which takes a long time
            if _is_small_vcf(gemini_vcf):
                load_opts += " --skip-gene-tables"
            if "/test_automated_output/" in gemini_vcf:
                load_opts += " --test-mode"
            # Skip CADD or gerp-bp if neither are loaded
            gemini_dir = install.get_gemini_dir(data)
            for skip_cmd, check_file in [("--skip-cadd", "whole_genome_SNVs.tsv.compressed.gz")]:
                if not os.path.exists(os.path.join(gemini_dir, check_file)):
                    load_opts += " %s" % skip_cmd
            # skip gerp-bp which slows down loading
            load_opts += " --skip-gerp-bp "
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            tmpdir = os.path.dirname(tx_gemini_db)
            eanns = _get_effects_flag(data)
            # Apply custom resource specifications, allowing use of alternative annotation_dir
            resources = config_utils.get_resources("gemini", data["config"])
            gemini_opts = " ".join([str(x) for x in resources["options"]]) if resources.get("options") else ""
            exports = utils.local_path_export()
            cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                   "-v {gemini_vcf} {eanns} --cores {num_cores} "
                   "--tempdir {tmpdir} {tx_gemini_db}")
            cmd = cmd.format(**locals())
            do.run(cmd, "Create gemini database for %s" % gemini_vcf, data)
            if ped_file:
                cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
                do.run(cmd, "Add PED file to gemini database", data)
    return gemini_db
Example #36
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C
    """
    if dd.get_background_cnv_reference(data, "seq2c"):
        bed_file = _background_to_bed(
            dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)
    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)
    else:
        annotated_file = bed_file

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if not utils.file_uptodate(ready_file, annotated_file):
        bed = bt.BedTool(annotated_file)
        if col_num > 4 and col_num != 8:
            bed = bed.cut(range(4))
        bed = bed.filter(lambda x: x.name not in ["", ".", "-"])
        with file_transaction(data, ready_file) as tx_out_file:
            bed.saveas(tx_out_file)
        logger.debug("Saved Seq2C clean annotated ready input BED into " +
                     ready_file)

    return ready_file
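
A hypothetical call; the bcbio sample dictionary `data` is assumed to carry either a structural variant BED or variant regions:

# Returns a cleaned, gene-annotated BED ready for Seq2C,
# or None when no usable regions are configured.
seq2c_bed = prep_seq2c_bed(data)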
Example #37
def get_refs(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.
    """
    out = {}
    name_remap = {"samtools": "fasta"}
    if genome_build:
        galaxy_config = _get_galaxy_tool_info(galaxy_base)
        for name in [x for x in (aligner, "samtools") if x]:
            galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
            loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, galaxy_config["tool_data_path"],
                                                        galaxy_base)
            cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                                               galaxy_config)
            base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
            if os.path.isdir(base):
                indexes = glob.glob(os.path.join(base, "*"))
            else:
                indexes = glob.glob("%s*" % utils.splitext_plus(base)[0])
            out[name_remap.get(name, name)] = {"indexes": indexes}
            if os.path.exists(base) and os.path.isfile(base):
                out[name_remap.get(name, name)]["base"] = base
    return out
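
The returned structure looks roughly like this; the genome build, aligner, and paths are illustrative assumptions:

refs = get_refs("GRCh37", "bwa", "/path/to/galaxy")
# {"fasta": {"base": ".../seq/GRCh37.fa", "indexes": [".../GRCh37.fa.fai", ...]},
#  "bwa": {"indexes": [".../bwa/GRCh37.fa.bwt", ...]}}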
Example #38
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes low complexity regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.
    """
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0],
                                     "-%s" % chrom if chrom else "")
    if not utils.file_exists(out_file) and not utils.file_exists(out_file +
                                                                 ".gz"):
        with shared.bedtools_tmpdir(items[0]):
            # Get a bedtool for the full region if no variant regions
            want_bedtool = callable.get_ref_bedtool(
                tz.get_in(["reference", "fasta", "base"], items[0]),
                items[0]["config"], chrom)
            if chrom:
                want_bedtool = pybedtools.BedTool(
                    shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom,
                                               items[0]))
            lcr_bed = shared.get_lcr_bed(items)
            if lcr_bed:
                want_bedtool = want_bedtool.subtract(
                    pybedtools.BedTool(lcr_bed))
            sv_exclude_bed = _get_sv_exclude_file(items)
            if sv_exclude_bed and len(want_bedtool) > 0:
                want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
            want_bedtool = pybedtools.BedTool(
                shared.remove_highdepth_regions(want_bedtool.saveas().fn,
                                                items))
            with file_transaction(items[0], out_file) as tx_out_file:
                full_bedtool = callable.get_ref_bedtool(
                    tz.get_in(["reference", "fasta", "base"], items[0]),
                    items[0]["config"])
                if len(want_bedtool) > 0:
                    full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
                else:
                    full_bedtool.saveas(tx_out_file)
    return out_file
Example #39
def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 1Mb.

    Larger sections have high memory usage on VarDictJava and failures
    on VarDict. This splits the input BED file into smaller overlapping
    windows to avoid these issues. Downstream VarDict merging resolves
    any variants called across window boundaries.
    """
    max_size = 1e6
    overlap_size = 250
    def _has_larger_regions(f):
        return any(r.stop - r.start > max_size for r in pybedtools.BedTool(f))
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if _has_larger_regions(in_file):
            with file_transaction(data, out_file) as tx_out_file:
                pybedtools.BedTool().window_maker(w=max_size,
                                                  s=max_size - overlap_size,
                                                  b=pybedtools.BedTool(in_file)).saveas(tx_out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
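
To illustrate the windowing arithmetic, a sketch using pybedtools directly: with a 1Mb maximum and a 250bp overlap, a hypothetical 2.5Mb interval splits into three overlapping windows:

import pybedtools

region = pybedtools.BedTool("chr1\t0\t2500000\n", from_string=True)
windows = pybedtools.BedTool().window_maker(w=1000000, s=999750, b=region)
# -> chr1:0-1000000, chr1:999750-1999750, chr1:1999500-2500000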
Example #40
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items,
                                   ref_file, assoc_files.get("dbsnp"),
                                   region, out_file)
        with file_transaction(items[0], out_file) as tx_out_file:
            params += ["-T", "UnifiedGenotyper",
                       "-o", tx_out_file,
                       "-ploidy", (str(ploidy.get_ploidy(items, region))
                                   if broad_runner.gatk_type() == "restricted" else "2"),
                       "--genotype_likelihoods_model", "BOTH"]
            resources = config_utils.get_resources("gatk", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example #41
def _prep_items_from_base(base, in_files, force_single=False):
    """Prepare a set of configuration items for input files.
    """
    details = []
    in_files = _expand_dirs(in_files, KNOWN_EXTS)
    in_files = _expand_wildcards(in_files)

    for i, (ext, files) in enumerate(
            itertools.groupby(
                in_files,
                lambda x: KNOWN_EXTS.get(utils.splitext_plus(x)[-1].lower()))):
        if ext == "bam":
            for f in files:
                details.append(_prep_bam_input(f, i, base))
        elif ext in ["fastq", "fq", "fasta"]:
            files = list(files)
            for fs in fastq.combine_pairs(files, force_single):
                details.append(_prep_fastq_input(fs, base))
        else:
            print("Ignoring unexpected input file types %s: %s" %
                  (ext, list(files)))
    return details
Example #42
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.
    """
    if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
        rm_file = vcfutils.bgzip_and_index(rm_file,
                                           data["config"],
                                           out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(
            base_dir, "%s-%s.vcf.gz" % (base, dd.get_sample_name(data)))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    else:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file,
                                            data["config"],
                                            out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir,
                                         data)
    return vrn_file, rm_file, interval_bed
Example #43
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(
            out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            reader = cyvcf2.VCF(str(in_file))
            reader.add_filter_to_header(
                {'ID': 'Somatic', 'Description': 'Variant called as Somatic'})
            #with contextlib.closing(cyvcf2.Writer(tx_out_file, reader)) as writer:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write(reader.raw_header)
                for rec in reader:
                    rec = _update_germline_filters(rec)
                    out_handle.write(str(rec))
                    #writer.write_record(rec)
    return out_file
Example #44
def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.
    """
    if isinstance(xs, dict):
        for k, v in xs.items():
            xs[k] = unpack_tarballs(v, data, use_subdir)
    elif isinstance(xs, (list, tuple)):
        xs = [unpack_tarballs(x, data, use_subdir) for x in xs]
    elif isinstance(xs, basestring):
        if os.path.isfile(xs.encode("utf-8",
                                    "ignore")) and xs.endswith("-wf.tar.gz"):
            if use_subdir:
                tarball_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "wf-inputs"))
            else:
                tarball_dir = dd.get_work_dir(data)
            out_dir = os.path.join(
                tarball_dir,
                os.path.basename(xs).replace("-wf.tar.gz",
                                             "").replace("--", os.path.sep))
            if not os.path.exists(out_dir):
                with utils.chdir(tarball_dir):
                    with tarfile.open(xs, "r:gz") as tar:
                        tar.extractall()
            assert os.path.exists(out_dir), out_dir
            # Default to representing output directory
            xs = out_dir
            # Look for aligner indices
            for fname in os.listdir(out_dir):
                if fname.endswith(DIR_TARGETS):
                    xs = os.path.join(out_dir, fname)
                    break
                elif fname.endswith(BASENAME_TARGETS):
                    base = os.path.join(
                        out_dir,
                        utils.splitext_plus(os.path.basename(fname))[0])
                    xs = glob.glob("%s*" % base)
                    break
    return xs
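
The naming convention this relies on: a tarball named bwa--GRCh37-wf.tar.gz (a made-up example) unpacks into bwa/GRCh37 under the inputs directory:

import os

name = "bwa--GRCh37-wf.tar.gz"  # illustrative tarball name
out_dir_name = name.replace("-wf.tar.gz", "").replace("--", os.path.sep)
# -> "bwa/GRCh37" on POSIX systems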
Example #45
def _merge_and_bgzip(orig_files, out_file, base_file, ext=""):
    """Merge a group of gzipped input files into a final bgzipped output.

    Also handles providing unique names for each input file to avoid
    collisions on multi-region output. Handles renaming with awk magic from:
    https://www.biostars.org/p/68477/
    """
    assert out_file.endswith(".gz")
    full_file = out_file.replace(".gz", "")
    run_file = "%s-merge.bash" % utils.splitext_plus(base_file)[0]

    cmds = ["set -e\n"]
    for i, fname in enumerate(orig_files):
        cmd = ("""zcat %s | awk '{print (NR%%4 == 1) ? "@%s_" ++i "%s" : $0}' >> %s\n"""
               % (fname, i, ext, full_file))
        cmds.append(cmd)
    cmds.append("bgzip -f %s\n" % full_file)

    with open(run_file, "w") as out_handle:
        out_handle.write("".join("".join(cmds)))
    do.run([do.find_bash(), run_file], "Rename, merge and bgzip CRAM fastq output")
    assert os.path.exists(out_file) and not _is_gzip_empty(out_file)
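
To make the awk renaming concrete, a pure-Python sketch of the header rewrite for the file at index 1 with ext "/1" (the read names are made up):

lines = ["@SRR100.7 orig/1", "ACGT", "+", "IIII",
         "@SRR100.8 orig/1", "TTTT", "+", "IIII"]
counter = 0
for n, line in enumerate(lines, start=1):
    if n % 4 == 1:  # FASTQ header lines
        counter += 1
        line = "@1_%d/1" % counter
    print(line)  # headers become @1_1/1 and @1_2/1; other lines pass through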
Example #46
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.
    """
    if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
        rm_file = vcfutils.bgzip_and_index(rm_file,
                                           data["config"],
                                           out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_file = os.path.join(
            base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file +
                                                              ".tbi"):
        vrn_file = vcfutils.bgzip_and_index(vrn_file,
                                            data["config"],
                                            out_dir=base_dir)

    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir,
                                         data)
    return vrn_file, rm_file, interval_bed
Example #47
def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file, data):
    """Retrieve summary of coverage in a region.
    Requires a positive mapping quality at each position, matching GATK's
    CallableLoci defaults.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(ref_file, data["config"])
            sambamba = config_utils.get_program("sambamba", data["config"])
            bedtools = config_utils.get_program("bedtools", data["config"])
            cmd = ("{sambamba} view -F 'mapping_quality > 0' -L {region_file} -f bam -l 1 {in_bam} | "
                   "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                   "> {tx_out_file}")
            do.run(cmd.format(**locals()), "bedtools genomecov: %s" % (str(region)), data)
    # Empty output file: no coverage in the region, so write explicit zero-depth entries
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, 0))
    return out_file
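
The genomecov -bga output is a four-column BED of constant-depth runs, with zero-coverage stretches included. A sketch of reading it back (the path and values are illustrative):

with open("sample-genomecov.bed") as handle:
    for line in handle:
        chrom, start, end, depth = line.rstrip("\n").split("\t")
        # e.g. ("chr1", "0", "9999", "0") for an uncovered run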
Example #48
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions
    """
    methods = {
        "freebayes": "freebayes",
        "platypus": "platypus",
        "gatk-haplotype": "gatk"
    }
    gvcf_type = methods.get(dd.get_variantcaller(data))
    if gvcf_type:
        out_file = os.path.join(
            out_dir, "%s-gcvf-coverage.bed" %
            utils.splitext_plus(os.path.basename(vrn_file))[0])
        if not utils.file_uptodate(out_file, vrn_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                       "| bedtools merge > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Convert gVCF to BED file of callable regions")
        return out_file
Example #49
def merge_overlaps(in_file, data):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(
            bedprep_dir, "%s-merged.bed" %
            (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                cmd = "{bedtools} merge -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
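
The docstring's example, run through pybedtools for illustration (BED coordinates are zero-based half-open):

import pybedtools

bed = pybedtools.BedTool("1\t0\t100\n1\t89\t100\n", from_string=True)
print(bed.merge())  # 1    0    100 -- the overlapping intervals collapse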
Example #50
def get_region_bed(region, items, out_file, want_gzip=True):
    """Retrieve BED file of regions to analyze, either single or multi-region.
    """
    variant_regions = bedutils.merge_overlaps(
        bedutils.population_variant_regions(items), items[0])
    target = shared.subset_variant_regions(variant_regions, region, out_file,
                                           items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" %
                         (region, target))
    if not isinstance(target, basestring) or not os.path.isfile(target):
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    out_file = bedutils.merge_overlaps(target,
                                       items[0],
                                       out_dir=os.path.dirname(out_file))
    if want_gzip:
        out_file += ".gz"
    return out_file
Example #51
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                input_beds = filter(lambda x: x is not None,
                                    [_create_bed(c, out_file) for c in calls])
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(input_beds):
                            out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True).merge(nms=True).saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
Example #52
def _prep_cnv_file(in_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.
    """
    out_file = os.path.join(
        work_dir, "%s-%s-prep.csv" %
        (utils.splitext_plus(os.path.basename(in_file))[0], svcaller))
    autosomal_chroms = _get_autosomal_chroms()
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(
                        ["chrom", "start", "end", "num.mark", "seg.mean"])
                    reader.next()  # header
                    for chrom, start, end, _, log2, probes in reader:
                        if chrom in autosomal_chroms:
                            writer.writerow([
                                _to_ucsc_style(chrom), start, end, probes, log2
                            ])
    return out_file
Example #53
def _cnn_tranch_filtering(in_file, vrn_files, tensor_type, data):
    """Filter CNN scored VCFs in tranches using standard SNP and Indel truth sets.
    """
    out_file = "%s-filter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        if "train_hapmap" not in vrn_files:
            raise ValueError("CNN filtering requires HapMap training inputs: %s" % vrn_files)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "FilterVariantTranches", "--variant", in_file,
                      "--output", tx_out_file,
                      "--snp-truth-vcf", vrn_files["train_hapmap"],
                      "--indel-truth-vcf", vrn_files["train_indels"]]
            if tensor_type == "reference":
                params += ["--info-key", "CNN_1D", "--tranche", "99"]
            else:
                assert tensor_type == "read_tensor"
                params += ["--info-key", "CNN_2D", "--tranche", "99"]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
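
For reference, the parameter list this assembles for a 1D model; the file paths are placeholders:

params = ["-T", "FilterVariantTranches", "--variant", "in.vcf.gz",
          "--output", "out-filter.vcf.gz",
          "--snp-truth-vcf", "hapmap.vcf.gz",
          "--indel-truth-vcf", "mills.vcf.gz",
          "--info-key", "CNN_1D", "--tranche", "99"]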
Example #54
def _freebayes_cutoff(in_file, data):
    """Perform filtering of FreeBayes results, flagging low confidence calls.

    Filters using cutoffs on low depth based on Meynert et al's work modeling sensitivity
    of homozygote and heterozygote calling on depth:

    http://www.ncbi.nlm.nih.gov/pubmed/23773188

    and high depth heterozygote SNP filtering based on Heng Li's work
    evaluating variant calling artifacts:

    http://arxiv.org/abs/1404.0929

    Tuned based on NA12878 call comparisons to Genome in a Bottle reference genome.
    """
    if not vcfutils.vcf_has_variants(in_file):
        base, ext = utils.splitext_plus(in_file)
        out_file = "{base}-filter{ext}".format(**locals())
        if not utils.file_exists(out_file):
            shutil.copy(in_file, out_file)
        if out_file.endswith(".vcf.gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file

    depth_thresh, qual_thresh = None, None
    if _do_high_depth_filter(data):
        stats = _calc_vcf_stats(in_file)
        if stats["avg_depth"] > 0:
            depth_thresh = int(
                math.ceil(stats["avg_depth"] +
                          3 * math.pow(stats["avg_depth"], 0.5)))
            qual_thresh = depth_thresh * 2.0  # Multiplier from default GATK QD cutoff filter
    filters = (
        '(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10))) || '
        '(AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))')
    if depth_thresh:
        filters += ' || (%QUAL < {qual_thresh} && max(FORMAT/DP) > {depth_thresh} && AF[0] <= 0.5)'.format(
            **locals())
    return cutoff_w_expression(in_file, filters, data, name="FBQualDepth")
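
A worked example of the high-depth cutoff arithmetic, assuming an average depth of 25x:

import math

avg_depth = 25
depth_thresh = int(math.ceil(avg_depth + 3 * math.pow(avg_depth, 0.5)))  # 40
qual_thresh = depth_thresh * 2.0  # 80.0, via the GATK QD-derived multiplier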
Example #55
def _filter_paired(tumor, normal, out_file, reference, data):
    """filter paired vcf file with GATK
    :param    tumor: (str) sample name for tumor
    :param    normal: (str) sample name for normal
    :param    out_file: (str) final vcf file
    :param    reference: (str) genome in fasta format
    :param    data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    in_file = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, in_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        params = [
            "-T", "SomaticPindelFilter", "-V", in_file, "-o", tx_out_file,
            "-TID", tumor, "-NID", normal, "-R", reference
        ]
        jvm_opts = broad.get_gatk_framework_opts(config)
        cmd = [config_utils.get_program("gatk-framework", config)
               ] + jvm_opts + params
        do.run(cmd, "Filter pindel variants")
    return out_file
Example #56
def hard_w_expression(vcf_file, expression, data, name="+", filterext="",
                      extra_cmd=""):
    """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                variant_regions = utils.get_in(data, ("config", "algorithm", "variant_regions"))
                intervals = ("-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                             if variant_regions else "")
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Hard filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
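
A hypothetical call using the docstring's expression; matching records receive the named soft filter rather than being removed:

filtered_vcf = hard_w_expression(vcf_file, "%QUAL < 20 || DP < 4", data,
                                 name="lowQualDepth")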
Example #57
def _setup_variant_regions(data, out_dir):
    """Ensure we have variant regions for calling, using transcript if not present.

    Respects noalt_calling by removing additional contigs to improve
    speeds.
    """
    vr_file = dd.get_variant_regions(data)
    if not vr_file:
        vr_file = regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = set([c.name for c in ref.file_contigs(dd.get_ref_file(data))])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep")),
                            "%s-rnaseq_clean.bed" % utils.splitext_plus(os.path.basename(vr_file))[0])
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for r in pybedtools.BedTool(vr_file):
                        if r.chrom in contigs:
                            if chromhacks.is_nonalt(r.chrom):
                                out_handle.write(str(r))
    data = dd.set_variant_regions(data, out_file)
    return data
Example #58
def fix_somatic_calls(in_file, config):
    """Fix somatic variant output, standardize it to the SOMATIC flag.
    """
    if vcf is None:
        raise ImportError("Require PyVCF for manipulating cancer VCFs")

    # HACK: Needed to replicate the structure used by PyVCF
    Info = namedtuple('Info', ['id', 'num', 'type', 'desc'])
    somatic_info = Info(id='SOMATIC', num=0, type='Flag', desc='Somatic event')

    # NOTE: PyVCF will write an uncompressed VCF
    base, ext = utils.splitext_plus(in_file)
    name = "somaticfix"
    out_file = "{0}-{1}{2}".format(base, name, ".vcf")

    if utils.file_exists(in_file):
        reader = vcf.VCFReader(filename=in_file)
        # Add info to the header of the reader
        reader.infos["SOMATIC"] = somatic_info

        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "wb") as handle:
                writer = vcf.VCFWriter(handle, template=reader)
                for record in reader:
                    # Handle FreeBayes
                    if "VT" in record.INFO:
                        if record.INFO["VT"] == "somatic":
                            record.add_info("SOMATIC", True)
                        # Discard old record
                        del record.INFO["VT"]

                    writer.write_record(record)

        # Re-compress the file
        out_file = bgzip_and_index(out_file, config)
        _move_vcf(in_file, "{0}.orig".format(in_file))
        _move_vcf(out_file, in_file)
        with open(out_file, "w") as out_handle:
            out_handle.write("Moved to {0}".format(in_file))
Example #59
def dedup_bam(in_bam, data):
    """Perform non-stream based deduplication of BAM input files using biobambam.
    """
    if _check_dedup(data):
        out_file = "%s-dedup%s" % utils.splitext_plus(in_bam)
        if not utils.file_exists(out_file):
            with tx_tmpdir(data) as tmpdir:
                with file_transaction(data, out_file) as tx_out_file:
                    bammarkduplicates = config_utils.get_program(
                        "bammarkduplicates", data["config"])
                    base_tmp = os.path.join(
                        tmpdir,
                        os.path.splitext(os.path.basename(tx_out_file))[0])
                    cores, mem = _get_cores_memory(data, downscale=3)
                    cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                           "markthreads={cores} I={in_bam} O={tx_out_file}")
                    do.run(cmd.format(**locals()),
                           "De-duplication with biobambam")
        bam.index(out_file, data["config"])
        return out_file
    else:
        return in_bam
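
Illustrative usage, assuming biobambam's bammarkduplicates is available and deduplication is enabled in the configuration:

# Writes sample-sort-dedup.bam plus an index; returns the input BAM
# unchanged when deduplication is turned off.
deduped_bam = dedup_bam("sample-sort.bam", data)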
Example #60
def _prioritize_plot_regions(region_bt, data, out_dir=None):
    """Avoid plotting large numbers of regions due to speed issues. Prioritize most interesting.

    XXX For now, just removes larger regions and avoids plotting thousands of regions.
    Longer term we'll insert biology-based prioritization.
    """
    max_plots = 1000
    max_size = 100 * 1000  # 100kb
    out_file = "%s-priority%s" % utils.splitext_plus(region_bt.fn)
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    num_plots = 0
    if not utils.file_uptodate(out_file, region_bt.fn):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for r in region_bt:
                    if r.stop - r.start < max_size:
                        if num_plots < max_plots:
                            num_plots += 1
                            out_handle.write("%s\t%s\t%s\n" %
                                             (r.chrom, r.start, r.stop))
    return out_file