def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"],
                                                    is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    data, items = _get_batch_representative(items, "vrn_file")
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get("vrn_file")
    data = _symlink_to_workdir(data, ["vrn_file"])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get("align_bam") and data.get("vrn_file"):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data)
        if ann_vrn_file:
            data["vrn_file"] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data),
                                                   orig_items)
        logger.info("Filtering for %s" % cur_name)
        data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data),
                                              tz.get_in(("genome_resources", "variation"), data, {}),
                                              data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items)
        logger.info("Germline extraction for %s" % cur_name)
        data = germline.extract(data, orig_items)
        data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data), dd.get_ref_file(data),
                                 data, orig_items)
    if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file):
        data["vrn_file"] = orig_vrn_file
    return [[data]]
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        region_files = []
        regions = []
        for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
            str_region = "_".join([str(x) for x in cur_region])
            region_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                       "variation", "rnaseq",
                                                                       "gatk-haplotype", "regions")),
                                       "%s-%s-gatk-haplotype.vcf.gz" % (dd.get_sample_name(data),
                                                                        str_region))
            region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                                dd.get_ref_file(data), {},
                                                region=cur_region, out_file=region_file)
            region_files.append(region_file)
            regions.append(cur_region)
        out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                                 dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def get_noalt_contigs(data):
    """Retrieve contigs without alternatives as defined in bwa *.alts files.

    If no alt files are present (when we're not aligning with bwa), fall back
    to a standard set of alts based on hg38 -- anything with HLA, _alt or
    _decoy in the name.
    """
    alts = set([])
    alt_files = [f for f in tz.get_in(["reference", "bwa", "indexes"], data, [])
                 if f.endswith("alt")]
    if alt_files:
        for alt_file in alt_files:
            with open(alt_file) as in_handle:
                for line in in_handle:
                    if not line.startswith("@"):
                        alts.add(line.split()[0].strip())
    else:
        for contig in ref.file_contigs(dd.get_ref_file(data)):
            if ("_alt" in contig.name or "_decoy" in contig.name
                    or contig.name.startswith("HLA-") or ":" in contig.name):
                alts.add(contig.name)
    return [c for c in ref.file_contigs(dd.get_ref_file(data)) if c.name not in alts]
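# A minimal, hedged illustration of the bwa *.alt parsing above. Non-header
# lines of an .alt file are SAM-like records whose first column is the alt
# contig name; the record below uses a real hg38 alt contig name but invented
# coordinates, purely for illustration.
example_alt_line = "chr6_GL000250v2_alt\t0\tchr6\t28734408\t60\t4672984M\t*\t0\t0\t*\t*"
assert not example_alt_line.startswith("@")
assert example_alt_line.split()[0].strip() == "chr6_GL000250v2_alt"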
def run_vcfanno(vcf, anno_type, data):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key of the
    algorithm section of the datadict, skipping if the files cannot be found.
    """
    UNSUPPORTED_TYPE_MESSAGE = ("{anno_type} is not a supported vcf annotation type with vcfanno. "
                                "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(**locals()))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = ("The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf
    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    # relative paths in the conf file resolve against the vcfanno directory
    basepath = annodir
    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
def compare_to_rm(data):
    """Compare final variant calls against reference materials of known calls.
    """
    toval_data = _get_validate(data)
    if toval_data:
        if isinstance(toval_data["vrn_file"], (list, tuple)):
            raise NotImplementedError("Multiple input files for validation: %s" %
                                      toval_data["vrn_file"])
        else:
            vrn_file = os.path.abspath(toval_data["vrn_file"])
        rm_file = normalize_input_path(toval_data["config"]["algorithm"]["validate"], toval_data)
        rm_interval_file = _gunzip(normalize_input_path(
            toval_data["config"]["algorithm"].get("validate_regions"), toval_data), toval_data)
        caller = _get_caller(toval_data)
        sample = dd.get_sample_name(toval_data)
        base_dir = utils.safe_makedir(os.path.join(toval_data["dirs"]["work"],
                                                   "validate", sample, caller))
        rm_file = naming.handle_synonyms(rm_file, dd.get_ref_file(data), data["genome_build"],
                                         base_dir, data)
        rm_interval_file = (naming.handle_synonyms(rm_interval_file, dd.get_ref_file(data),
                                                   data["genome_build"], base_dir, data)
                            if rm_interval_file else None)
        vmethod = tz.get_in(["config", "algorithm", "validate_method"], data, "rtg")
        if vmethod == "rtg":
            eval_files = _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, toval_data)
            data["validate"] = _rtg_add_summary_file(eval_files, base_dir, toval_data)
        elif vmethod == "bcbio.variation":
            data["validate"] = _run_bcbio_variation(vrn_file, rm_file, rm_interval_file, base_dir,
                                                    sample, caller, toval_data)
    return [[data]]
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    from bcbio.bam import callable
    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif dd.get_sample_callable(data):
        return dd.get_sample_callable(data)
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif data.get("work_bam_callable"):
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]
    elif tz.get_in(["config", "algorithm", "callable_regions"], data):
        return tz.get_in(["config", "algorithm", "callable_regions"], data)
    elif tz.get_in(["config", "algorithm", "variant_regions"], data):
        return tz.get_in(["config", "algorithm", "variant_regions"], data)
def run_cluster(*data):
    """Run seqcluster cluster to detect smallRNA clusters.
    """
    sample = data[0][0]
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(sample),
                           dd.get_srna_gtf_file(sample))
    sample["report"] = _report(sample, dd.get_ref_file(sample))
    sample["seqcluster"] = out_dir
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    novel_db = mirdeep.run(data)
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def run_region(data, region, vrn_files, out_file):
    """Perform variant calling on gVCF inputs in a specific genomic region.
    """
    broad_runner = broad.runner_from_config(data["config"])
    if broad_runner.gatk_type() == "gatk4":
        genomics_db = _run_genomicsdb_import(vrn_files, region, out_file, data)
        return _run_genotype_gvcfs_genomicsdb(genomics_db, region, out_file, data)
    else:
        vrn_files = _batch_gvcfs(data, region, vrn_files, dd.get_ref_file(data), out_file)
        return _run_genotype_gvcfs_gatk3(data, region, vrn_files, dd.get_ref_file(data), out_file)
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` "
                                 "other than coordinate: %s" % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data),
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration.\n" +
                         fastq1 +
                         "\nIs the path to the file correct, or is the file empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
def run_cluster(*data):
    """Run seqcluster cluster to detect smallRNA clusters.
    """
    work_dir = dd.get_work_dir(data[0][0])
    out_dir = os.path.join(work_dir, "seqcluster", "cluster")
    out_dir = os.path.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    cluster_dir = _cluster(bam_file, prepare_dir, out_dir, dd.get_ref_file(data[0][0]),
                           dd.get_srna_gtf_file(data[0][0]))
    report_file = _report(data[0][0], dd.get_ref_file(data[0][0]))
    for sample in data:
        sample[0]["seqcluster"] = out_dir
    return data
def postprocess_variants(items):
    """Provide post-processing of variant calls: filtering and effects annotation.
    """
    vrn_key = "vrn_file"
    if not isinstance(items, dict):
        items = [utils.to_single_data(x) for x in items]
        if "vrn_file_joint" in items[0]:
            vrn_key = "vrn_file_joint"
    data, items = _get_batch_representative(items, vrn_key)
    items = cwlutils.unpack_tarballs(items, data)
    data = cwlutils.unpack_tarballs(data, data)
    cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False))
    logger.info("Finalizing variant calls: %s" % cur_name)
    orig_vrn_file = data.get(vrn_key)
    data = _symlink_to_workdir(data, [vrn_key])
    data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"])
    if data.get(vrn_key):
        logger.info("Calculating variation effects for %s" % cur_name)
        ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data)
        if ann_vrn_file:
            data[vrn_key] = ann_vrn_file
        if vrn_stats:
            data["vrn_stats"] = vrn_stats
        orig_items = _get_orig_items(items)
        logger.info("Annotate VCF file: %s" % cur_name)
        data[vrn_key] = annotation.finalize_vcf(data[vrn_key],
                                                get_variantcaller(data, require_bam=False),
                                                orig_items)
        if cwlutils.is_cwl_run(data):
            logger.info("Annotate with population level variation data")
            ann_file = population.run_vcfanno(data[vrn_key], data)
            if ann_file:
                data[vrn_key] = ann_file
        logger.info("Filtering for %s" % cur_name)
        data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data),
                                           tz.get_in(("genome_resources", "variation"), data, {}),
                                           data, orig_items)
        logger.info("Prioritization for %s" % cur_name)
        prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items)
        if prio_vrn_file != data[vrn_key]:
            data[vrn_key] = prio_vrn_file
            logger.info("Germline extraction for %s" % cur_name)
            data = germline.extract(data, orig_items)
        if dd.get_align_bam(data):
            data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data),
                                     data, orig_items)
    if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file):
        data[vrn_key] = orig_vrn_file
    return [[data]]
def check_bed_contigs(in_file, data):
    """Ensure BED file contigs match the reference genome.
    """
    if not dd.get_ref_file(data):
        return
    contigs = set([])
    with utils.open_gzipsafe(in_file) as in_handle:
        for line in in_handle:
            if not line.startswith(("#", "track", "browser")) and line.strip():
                contigs.add(line.split()[0])
    ref_contigs = set([x.name for x in ref.file_contigs(dd.get_ref_file(data))])
    if len(contigs - ref_contigs) / float(len(contigs)) > 0.25:
        raise ValueError("Contigs in BED file %s not in reference genome:\n %s\n"
                         % (in_file, list(contigs - ref_contigs)) +
                         "This is typically due to chr1 versus 1 differences in "
                         "BED file and reference.")
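# Toy illustration (invented contig sets) of the mismatch check_bed_contigs
# guards against: a "chr1"-style BED against a "1"-style reference exceeds
# the 25% threshold and would raise the ValueError above.
bed_contigs = set(["chr1", "chr2"])
ref_contigs = set(["1", "2"])
assert len(bed_contigs - ref_contigs) / float(len(bed_contigs)) > 0.25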
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(os.path.basename(vrn_file))
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file,
                                              data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = glob.glob(os.path.normpath(
                os.path.join(os.path.dirname(ref_file), os.pardir, "coverage", "prioritize",
                             val + "*.bed.gz")))
            if len(installed_vals) == 0:
                raise ValueError("Configuration problem. Prioritization file not found for %s: %s"
                                 % (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target, lambda x: installed_val)
    return data
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                        "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data),
                        "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data),
                        "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data),
                        "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data,
                                                name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
def gatk_rnaseq_calling(data):
    """Use GATK to perform variant calling on RNA-seq data.
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
def _create_combined_fasta(data, out_dir):
    """If there are genomes to be disambiguated, create a FASTA file of all
    of the transcripts for all genomes.
    """
    items = disambiguate.split([data])
    fasta_files = []
    for i in items:
        odata = i[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if file_exists(out_file):
            fasta_files.append(out_file)
        else:
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
            fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file
    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
def gatk_splitreads(data):
    """Use GATK to split reads with Ns in the CIGAR string, hard clipping
    regions that end up in introns.
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    if dd.get_quality_format(data) == "illumina":
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    out_file = os.path.join(align_dir, "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"
    pair_file = pair_file if pair_file else ""
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            index_file = None
            # Skip trying to use indices now as they provide only slight speed-ups
            # and give inconsistent outputs in BAM headers
            # If a single index present, index_dir points to that
            # if index_dir and os.path.isfile(index_dir):
            #     index_dir = os.path.dirname(index_dir)
            #     index_file = os.path.join(index_dir, "%s-%s.mmi" % (dd.get_genome_build(data), preset))
            if not index_file or not os.path.exists(index_file):
                index_file = dd.get_ref_file(data)
            cmd = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                   "{fastq_file} {pair_file} | ")
            do.run(cmd.format(**locals()) + tobam_cl,
                   "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
def gatk_filter_rnaseq(vrn_file, data):
    """Apply the GATK best-practice RNA-seq filters, dropping clusters of
    variants within a 35 nucleotide window, high Fisher strand values and
    low quality by depth:
    https://software.broadinstitute.org/gatk/guide/article?id=3891

    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    mirbase = op.abspath(op.dirname(dd.get_mirbase_ref(data[0][0])))
    species = dd.get_species(data[0][0])
    hairpin = op.join(mirbase, "hairpin.fa")
    mature = op.join(mirbase, "mature.fa")
    rfam_file = op.join(mirbase, "Rfam_for_miRDeep.fa")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && {mirdeep2} {collapsed} {genome} {bam_file} {mature} none "
               "{hairpin} -f {rfam_file} -r simple -c -d -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(mature) and file_exists(rfam_file):
            do.run(cmd, "Running mirdeep2.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    The lenient flag allows AMBER runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
def _run_wham(inputs, background_bams):
    """Run WHAM on a defined set of inputs and targets.
    """
    out_file = os.path.join(_sv_workdir(inputs[0]), "%s-wham.vcf.gz" % dd.get_sample_name(inputs[0]))
    if not utils.file_exists(out_file):
        with file_transaction(inputs[0], out_file) as tx_out_file:
            coords = chromhacks.autosomal_or_x_coords(dd.get_ref_file(inputs[0]))
            parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": []}
            rs = run_multicore(_run_wham_coords,
                               [(inputs, background_bams, coord, out_file) for coord in coords],
                               inputs[0]["config"], parallel)
            rs = {coord: fname for (coord, fname) in rs}
            vcfutils.concat_variant_files([rs[c] for c in coords], tx_out_file, coords,
                                          dd.get_ref_file(inputs[0]), inputs[0]["config"])
    return out_file
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _bed_to_platypusin(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            cmd += ["--assemble=1"]
            # Adjust default filter thresholds to achieve similar
            # sensitivity/specificity to other callers
            cmd += ["--hapScoreThreshold", "10", "--scThreshold", "0.99",
                    "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
                    "--qdThreshold", "0", "--abThreshold", "0.0001",
                    "--minVarFreq", "0.0"]
            # Avoid filtering duplicates on high depth targeted regions where
            # we don't mark duplicates
            if any(not tz.get_in(["config", "algorithm", "mark_duplicates"], data, True)
                   for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = " | %s | vcfallelicprimitives | vcfstreamsort | bgzip -c > %s" % (
                vcfutils.fix_ambiguous_cl(), tx_out_file)
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def run_cluster(*data):
    """Run seqcluster cluster to detect smallRNA clusters.
    """
    sample = data[0][0]
    tools = dd.get_expression_caller(data[0][0])
    work_dir = dd.get_work_dir(sample)
    out_dir = op.join(work_dir, "seqcluster", "cluster")
    out_dir = op.abspath(safe_makedir(out_dir))
    prepare_dir = op.join(work_dir, "seqcluster", "prepare")
    bam_file = data[0][0]["work_bam"]
    if "seqcluster" in tools:
        sample["seqcluster"] = _cluster(bam_file, data[0][0]["seqcluster_prepare_ma"],
                                        out_dir, dd.get_ref_file(sample),
                                        dd.get_srna_gtf_file(sample))
        sample["report"] = _report(sample, dd.get_ref_file(sample))
    out_mirna = _make_isomir_counts(data, out_dir=op.join(work_dir, "mirbase"))
    if out_mirna:
        sample = dd.set_mirna_counts(sample, out_mirna[0])
        sample = dd.set_isomir_counts(sample, out_mirna[1])
    out_novel = _make_isomir_counts(data, "seqbuster_novel", op.join(work_dir, "mirdeep2"), "_novel")
    if out_novel:
        sample = dd.set_novel_mirna_counts(sample, out_novel[0])
        sample = dd.set_novel_isomir_counts(sample, out_novel[1])
    data[0][0] = sample
    return data
def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    Handles inputs with multiple repeated input files from batches.
    """
    unique_fnames = []
    for f in fnames:
        if f not in unique_fnames:
            unique_fnames.append(f)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    if len(unique_fnames) > 1:
        gemini_vcf = os.path.join(out_dir, "%s-%s.vcf.gz" % (name, caller))
        vrn_file_batch = None
        for variant in data.get("variants", []):
            if variant["variantcaller"] == caller and variant.get("vrn_file_batch"):
                vrn_file_batch = variant["vrn_file_batch"]
        if vrn_file_batch:
            utils.symlink_plus(vrn_file_batch, gemini_vcf)
            return gemini_vcf
        else:
            return vcfutils.merge_variant_files(unique_fnames, gemini_vcf,
                                                dd.get_ref_file(data), data["config"])
    else:
        gemini_vcf = os.path.join(out_dir, "%s-%s%s" % (name, caller,
                                                        utils.splitext_plus(unique_fnames[0])[1]))
        utils.symlink_plus(unique_fnames[0], gemini_vcf)
        return gemini_vcf
def get_analysis_intervals(data):
    """Retrieve analysis regions for the current variant calling pipeline.
    """
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    elif data.get("align_bam"):
        return callable.sample_callable_bed(data["align_bam"], dd.get_ref_file(data), data)
    elif data.get("work_bam"):
        return callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)
    elif data.get("work_bam_callable"):
        return callable.sample_callable_bed(data["work_bam_callable"], dd.get_ref_file(data), data)
    else:
        for key in ["callable_regions", "variant_regions"]:
            intervals = data["config"]["algorithm"].get(key)
            if intervals:
                return intervals
def sort_by_ref(vcf_file, data):
    """Sort a VCF file by genome reference and position, adding contig information.
    """
    out_file = "%s-prep.vcf.gz" % utils.splitext_plus(vcf_file)[0]
    if not utils.file_uptodate(out_file, vcf_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                    out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
            cat_cmd = "zcat" if vcf_file.endswith("vcf.gz") else "cat"
            cmd = ("{cat_cmd} {vcf_file} | grep -v ^##contig | bcftools annotate -h {header_file} | "
                   "vt sort -m full -o {tx_out_file} -")
            with utils.chdir(os.path.dirname(tx_out_file)):
                do.run(cmd.format(**locals()), "Sort VCF by reference")
    return bgzip_and_index(out_file, data["config"])
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key of the
    algorithm section of the datadict, skipping if the files cannot be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = ("The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
    return out_file
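# Hedged sketch of the directory layout the named-annotation branch above
# assumes, following the standard bcbio genome layout (paths invented): with
# a reference at <base>/GRCh37/seq/GRCh37.fa and an anno_type of "gemini",
# the lookup resolves to:
#   basepath = <base>/GRCh37
#   conffn   = <base>/GRCh37/config/vcfanno/gemini.conf
#   luafn    = <base>/GRCh37/config/vcfanno/gemini.lua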
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
def prep_recal(data):
    """Do pre-BQSR recalibration, calculation of recalibration tables.
    """
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Prepare BQSR tables with GATK: %s " % str(dd.get_sample_name(data)))
        dbsnp_file = tz.get_in(("genome_resources", "variation", "dbsnp"), data)
        if not dbsnp_file:
            logger.info("Skipping GATK BaseRecalibrator because no VCF file of known variants was found.")
            return data
        broad_runner = broad.runner_from_config(data["config"])
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dd.get_align_bam(data),
                                                     dd.get_ref_file(data), dd.get_platform(data),
                                                     dbsnp_file, dd.get_variant_regions(data), data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Prepare BQSR tables with sentieon: %s " % str(dd.get_sample_name(data)))
        data["prep_recal"] = sentieon.bqsr_table(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    return data
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark", "--spark-master", "local[%s]" % cores,
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr-recal-file", data["prep_recal"],
                          "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/chapmanb/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                do.run(cmd, "Calculate coverage: %s" % dd.get_sample_name(data))
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
def _normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
               remove_oldeffects=False):
    """Convert multi-allelic variants into single allelic.

    `vt normalize` has the -n flag passed (skipping reference checks) because
    of errors where the reference genome has non GATCN ambiguous bases. These
    are not supported in VCF, so you'll have a mismatch of N in VCF versus R
    (or other ambiguous bases) in the genome.
    """
    if remove_oldeffects:
        out_file = "%s-noeff-decompose%s" % utils.splitext_plus(in_file)
        old_effects = [a for a in ["CSQ", "ANN"] if a in cyvcf2.VCF(in_file)]
        if old_effects:
            clean_effects_cmd = " | bcftools annotate -x %s " % (",".join(["INFO/%s" % x
                                                                           for x in old_effects]))
        else:
            clean_effects_cmd = ""
    else:
        clean_effects_cmd = ""
        out_file = "%s-decompose%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        assert out_file.endswith(".vcf.gz")
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("gunzip -c " + in_file +
                   (" | bcftools view -f 'PASS,.'" if passonly else "") +
                   clean_effects_cmd +
                   (" | vcfallelicprimitives -t DECOMPOSED --keep-geno" if split_biallelic else "") +
                   " | sed 's/ID=AD,Number=./ID=AD,Number=R/'" +
                   " | vt decompose -s - " +
                   ((" | vt normalize -n -r " + ref_file + " - ") if normalize_indels else "") +
                   " | awk '{ gsub(\"./-65\", \"./.\"); print $0 }'" +
                   " | sed -e 's/Number=A/Number=1/g'" +
                   " | bgzip -c > " + tx_out_file)
            do.run(cmd, "Multi-allelic to single allele")
    return vcfutils.bgzip_and_index(out_file, data["config"])
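# Toy before/after (invented VCF records) showing what the `vt decompose -s`
# step in the pipeline above does to a multi-allelic site -- one ALT allele
# per output record, with genotypes rewritten accordingly:
#   before: chr1  100  .  A  G,T  .  .  .  GT  1/2
#   after:  chr1  100  .  A  G    .  .  .  GT  1/.
#           chr1  100  .  A  T    .  .  .  GT  ./1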
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = ["platypus", "callVariants",
                   "--regions=%s" % _subset_regions(region, out_file, items),
                   "--bamFiles=%s" % ",".join(align_bams),
                   "--refFile=%s" % dd.get_ref_file(items[0]),
                   "--output=-", "--logFileName", "/dev/null", "--verbosity=1"]
            resources = config_utils.get_resources("platypus", items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar
            # sensitivity/specificity to other callers. Currently not used after
            # doing more cross validation, as they increase false positives, which
            # seems to be a major disadvantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99",
            #               "--filteredReadsFrac", "0.9", "--rmsmqThreshold", "20",
            #               "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])
            # Avoid filtering duplicates on high depth targeted regions where
            # we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (" | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | "
                                "vcffixup - | vcfstreamsort | bgzip -c > %s" %
                                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                                 tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd, "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
def _fill_prioritization_targets(data):
    """Fill in globally installed files for prioritization.
    """
    ref_file = dd.get_ref_file(data)
    for target in [["svprioritize"], ["coverage"]]:
        val = tz.get_in(["config", "algorithm"] + target, data)
        if val and not os.path.exists(val):
            installed_vals = []
            # Check prioritize directory
            for ext in [".bed", ".bed.gz"]:
                installed_vals += glob.glob(os.path.normpath(
                    os.path.join(os.path.dirname(ref_file), os.pardir, "coverage", "prioritize",
                                 val + "*%s" % ext)))
            # Check sv-annotation directory for prioritize gene name lists
            if target[-1] == "svprioritize":
                installed_vals += glob.glob(os.path.join(
                    os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py"))),
                    "%s*" % os.path.basename(val)))
            if len(installed_vals) == 0:
                raise ValueError("Configuration problem. BED file not found for %s: %s" %
                                 (target, val))
            elif len(installed_vals) == 1:
                installed_val = installed_vals[0]
            else:
                # check for partial matches
                installed_val = None
                for v in installed_vals:
                    if v.endswith(val + ".bed.gz") or v.endswith(val + ".bed"):
                        installed_val = v
                        break
                # handle date-stamped inputs
                if not installed_val:
                    installed_val = sorted(installed_vals, reverse=True)[0]
            data = tz.update_in(data, ["config", "algorithm"] + target, lambda x: installed_val)
    return data
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation",
                                              "rnaseq", "vardict"))
    out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = get_R_exports()
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    data = _setup_variant_regions(data, out_dir)
    bed_file = dd.get_variant_regions(data)
    opts = " -c 1 -S 2 -E 3 -g 4 "
    resources = config_utils.get_resources("vardict", data)
    if resources.get("options"):
        opts += " ".join([str(x) for x in resources["options"]])
    cores = dd.get_num_cores(data)
    if cores and cores > 1:
        opts += " -th %s" % str(cores)
    with file_transaction(data, out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias}"
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    data = dd.set_vrn_file(data, out_file)
    return data
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.05  # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, data["work_bam"], vrs or callable_file,
                                             "variant_regions")
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and "
                    "%.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0,
                       offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
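# Worked example (invented numbers) of the off-target branch above: with 8%
# of reads falling off-target, 0.08 > offtarget_thresh (0.05) classifies the
# sample as capture-based "regional"; at or below the threshold it would be
# called "amplicon".
offtarget_pct, offtarget_thresh = 0.08, 0.05
assert ("regional" if offtarget_pct > offtarget_thresh else "amplicon") == "regional"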
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    This does not yet handle mapping over 1 -> chr1 issues since this requires
    a ton of search/replace which slows down conversion.
    """
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                               dd.get_sample_name(data)))
    # prefer an existing output named after the input BAM; otherwise fall back
    # to sample-based naming
    out_file = os.path.join(work_dir, "%s-noextras.bam" %
                            utils.splitext_plus(os.path.basename(in_bam))[0])
    if not utils.file_exists(out_file):
        out_file = os.path.join(work_dir, "%s-noextras.bam" % dd.get_sample_name(data))
    if not utils.file_uptodate(out_file, in_bam):
        with file_transaction(data, out_file) as tx_out_file:
            target_chroms = _target_chroms_and_header(in_bam, data)
            str_chroms = " ".join(target_chroms)
            rg_info = novoalign.get_rg_info(data["rgnames"])
            bcbio_py = sys.executable
            ref_file = dd.get_ref_file(data)
            local_bam = os.path.join(os.path.dirname(tx_out_file), os.path.basename(in_bam))
            cores = dd.get_cores(data)
            utils.symlink_plus(in_bam, local_bam)
            bam.index(local_bam, data["config"])
            cmd = ("samtools view -@ {cores} -h {local_bam} {str_chroms} | "
                   """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
                   """cleanbam.fix_header("{ref_file}")' | """
                   "samtools view -@ {cores} -u - | "
                   "samtools addreplacerg -@ {cores} -r '{rg_info}' -m overwrite_all "
                   "-O bam -o {tx_out_file} - ")
            do.run(cmd.format(**locals()),
                   "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    genome_cov_thresh = 0.40  # percent of genome covered for whole genome analysis
    offtarget_thresh = 0.10  # percent of offtarget reads required to be capture (not amplification) based
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            seq_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            seq_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = seq_size / float(total_size)
        if genome_cov_pct > genome_cov_thresh:
            cov_interval = "genome"
            offtarget_pct = 0.0
        else:
            offtarget_stat_file = dd.get_offtarget_stats(data)
            if not offtarget_stat_file:
                offtarget_pct = 0.0
            else:
                with open(offtarget_stat_file) as in_handle:
                    stats = yaml.safe_load(in_handle)
                offtarget_pct = stats["offtarget"] / float(stats["mapped"])
            if offtarget_pct > offtarget_thresh:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("Assigned coverage as '%s' with %.1f%% genome coverage and "
                    "%.1f%% offtarget coverage"
                    % (cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
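# Worked example (invented numbers) of the genome-coverage arithmetic above:
# 90 Mb of targeted sequence against a ~3.1 Gb genome is ~2.9% coverage, well
# under genome_cov_thresh (0.40), so classification falls through to the
# off-target check rather than being called "genome".
seq_size, total_size = 90e6, 3.1e9
genome_cov_pct = seq_size / float(total_size)  # ~0.029
assert genome_cov_pct <= 0.40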
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if tovcf_script:
        out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
        out_nogzip = out_file.replace(".vcf.gz", ".vcf")
        raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
        if not utils.file_exists(out_file):
            if not utils.file_exists(raw_file):
                with file_transaction(items[0], raw_file) as tx_raw_file:
                    cmd = [sys.executable, tovcf_script, "-c", sconfig_file,
                           "-f", dd.get_ref_file(items[0]), "-t", "LUMPY",
                           "-b", bedpe_file, "-o", tx_raw_file]
                    do.run(cmd, "Convert lumpy bedpe output to VCF")
            clean_file = _clean_lumpy_vcf(raw_file, items[0])
            prep_file = vcfutils.sort_by_ref(clean_file, items[0])
            if not utils.file_exists(out_nogzip):
                utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
        return out_file
def _rnaseq_qualimap(bam_file, data, out_dir):
    """Run qualimap for a rnaseq bam file and parse results.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def tobam_cl(data, out_file, is_paired=False):
    """Prepare command line for producing de-duplicated sorted output.

    - If no deduplication, sort and prepare a BAM file.
    - If paired, then use samblaster and prepare discordant outputs.
    - If unpaired, use biobambam's bammarkduplicates.
    """
    do_dedup = _check_dedup(data)
    with file_transaction(data, out_file) as tx_out_file:
        if not do_dedup:
            yield (sam_to_sortbam_cl(data, tx_out_file), tx_out_file)
        elif is_paired and not _too_many_contigs(dd.get_ref_file(data)):
            sr_file = "%s-sr.bam" % os.path.splitext(out_file)[0]
            disc_file = "%s-disc.bam" % os.path.splitext(out_file)[0]
            with file_transaction(data, sr_file) as tx_sr_file:
                with file_transaction(data, disc_file) as tx_disc_file:
                    yield (samblaster_dedup_sort(data, tx_out_file, tx_sr_file, tx_disc_file),
                           tx_out_file)
        else:
            yield (_biobambam_dedup_sort(data, tx_out_file), tx_out_file)
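# Hedged usage sketch: as written above (a plain generator), callers can pull
# the single (command_line, transactional_path) pair while the file_transaction
# contexts remain open; in bcbio this pattern is typically wrapped with
# contextlib.contextmanager and consumed via `with` instead. The runner call
# below is a hypothetical stand-in, not a real bcbio function.
def _consume_tobam_cl_sketch(data, out_file, is_paired=False):
    for cl, tx_out_file in tobam_cl(data, out_file, is_paired):
        run_aligner_pipe(cl, tx_out_file)  # hypothetical: pipe aligner output into cl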
def _maybe_limit_chromosomes(data):
    """Potentially limit chromosomes to avoid problematically named HLA contigs.

    HLAs have ':' characters in them which confuse downstream processing. If
    we have no problematic chromosomes we don't limit anything.
    """
    std_chroms = []
    prob_chroms = []
    noalt_calling = ("noalt_calling" in dd.get_tools_on(data)
                     or "altcontigs" in dd.get_exclude_regions(data))
    for contig in ref.file_contigs(dd.get_ref_file(data)):
        if contig.name.find(":") > 0 or (noalt_calling and not chromhacks.is_nonalt(contig.name)):
            prob_chroms.append(contig.name)
        else:
            std_chroms.append(contig.name)
    if len(prob_chroms) > 0:
        return std_chroms
    else:
        return []
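# Hedged sketch (standalone, not bcbio code): the contig triage above boils
# down to this predicate. `is_nonalt` here is a deliberately simplified
# stand-in for chromhacks.is_nonalt ("no '_' in the name"), used only to make
# the example self-contained.
def _is_problem_contig_sketch(name, noalt_calling=False):
    def is_nonalt(n):  # simplified stand-in for chromhacks.is_nonalt
        return "_" not in n
    return name.find(":") > 0 or (noalt_calling and not is_nonalt(name))

assert _is_problem_contig_sketch("HLA-A*01:01:01:01")
assert not _is_problem_contig_sketch("chr1")
assert _is_problem_contig_sketch("chr6_GL000251v2_alt", noalt_calling=True)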
def run_kallisto_rnaseq(data):
    """Quantify transcript expression with kallisto from paired-end fastq inputs."""
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads; we can "
                 "add support for this if you open an issue about it here: "
                 "https://github.com/bcbio/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def remove_nonassembled_chrom(bam_file, data):
    """Remove non-assembled contigs from the BAM file."""
    ref_file = dd.get_ref_file(data)
    config = dd.get_config(data)
    fai = "%s.fai" % ref_file
    chrom = []
    with open(fai) as inh:
        for line in inh:
            c = line.split("\t")[0]
            # keep contigs without underscores, skipping *_random, chrUn_* and similar
            if c.find("_") < 0:
                chrom.append(c)
    chroms = " ".join(chrom)
    out_file = utils.append_stem(bam_file, '_chrom')
    samtools = config_utils.get_program("samtools", config)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out:
            cmd = "{samtools} view -b {bam_file} {chroms} > {tx_out}"
            do.run(cmd.format(**locals()), "Remove contigs from %s" % bam_file)
    bam.index(out_file, config)
    return out_file
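# Hedged sketch: the .fai parsing above only needs the first tab-separated
# column per line; this standalone version runs against an in-memory example
# of a samtools faidx index rather than a real reference.
import io

def _assembled_contigs_sketch(fai_handle):
    return [line.split("\t")[0] for line in fai_handle
            if line.split("\t")[0].find("_") < 0]

example_fai = io.StringIO("chr1\t248956422\t112\t70\t71\n"
                          "chr1_KI270706v1_random\t175055\t253105714\t70\t71\n"
                          "chrM\t16569\t253271623\t70\t71\n")
assert _assembled_contigs_sketch(example_fai) == ["chr1", "chrM"]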
def prepare_intervals(data, region_file, work_dir):
    """Prepare interval regions for targeted and gene based regions.
    """
    target_file = os.path.join(work_dir, "%s-target.interval_list" % dd.get_sample_name(data))
    if not utils.file_uptodate(target_file, region_file):
        with file_transaction(data, target_file) as tx_out_file:
            params = ["-T", "PreprocessIntervals",
                      "-R", dd.get_ref_file(data),
                      "--interval-merging-rule", "OVERLAPPING_ONLY",
                      "-O", tx_out_file]
            if dd.get_coverage_interval(data) == "genome":
                params += ["--bin-length", "1000", "--padding", "0"]
            else:
                params += ["-L", region_file, "--bin-length", "0", "--padding", "250"]
            _run_with_memory_scaling(params, tx_out_file, data)
    return target_file
def add_dbsnp(orig_file, dbsnp_file, data, out_file=None):
    """Annotate a VCF file with dbSNP.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            conf_file = os.path.join(os.path.dirname(tx_out_file), "dbsnp.conf")
            with open(conf_file, "w") as out_handle:
                out_handle.write('[[annotation]]\n')
                out_handle.write('file="%s"\n' % os.path.normpath(os.path.join(dd.get_work_dir(data),
                                                                               dbsnp_file)))
                out_handle.write('fields=["ID"]\n')
                out_handle.write('names=["rs_ids"]\n')
                out_handle.write('ops=["concat"]\n')
            cmd = ("vcfanno {conf_file} {orig_file} | "
                   "bcftools annotate --set-id +'%INFO/rs_ids' -o {tx_out_file} -O z")
            do.run(cmd.format(**locals()), "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])
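# Hedged sketch: the five write() calls above emit a minimal vcfanno TOML
# config like the one below (the dbSNP path is a hypothetical example).
# vcfanno concatenates matching dbSNP IDs into INFO/rs_ids, and the follow-on
# bcftools annotate step copies that INFO field into the VCF ID column.
example_dbsnp_conf = (
    '[[annotation]]\n'
    'file="/path/to/work/dbsnp-151.vcf.gz"\n'
    'fields=["ID"]\n'
    'names=["rs_ids"]\n'
    'ops=["concat"]\n'
)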
def run_rnaseq_joint_genotyping(*samples):
    """Joint genotype gVCFs from GATK RNA-seq calling across all samples."""
    data = samples[0][0]
    variantcaller = dd.get_variantcaller(data)
    if not variantcaller or "gatk" not in variantcaller:
        return samples
    ref_file = dd.get_ref_file(data)
    vrn_files = [dd.get_vrn_file(d) for d in dd.sample_data_iterator(samples)]
    out_file = variation.gatk_joint_calling(data, vrn_files, ref_file)
    vrn_file = vcfanno.run_vcfanno(out_file, ["rnaedit"], data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_square_vcf(data, vrn_file)
        updated_samples.append([data])
    return updated_samples
def variantcall_batch_region(items):
    """CWL entry point: variant call a batch of samples in a region.
    """
    items = [utils.to_single_data(x) for x in items]
    align_bams = [dd.get_align_bam(x) for x in items]
    variantcaller = _get_batch_variantcaller(items)
    region = list(set([x.get("region") for x in items if "region" in x]))
    assert len(region) == 1, region
    region = region[0]
    caller_fn = get_variantcallers()[variantcaller]
    assoc_files = tz.get_in(("genome_resources", "variation"), items[0], {})
    region = _region_to_coords(region)
    chrom, start, end = region
    region_str = "_".join(str(x) for x in region)
    batch_name = _get_batch_name(items)
    out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                            "%s-%s.vcf.gz" % (batch_name, region_str))
    utils.safe_makedir(os.path.dirname(out_file))
    call_file = caller_fn(align_bams, items, dd.get_ref_file(items[0]), assoc_files,
                          region, out_file)
    return {"vrn_file_region": call_file, "region": "%s:%s-%s" % (chrom, start, end)}
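# Hedged sketch: _region_to_coords is defined elsewhere in bcbio and not shown
# here; this hypothetical minimal version is consistent with how its output is
# used above (a (chrom, start, end) tuple joined with "_" for filenames and
# re-joined as "chrom:start-end" for the return value).
def _region_to_coords_sketch(region):
    chrom, coords = region.split(":")
    start, end = coords.split("-")
    return chrom, int(start), int(end)

assert _region_to_coords_sketch("chr1:1000-2000") == ("chr1", 1000, 2000)
assert "_".join(str(x) for x in _region_to_coords_sketch("chr1:1000-2000")) == "chr1_1000_2000"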
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from an input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(data, sr_file) as tx_sr_file:
            with file_transaction(data, disc_file) as tx_disc_file:
                cores = dd.get_num_cores(data)
                ref_file = dd.get_ref_file(data)
                cmd = ("extract-sv-reads -e --threads {cores} -T {ref_file} "
                       "-i {in_bam} -s {tx_sr_file} -d {tx_disc_file}")
                do.run(cmd.format(**locals()), "extract split and discordant reads", data)
    for fname in [sr_file, disc_file]:
        bam.index(fname, data["config"])
    return sr_file, disc_file
def cnvkit_background(background_cnns, out_file, items, target_bed=None, antitarget_bed=None):
    """Calculate background reference, handling flat case with no normal sample.
    """
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cmd = [_get_cmd(), "reference", "-f", dd.get_ref_file(items[0]), "-o", tx_out_file]
            genders = set([population.get_gender(x) for x in items])
            genders.discard("unknown")
            if len(genders) == 1:
                gender = genders.pop()
                cmd += ["--gender", gender]
                if gender.lower() == "male":
                    cmd += ["--male-reference"]
            if len(background_cnns) == 0:
                assert target_bed and antitarget_bed, "Missing CNNs and target BEDs for flat background"
                cmd += ["-t", target_bed, "-a", antitarget_bed]
            else:
                cmd += background_cnns
            do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background")
    return out_file
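# Hedged sketch (standalone): the gender handling above only adds
# --gender/--male-reference when every non-unknown sample agrees; the same
# consensus rule is isolated here as a pure function for illustration.
def _consensus_gender_sketch(genders):
    remaining = set(genders)
    remaining.discard("unknown")
    return remaining.pop() if len(remaining) == 1 else None

assert _consensus_gender_sketch(["male", "unknown", "male"]) == "male"
assert _consensus_gender_sketch(["male", "female"]) is None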
def _bgzip_from_cram_sambamba(cram_file, dirs, data):
    """Use sambamba to extract from CRAM via regions.
    """
    raise NotImplementedError("sambamba doesn't yet support retrieval from CRAM by BED file")
    # unreachable: retained as a template for when sambamba adds CRAM + BED support
    region_file = (tz.get_in(["config", "algorithm", "variant_regions"], data)
                   if tz.get_in(["config", "algorithm", "coverage_interval"], data)
                   in ["regional", "exome"] else None)
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep", "%s-parts" % base_name))
    f1, f2, o1, o2, si = [os.path.join(work_dir, "%s.fq" % x)
                          for x in ["match1", "match2", "unmatch1", "unmatch2", "single"]]
    ref_file = dd.get_ref_file(data)
    region = "-L %s" % region_file if region_file else ""
    cmd = ("sambamba view -f bam -l 0 -C {cram_file} -T {ref_file} {region} | "
           "bamtofastq F={f1} F2={f2} S={si} O={o1} O2={o2}")
    do.run(cmd.format(**locals()), "Convert CRAM to fastq in regions")
def run_salmon_reads(data):
    """Quantify transcript expression with salmon, converting BAM inputs to fastq if needed."""
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def add_genes(in_file, data, max_distance=10000, work_dir=None):
    """Add gene annotations to a BED file from pre-prepared RNA-seq data.

    max_distance -- only keep annotations within this distance of event
    """
    gene_file = regions.get_sv_bed(data, "exons", out_dir=os.path.dirname(in_file))
    if gene_file and utils.file_exists(in_file):
        out_file = "%s-annotated.bed" % utils.splitext_plus(in_file)[0]
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_uptodate(out_file, in_file):
            fai_file = ref.fasta_idx(dd.get_ref_file(data))
            with file_transaction(data, out_file) as tx_out_file:
                add_genes_to_bed(in_file, gene_file, fai_file, tx_out_file, max_distance)
        return out_file
    else:
        return in_file
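# Hedged sketch (standalone, independent of bedtools): the max_distance
# contract above means an annotation is kept only when its interval lies
# within max_distance of the event interval, illustrated with plain interval
# arithmetic on (start, end) tuples.
def _within_distance_sketch(event, annotation, max_distance=10000):
    (e_start, e_end), (a_start, a_end) = event, annotation
    gap = max(a_start - e_end, e_start - a_end, 0)
    return gap <= max_distance

assert _within_distance_sketch((100, 200), (150, 300))        # overlapping
assert _within_distance_sketch((100, 200), (5000, 6000))      # within 10kb
assert not _within_distance_sketch((100, 200), (50000, 60000))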
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data.
    """
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data)
    if not tools_on:
        tools_on = []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    data = dd.set_jointcaller(data, ["%s-joint" % v for v in dd.get_variantcaller(data)])
    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation",
                                                            "rnaseq", "gatk-haplotype")),
                            "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    out_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data], dd.get_ref_file(data),
                                     {}, out_file=out_file)
    return dd.set_vrn_file(data, out_file)
def variants(data):
    """Calculate GC content and depth at variant positions for QC reporting."""
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        in_bam = data.get("work_bam")
        cg_file = sample + "_with-gc.vcf.gz"
        parse_file = sample + "_gc-depth-parse.tsv"
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(data, cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(data, parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
            logger.debug("parsing coverage: %s" % sample)
    return data
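# Hedged sketch: the bcftools query call above appends tab-separated rows of
# GC, depth and sample under the "CG\tdepth\tsample" header written first; a
# downstream reader (hypothetical, not part of this file) could load the TSV
# like this.
import csv

def _read_gc_depth_sketch(parse_file):
    with open(parse_file) as in_handle:
        return list(csv.DictReader(in_handle, delimiter="\t"))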