def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without an svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
def organize(dirs, config, run_info_yaml): """Organize run information from a passed YAML file or the Galaxy API. Creates the high level structure used for subsequent processing. """ logger.info("Using input YAML configuration: %s" % run_info_yaml) assert run_info_yaml and os.path.exists(run_info_yaml), \ "Did not find input sample YAML file: %s" % run_info_yaml run_details = _run_info_from_yaml(dirs["flowcell"], run_info_yaml, config) out = [] for item in run_details: # add algorithm details to configuration, avoid double specification item["config"] = config_utils.update_w_custom(config, item) item.pop("algorithm", None) item["dirs"] = dirs if "name" not in item: item["name"] = ["", item["description"]] elif isinstance(item["name"], basestring): description = "%s-%s" % (item["name"], clean_name(item["description"])) item["name"] = [item["name"], description] item["description"] = description item = add_reference_resources(item) # Create temporary directories and make absolute if utils.get_in(item, ("config", "resources", "tmp", "dir")): utils.safe_makedir(utils.get_in(item, ("config", "resources", "tmp", "dir"))) item["config"]["resources"]["tmp"] = genome.abs_file_paths( utils.get_in(item, ("config", "resources", "tmp"))) out.append(item) return out
def run(items): """Perform detection of structural variations with delly. """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) work_bams = [data["align_bam"] for data in items] ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = len(items) config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} bytype_vcfs = run_multicore(_run_delly, [(work_bams, sv_type, ref_file, work_dir, items) for sv_type in ["DEL", "DUP", "INV", "TRA"]], config, parallel) out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs) delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"]) out = [] for data in items: if "sv" not in data: data["sv"] = {} data["sv"]["delly"] = delly_vcf out.append(data) return out
def align(fastq_file, pair_file, ref_file, names, align_dir, data): config = data["config"] out_prefix = os.path.join(align_dir, names["lane"]) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % names["lane"]) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): return final_out star_path = config_utils.get_program("STAR", config) fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file num_cores = config["algorithm"].get("num_cores", 1) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 " "--outSAMunmapped Within") cmd += _read_group_option(names) fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15" strandedness = get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded": cmd += " --outSAMstrandField intronMotif" run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file) do.run(cmd.format(**locals()), run_message, None) out_file = bam.sam_to_bam(out_file, config) out_file = _fix_sam_header(out_file, config) if not file_exists(final_out): symlink_plus(out_file, final_out) return final_out
def _start_processing(dname, sample_file, config): """Initiate processing: on a remote server or locally on a cluster. """ to_remote = _remap_dirname(dname, os.path.join(utils.get_in(config, ("process", "dir")), os.path.basename(dname))) args = { "work_dir": to_remote(os.path.join(dname, "analysis")), "run_config": to_remote(sample_file), "fc_dir": to_remote(dname), } # call a remote server if utils.get_in(config, ("process", "server")): print "%s/run?args=%s" % (utils.get_in(config, ("process", "server")), json.dumps(args)) requests.get(url="%s/run" % utils.get_in(config, ("process", "server")), params={"args": json.dumps(args)}) # submit to a cluster scheduler elif "submit_cmd" in config["process"] and "bcbio_batch" in config["process"]: with utils.chdir(utils.safe_makedir(args["work_dir"])): batch_script = "submit_bcbio.sh" with open(batch_script, "w") as out_handle: out_handle.write( config["process"]["bcbio_batch"].format(fcdir=args["fc_dir"], run_config=args["run_config"]) ) submit_cmd = utils.get_in(config, ("process", "submit_cmd")) subprocess.check_call(submit_cmd.format(batch_script=batch_script), shell=True) else: raise ValueError("Unexpected processing approach: %s" % config["process"])
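# A minimal sketch of the two "process" configurations the dispatcher above handles.
# All values are hypothetical; only the key names mirror the utils.get_in lookups and
# the .format() placeholders used in _start_processing.
REMOTE_SERVER_CONFIG = {"process": {"dir": "/mnt/analysis/incoming",
                                    "server": "http://bcbio-server.example.org:8080"}}
CLUSTER_CONFIG = {"process": {"dir": "/mnt/analysis/incoming",
                              "submit_cmd": "sbatch {batch_script}",
                              "bcbio_batch": ("#!/bin/bash\n"
                                              "#SBATCH --cpus-per-task=16\n"
                                              "bcbio_nextgen.py {run_config} --workdir {fcdir}/analysis\n")}}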
def copy_flowcell(dname, fastq_dir, sample_cfile, config): """Copy required files for processing using rsync, potentially to a remote server. """ with utils.chdir(dname): reports = reduce(operator.add, [glob.glob("*.xml"), glob.glob("Data/Intensities/BaseCalls/*.xml"), glob.glob("Data/Intensities/BaseCalls/*.xsl"), glob.glob("Data/Intensities/BaseCalls/*.htm"), ["Data/Intensities/BaseCalls/Plots", "Data/reports", "Data/Status.htm", "Data/Status_Files", "InterOp"]]) run_info = reduce(operator.add, [glob.glob("run_info.yaml"), glob.glob("*.csv")]) fastq = glob.glob(os.path.join(fastq_dir.replace(dname + "/", "", 1), "*.gz")) configs = [sample_cfile.replace(dname + "/", "", 1)] include_file = os.path.join(dname, "transfer_files.txt") with open(include_file, "w") as out_handle: out_handle.write("+ */\n") for fname in configs + fastq + run_info + reports: out_handle.write("+ %s\n" % fname) out_handle.write("- *\n") # remote transfer if utils.get_in(config, ("process", "host")): dest = "%s@%s:%s" % (utils.get_in(config, ("process", "username")), utils.get_in(config, ("process", "host")), utils.get_in(config, ("process", "dir"))) # local transfer else: dest = utils.get_in(config, ("process", "dir")) cmd = ["rsync", "-akmrtv", "--include-from=%s" % include_file, dname, dest] logger.info("Copying files to analysis machine") logger.info(" ".join(cmd)) subprocess.check_call(cmd)
def _af_filter(data, in_file, out_file): """Soft-filter variants with AF below min_allele_fraction (appends "MinAF" to FILTER) """ min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0 logger.debug("Filtering MuTect2 calls with allele fraction threshold of %s" % min_freq) ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0] if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"): with file_transaction(data, ungz_out_file) as tx_out_file: vcf = cyvcf2.VCF(in_file) vcf.add_filter_to_header({ 'ID': 'MinAF', 'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + ( '(configured in bcbio as min_allele_fraction)' if utils.get_in(data["config"], ("algorithm", "min_allele_fraction")) else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')}) w = cyvcf2.Writer(tx_out_file, vcf) # GATK 3.x can produce VCFs without sample names for empty VCFs try: tumor_index = vcf.samples.index(dd.get_sample_name(data)) except ValueError: tumor_index = None for rec in vcf: if tumor_index is not None and np.all(rec.format('AF')[tumor_index] < min_freq): vcfutils.cyvcf_add_filter(rec, 'MinAF') w.write_record(rec) w.close() return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _shared_gatk_call_prep(align_bams, items, ref_file, dbsnp, region, out_file): """Shared preparation work for GATK variant calling. """ config = items[0]["config"] broad_runner = broad.runner_from_config(config) broad_runner.run_fn("picard_index_ref", ref_file) for x in align_bams: bam.index(x, config) # GATK can only downsample to a minimum of 200 coverage_depth_max = max(200, utils.get_in(config, ("algorithm", "coverage_depth_max"), 2000)) coverage_depth_min = utils.get_in(config, ("algorithm", "coverage_depth_min"), 4) variant_regions = config["algorithm"].get("variant_regions", None) confidence = "4.0" if coverage_depth_min < 4 else "30.0" region = subset_variant_regions(variant_regions, region, out_file, items) params = ["-R", ref_file, "--standard_min_confidence_threshold_for_calling", confidence, "--standard_min_confidence_threshold_for_emitting", confidence, "--downsample_to_coverage", str(coverage_depth_max), "--downsampling_type", "BY_SAMPLE", ] for a in annotation.get_gatk_annotations(config): params += ["--annotation", a] for x in align_bams: params += ["-I", x] if dbsnp: params += ["--dbsnp", dbsnp] if region: params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"] return broad_runner, params
def _freebayes_options_from_config(items, config, out_file, region=None): """Prepare standard options from configuration input. Input BED target files are merged to avoid overlapping regions which cause FreeBayes to call multiple times. """ opts = [] opts += ["--ploidy", str(ploidy.get_ploidy(items, region))] variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")), items[0]) target = subset_variant_regions(variant_regions, region, out_file, items) if target: if isinstance(target, basestring) and os.path.isfile(target): opts += ["--targets", target] else: opts += ["--region", region_to_freebayes(target)] resources = config_utils.get_resources("freebayes", config) if resources.get("options"): opts += resources["options"] if "--min-alternate-fraction" not in " ".join(opts) and "-F" not in " ".join(opts): # add minimum reportable allele frequency, for which FreeBayes defaults to 20 min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0 opts += ["--min-alternate-fraction", str(min_af)] return opts
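# Illustration of the min_allele_fraction handling above: the percentage from the
# algorithm configuration (default 20 here) becomes a fraction on the FreeBayes
# command line; the value is hypothetical.
min_allele_fraction = 20            # algorithm-section value, in percent
min_af = float(min_allele_fraction) / 100.0
opts = ["--min-alternate-fraction", str(min_af)]
print(opts)                         # ['--min-alternate-fraction', '0.2']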
def get_db(data): """Retrieve a snpEff database name and location relative to reference file. """ snpeff_db = utils.get_in(data, ("genome_resources", "aliases", "snpeff")) snpeff_base_dir = None if snpeff_db: snpeff_base_dir = utils.get_in(data, ("reference", "snpeff")) if not (isinstance(snpeff_base_dir, six.string_types) and os.path.isdir(snpeff_base_dir)): snpeff_base_dir = utils.get_in(data, ("reference", "snpeff", snpeff_db)) if not snpeff_base_dir: # We need to mask '.' characters for CWL/WDL processing, check for them here snpeff_base_dir = utils.get_in(data, ("reference", "snpeff", snpeff_db.replace(".", "_"))) if snpeff_base_dir: snpeff_db = snpeff_db.replace("_", ".") if isinstance(snpeff_base_dir, dict) and snpeff_base_dir.get("base"): snpeff_base_dir = snpeff_base_dir["base"] if (snpeff_base_dir and isinstance(snpeff_base_dir, six.string_types) and os.path.isfile(snpeff_base_dir)): snpeff_base_dir = os.path.dirname(snpeff_base_dir) if (snpeff_base_dir and isinstance(snpeff_base_dir, six.string_types) and snpeff_base_dir.endswith("%s%s" % (os.path.sep, snpeff_db))): snpeff_base_dir = os.path.dirname(snpeff_base_dir) if not snpeff_base_dir: ref_file = utils.get_in(data, ("reference", "fasta", "base")) snpeff_base_dir = utils.safe_makedir(os.path.normpath(os.path.join( os.path.dirname(os.path.dirname(ref_file)), "snpeff"))) # back compatible retrieval of genome from installation directory if "config" in data and not os.path.exists(os.path.join(snpeff_base_dir, snpeff_db)): snpeff_base_dir, snpeff_db = _installed_snpeff_genome(snpeff_db, data["config"]) if snpeff_base_dir.endswith("/%s" % snpeff_db): snpeff_base_dir = os.path.dirname(snpeff_base_dir) return snpeff_db, snpeff_base_dir
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome, base_dir, data): """Create a bcbio.variation configuration input for validation. """ if rm_genome: rm_genome = utils.get_in(data, ("reference", "alt", rm_genome, "base")) if rm_genome and rm_genome != utils.get_in(data, ("reference", "fasta", "base")): eval_genome = utils.get_in(data, ("reference", "fasta", "base")) else: rm_genome = utils.get_in(data, ("reference", "fasta", "base")) eval_genome = None ref_call = {"file": str(rm_file), "name": "ref", "type": "grading-ref", "preclean": True, "prep": True, "remove-refcalls": True} a_intervals = get_analysis_intervals(data) if rm_interval_file: ref_call["intervals"] = rm_interval_file eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True} if eval_genome: eval_call["ref"] = eval_genome eval_call["preclean"] = True eval_call["prep"] = True if a_intervals and eval_genome: eval_call["intervals"] = os.path.abspath(a_intervals) exp = {"sample": data["name"][-1], "ref": rm_genome, "approach": "grade", "calls": [ref_call, eval_call]} if a_intervals and not eval_genome: exp["intervals"] = os.path.abspath(a_intervals) if data.get("callable_bam") and not eval_genome: exp["align"] = data["callable_bam"] return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"}, "experiments": [exp]}
def should_run_fusion(with_caller, config): fusion_mode = dd.get_fusion_mode(config) or \ utils.get_in(config, ("algorithm", "fusion_mode"), False) fusion_caller = dd.get_fusion_caller(config) or \ utils.get_in(config, ("algorithm", "fusion_caller"), None) return fusion_mode and fusion_caller in (None, with_caller)
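# Plain-Python restatement of the gating rule above, for illustration only (the helper
# name _should_run_fusion_example is not part of bcbio): fusion must be enabled, and the
# configured caller must be unset or match the caller being asked about.
def _should_run_fusion_example(with_caller, fusion_mode, fusion_caller):
    return bool(fusion_mode) and fusion_caller in (None, with_caller)

assert _should_run_fusion_example("oncofuse", True, None)
assert _should_run_fusion_example("pizzly", True, "pizzly")
assert not _should_run_fusion_example("pizzly", True, "oncofuse")
assert not _should_run_fusion_example("oncofuse", False, None)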
def _scalpel_options_from_config(items, config, out_file, region, tmp_path):
    opts = []
    opts += ["--format", "vcf", "--intarget"]  # output VCF, report only variants within BED regions
    variant_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--bed", target]
        else:
            tmp_bed = os.path.join(tmp_path, "tmp.bed")
            with file_transaction(tmp_bed) as tx_tmp_bed:
                if not isinstance(region, (list, tuple)):
                    message = ("Region must be a tuple - something odd just happened")
                    raise ValueError(message)
                chrom, start, end = region
                with open(tx_tmp_bed, "w") as out_handle:
                    print("%s\t%s\t%s" % (chrom, start, end), file=out_handle)
            opts += ["--bed", tmp_bed]
    resources = config_utils.get_resources("scalpel", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--outratio" not in " ".join(opts):
        # add minimum reportable allele frequency, for which Scalpel defaults to 5
        # but other somatic tools in bcbio default to 10
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
        opts += ["--outratio", str(min_af)]
    return opts
def run(items): """Perform detection of structural variations with delly. """ work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) work_bams = [data["align_bam"] for data in items] ref_file = utils.get_in(items[0], ("reference", "fasta", "base")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = len(items) config["resources"]["delly"] = delly_config parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"]} sv_types = ["DEL", "DUP", "INV"] # "TRA" has invalid VCF END specifications that GATK doesn't like with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam: bytype_vcfs = run_multicore(_run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items) for (chrom, sv_type) in itertools.product(pysam_work_bam.references, sv_types)], config, parallel) out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs) delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"]) out = [] for data in items: if "sv" not in data: data["sv"] = {} data["sv"]["delly"] = delly_vcf out.append(data) return out
def align(fastq_file, pair_file, ref_file, names, align_dir, data): config = data["config"] out_prefix = os.path.join(align_dir, names["lane"]) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % names["lane"]) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): return final_out star_path = config_utils.get_program("STAR", config) fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file num_cores = config["algorithm"].get("num_cores", 1) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 " "--outStd SAM " "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS)) cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd cmd += _read_group_option(names) fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15" strandedness = get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded": cmd += " --outSAMstrandField intronMotif " sam_to_bam = bam.sam_to_bam_stream_cmd(config) sort = bam.sort_cmd(config) cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} " run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file) with file_transaction(final_out) as tx_final_out: do.run(cmd.format(**locals()), run_message, None) return final_out
def _SID_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Preparation work for SomaticIndelDetector.
    """
    base_config = items[0]["config"]
    for x in align_bams:
        bam.index(x, base_config)

    params = ["-R", ref_file, "-T", "SomaticIndelDetector", "-U", "ALLOW_N_CIGAR_READS"]
    # Limit the per-base read start count to between 200 and 10000, i.e. no more
    # than 10000 new reads may begin at any single base.
    # Also limit maxNumberOfReads accordingly, otherwise SID discards
    # windows for high-coverage panels.
    window_size = 200  # default SID value
    paired = vcfutils.get_paired_bams(align_bams, items)
    max_depth = min(max(200, get_in(paired.tumor_config,
                                    ("algorithm", "coverage_depth_max"), 10000)), 10000)
    params += ["--downsample_to_coverage", max_depth]
    params += ["--maxNumberOfReads", str(int(max_depth) * window_size)]
    params += ["--read_filter", "NotPrimaryAlignment"]
    params += ["-I:tumor", paired.tumor_bam]
    min_af = float(get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
    if paired.normal_bam is not None:
        params += ["-I:normal", paired.normal_bam]
        # note that the normal must have at least 4 reads of coverage
        params += ["--filter_expressions", "T_COV<6||N_COV<4||T_INDEL_F<%s||T_INDEL_CF<0.7" % min_af]
    else:
        params += ["--unpaired"]
        params += ["--filter_expressions", "COV<6||INDEL_F<%s||INDEL_CF<0.7" % min_af]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule", "INTERSECTION"]
    return params
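# Worked example of the SomaticIndelDetector depth caps computed above, using the
# function's own defaults (10000 for coverage_depth_max, window size of 200).
window_size = 200
configured_max = 10000
max_depth = min(max(200, configured_max), 10000)   # -> 10000
max_reads = int(max_depth) * window_size           # -> 2000000, passed to --maxNumberOfReads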
def add_reference_resources(data, remote_retriever=None): """Add genome reference information to the item to process. """ aligner = data["config"]["algorithm"].get("aligner", None) if remote_retriever: data["reference"] = remote_retriever.get_refs(data["genome_build"], aligner, data["config"]) else: data["reference"] = genome.get_refs(data["genome_build"], aligner, data["dirs"]["galaxy"], data) _check_ref_files(data["reference"], data) # back compatible `sam_ref` target data["sam_ref"] = utils.get_in(data, ("reference", "fasta", "base")) ref_loc = utils.get_in(data, ("config", "resources", "species", "dir"), utils.get_in(data, ("reference", "fasta", "base"))) if remote_retriever: data = remote_retriever.get_resources(data["genome_build"], ref_loc, data) else: data["genome_resources"] = genome.get_resources(data["genome_build"], ref_loc, data) if effects.get_type(data) == "snpeff" and "snpeff" not in data["reference"]: data["reference"]["snpeff"] = effects.get_snpeff_files(data) data = _fill_validation_targets(data) data = _fill_prioritization_targets(data) # Re-enable when we have ability to re-define gemini configuration directory if False: if population.do_db_build([data], need_bam=False): data["reference"]["gemini"] = population.get_gemini_files(data) return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd SAM {srna_opts} "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinBAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def get_lcr_bed(items): lcr_bed = utils.get_in(items[0], ("genome_resources", "variation", "lcr")) do_lcr = any([ utils.get_in(data, ("config", "algorithm", "remove_lcr"), False) for data in items ]) if do_lcr and lcr_bed and os.path.exists(lcr_bed): return lcr_bed
def _debug_samples(i, samples): print "---", i, len(samples) for sample in (x[0] for x in samples): print " ", sample["description"], sample.get("region"), \ utils.get_in(sample, ("config", "algorithm", "variantcaller")), \ utils.get_in(sample, ("config", "algorithm", "jointcaller")), \ [x.get("variantcaller") for x in sample.get("variants", [])], \ sample.get("work_bam")
def _debug_samples(i, samples): print("---", i, len(samples)) for sample in (utils.to_single_data(x) for x in samples): print(" ", sample["description"], sample.get("region"), \ utils.get_in(sample, ("config", "algorithm", "variantcaller")), \ utils.get_in(sample, ("config", "algorithm", "jointcaller")), \ utils.get_in(sample, ("metadata", "batch")), \ [x.get("variantcaller") for x in sample.get("variants", [])], \ sample.get("work_bam"), \ sample.get("vrn_file"))
def get_max_counts(samples):
    """Retrieve the maximum number of regions (BED lines) from a set of callable region files.
    """
    bed_files = list(set(utils.get_in(x[0], ("config", "algorithm", "callable_regions"))
                         for x in samples))
    bed_files = filter(lambda x: x is not None, bed_files)
    if not bed_files:
        bed_files = list(set(utils.get_in(x[0], ("config", "algorithm", "variant_regions"))
                             for x in samples))
    return max(sum(1 for line in open(f)) for f in bed_files if f)
def align(fastq_file, pair_file, ref_file, names, align_dir, data): max_hits = 10 srna = True if data["analysis"].lower().startswith("smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] out_prefix = os.path.join(align_dir, dd.get_lane(data)) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data)) if not ref_file: logger.error("STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): data = _update_data(final_out, out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd SAM {srna_opts} " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else "" cmd += _read_group_option(names) fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15" strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data): cmd += " --quantMode TranscriptomeSAM " with file_transaction(data, final_out) as tx_final_out: cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(final_out, out_dir, names, data) return data
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: for align_bam in align_bams: bam.index(align_bam, config) num_bams = len(align_bams) sample_vcf_names = [] # for individual sample names, given batch calling may be required for bamfile, item in itertools.izip(align_bams, items): # prepare commands vardict = config_utils.get_program("vardict", config) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = " ".join(_vardict_options_from_config(items, config, out_file, region)) vcfallelicprimitives = config_utils.get_program("vcfallelicprimitives", config) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if coverage_interval == "regional" else "" fix_ambig = vcfutils.fix_ambiguous_cl() sample = item["name"][1] cmd = ("{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {vcfallelicprimitives} | {vcfstreamsort} {compress_cmd}") if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"), ref_file, config) return ann_file
def _af_annotate_and_filter(paired, items, in_file, out_file): """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields: somatic snps: GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU dp=DP {ALT}U[0] = alt_counts(tier1,tier2) indels: GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50 dp=DP TIR = alt_counts(tier1,tier2) germline snps: GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS) dp=sum(alt_counts) AD = ref_count,alt_counts indels: GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS) dp=sum(alt_counts) AD = ref_count,alt_counts """ data = paired.tumor_data if paired else items[0] min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0 logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq) ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0] if not utils.file_exists(ungz_out_file) and not utils.file_exists(ungz_out_file + ".gz"): with file_transaction(data, ungz_out_file) as tx_out_file: vcf = cyvcf2.VCF(in_file) vcf.add_format_to_header({ 'ID': 'AF', 'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), ' 'TIR/DPI (somatic indels)', 'Type': 'Float', 'Number': '.'}) vcf.add_filter_to_header({ 'ID': 'MinAF', 'Description': 'Allele frequency is lower than %s%% ' % (min_freq*100) + ( '(configured in bcbio as min_allele_fraction)' if utils.get_in(data["config"], ("algorithm", "min_allele_fraction")) else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)')}) w = cyvcf2.Writer(tx_out_file, vcf) tumor_index = vcf.samples.index(data['description']) for rec in vcf: if paired: # somatic? if rec.is_snp: # snps? alt_counts = rec.format(rec.ALT[0] + 'U')[:,0] # {ALT}U=tier1_depth,tier2_depth else: # indels alt_counts = rec.format('TIR')[:,0] # TIR=tier1_depth,tier2_depth dp = rec.format('DP')[:,0] elif rec.format("AD") is not None: # germline? alt_counts = rec.format('AD')[:,1:] # AD=REF,ALT1,ALT2,... dp = np.sum(rec.format('AD')[:,0:], axis=1)[:, None] else: # germline gVCF record alt_counts, dp = (None, None) if dp is not None: with np.errstate(divide='ignore', invalid='ignore'): # ignore division by zero and put AF=.0 af = np.true_divide(alt_counts, dp) af[~np.isfinite(af)] = .0 # -inf inf NaN -> .0 rec.set_format('AF', af) if paired and np.all(af[tumor_index] < min_freq): vcfutils.cyvcf_add_filter(rec, 'MinAF') w.write_record(rec) w.close() return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
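# Standalone sketch of the AF computation used above; only numpy is required and the
# count/depth arrays are made up (they are not real Strelka2 output).
import numpy as np

alt_counts = np.array([[3.0], [0.0]])   # per-sample ALT support, one column per allele
dp = np.array([[30.0], [0.0]])          # per-sample depth; the zero forces a 0/0 division
with np.errstate(divide='ignore', invalid='ignore'):
    af = np.true_divide(alt_counts, dp)
af[~np.isfinite(af)] = .0               # -inf/inf/NaN from zero depth become 0.0
print(af)                               # [[0.1], [0.]]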
def _start_processing(dname, sample_file, config): """Initiate processing on the remote server. """ to_remote = _remap_dirname(dname, os.path.join(utils.get_in(config, ("process", "dir")), os.path.basename(dname))) args = {"work_dir": to_remote(os.path.join(dname, "analysis")), "run_config": to_remote(sample_file), "fc_dir": to_remote(dname)} print "%s/run?args=%s" % (utils.get_in(config, ("process", "server")), json.dumps(args)) requests.get(url="%s/run" % utils.get_in(config, ("process", "server")), params={"args": json.dumps(args)})
def align(fastq_file, pair_file, ref_file, names, align_dir, data): config = data["config"] out_prefix = os.path.join(align_dir, dd.get_lane(data)) out_file = out_prefix + "Aligned.out.sam" out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data)) if not ref_file: logger.error("STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"])) if file_exists(final_out): data = _update_data(final_out, out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file num_cores = config["algorithm"].get("num_cores", 1) safe_makedir(align_dir) cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} " "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 " "--outStd SAM " "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS)) cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd cmd += _read_group_option(names) fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15" strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded": cmd += " --outSAMstrandField intronMotif " if dd.get_rsem(data) and not is_transcriptome_broken(): cmd += " --quantMode TranscriptomeSAM " with tx_tmpdir(data) as tmp_dir: sam_to_bam = bam.sam_to_bam_stream_cmd(config) sort = bam.sort_cmd(config, tmp_dir) cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} " run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) with file_transaction(data, final_out) as tx_final_out: do.run(cmd.format(**locals()), run_message, None) data = _update_data(final_out, out_dir, names, data) return data
def remove_lcr_regions(orig_bed, items): """If configured and available, update a BED file to remove low complexity regions. """ lcr_bed = utils.get_in(items[0], ("genome_resources", "variation", "lcr")) do_lcr = any([utils.get_in(data, ("config", "algorithm", "remove_lcr"), False) for data in items]) if lcr_bed and do_lcr and os.path.exists(lcr_bed): nolcr_bed = os.path.join("%s-nolcr.bed" % (utils.splitext_plus(orig_bed)[0])) with file_transaction(nolcr_bed) as tx_nolcr_bed: pybedtools.BedTool(orig_bed).subtract(pybedtools.BedTool(lcr_bed)).saveas(tx_nolcr_bed) # If we have a non-empty file, convert to the LCR subtracted for downstream analysis if utils.file_exists(nolcr_bed): orig_bed = nolcr_bed return orig_bed
def _set_transcriptome_option(options, data, ref_file): # prefer transcriptome-index vs a GTF file if available transcriptome_index = get_in(data, ("genome_resources", "rnaseq", "transcriptome_index", "tophat")) fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False) if transcriptome_index and file_exists(transcriptome_index) and not fusion_mode: options["transcriptome-index"] = os.path.splitext(transcriptome_index)[0] return options gtf_file = data["genome_resources"]["rnaseq"].get("transcripts") if gtf_file: options["GTF"] = gtf_file return options return options
def get_db(data): """Retrieve a snpEff database name and location relative to reference file. """ snpeff_db = utils.get_in(data, ("genome_resources", "aliases", "snpeff")) snpeff_base_dir = None if snpeff_db: snpeff_base_dir = utils.get_in(data, ("reference", "snpeff", snpeff_db, "base")) if not snpeff_base_dir: ref_file = utils.get_in(data, ("reference", "fasta", "base")) snpeff_base_dir = utils.safe_makedir(os.path.normpath(os.path.join( os.path.dirname(os.path.dirname(ref_file)), "snpeff"))) # back compatible retrieval of genome from installation directory if "config" in data and not os.path.exists(os.path.join(snpeff_base_dir, snpeff_db)): snpeff_base_dir, snpeff_db = _installed_snpeff_genome(snpeff_db, data["config"]) return snpeff_db, snpeff_base_dir
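# Illustration of the fallback snpEff directory derivation above when no explicit
# reference entry exists; the genome path is hypothetical and safe_makedir is omitted.
import os
ref_file = "/data/genomes/Hsapiens/GRCh37/seq/GRCh37.fa"
snpeff_base_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(ref_file)), "snpeff"))
print(snpeff_base_dir)  # /data/genomes/Hsapiens/GRCh37/snpeff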
def _get_variant_file(x, key, suffix="", sample=None): """Retrieve VCF file with the given key if it exists, handling bgzipped. """ out = [] fname = utils.get_in(x, key) upload_key = list(key) upload_key[-1] = "do_upload" do_upload = tz.get_in(tuple(upload_key), x, True) if fname and do_upload: if fname.endswith(".vcf.gz"): out.append({"path": fname, "type": "vcf.gz", "ext": "%s%s" % (x["variantcaller"], suffix), "variantcaller": x["variantcaller"]}) if utils.file_exists(fname + ".tbi"): out.append({"path": fname + ".tbi", "type": "vcf.gz.tbi", "index": True, "ext": "%s%s" % (x["variantcaller"], suffix), "variantcaller": x["variantcaller"]}) elif fname.endswith((".vcf", ".bed", ".bedpe", ".bedgraph", ".cnr", ".cns", ".cnn", ".txt", ".tsv")): ftype = utils.splitext_plus(fname)[-1][1:] if ftype == "txt": extended_ftype = fname.split("-")[-1] if "/" not in extended_ftype: ftype = extended_ftype out.append({"path": fname, "type": ftype, "ext": "%s%s" % (x["variantcaller"], suffix), "variantcaller": x["variantcaller"]}) if sample: out_sample = [] for x in out: x["sample"] = sample out_sample.append(x) return out_sample else: return out
def _run_qc_tools(bam_file, data): """Run a set of third party quality control tools, returning QC directory and metrics. """ to_run = [("fastqc", _run_fastqc)] if data["analysis"].lower() == "rna-seq": to_run.append(("rnaseqc", bcbio.rnaseq.qc.sample_summary)) to_run.append(("complexity", _run_complexity)) elif data["analysis"].lower() == "chip-seq": to_run.append(["bamtools", _run_bamtools_stats]) else: to_run += [("bamtools", _run_bamtools_stats), ("gemini", _run_gemini_stats)] qc_dir = utils.safe_makedir( os.path.join(data["dirs"]["work"], "qc", data["name"][-1])) metrics = {} for program_name, qc_fn in to_run: cur_qc_dir = os.path.join(qc_dir, program_name) cur_metrics = qc_fn(bam_file, data, cur_qc_dir) metrics.update(cur_metrics) metrics["Name"] = data["name"][-1] metrics["Quality format"] = utils.get_in( data, ("config", "algorithm", "quality_format"), "standard").lower() return {"qc": qc_dir, "metrics": metrics}
def gatk_snp_hard(in_file, data): """Perform hard filtering on GATK SNPs using best-practice recommendations. We have a more lenient mapping quality (MQ) filter compared to GATK defaults. The recommended filter (MQ < 40) is too stringent, so we adjust to 30: http://imgur.com/a/oHRVB QD and FS are not calculated when generating gVCF output: https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300 """ filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0"] if "gvcf" not in dd.get_tools_on(data): filters += ["QD < 2.0", "FS > 60.0"] # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores # resulting in excessive filtering, so avoid this metric variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"), "gatk") if variantcaller not in ["gatk-haplotype"]: filters.append("HaplotypeScore > 13.0") return hard_w_expression(in_file, 'TYPE="snp" && (%s)' % " || ".join(filters), data, "GATKHardSNP", "SNP")
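# Illustrative only: the combined expression string gatk_snp_hard builds for non-gVCF
# input from a caller other than gatk-haplotype (filters as listed above).
filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0",
           "QD < 2.0", "FS > 60.0", "HaplotypeScore > 13.0"]
expression = 'TYPE="snp" && (%s)' % " || ".join(filters)
print(expression)
# TYPE="snp" && (MQ < 30.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0 || QD < 2.0 || FS > 60.0 || HaplotypeScore > 13.0)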
def hard_w_expression(vcf_file, expression, data, name="+", filterext="", extra_cmd="", limit_regions="variant_regions"): """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4. """ base, ext = utils.splitext_plus(vcf_file) out_file = "{base}-filter{filterext}{ext}".format(**locals()) if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: if vcfutils.vcf_has_variants(vcf_file): bcftools = config_utils.get_program("bcftools", data["config"]) bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else "" variant_regions = (utils.get_in(data, ("config", "algorithm", "variant_regions")) if limit_regions == "variant_regions" else None) intervals = ("-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"]) if variant_regions else "") cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' " "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Hard filtering %s with %s" % (vcf_file, expression), data) else: shutil.copy(vcf_file, out_file) if out_file.endswith(".vcf.gz"): out_file = vcfutils.bgzip_and_index(out_file, data["config"]) return out_file
def _collapse_by_bam_variantcaller(samples): """Collapse regions to a single representative by BAM input, variant caller and batch. """ by_bam = collections.OrderedDict() for data in (x[0] for x in samples): work_bam = utils.get_in(data, ("combine", "work_bam", "out"), data.get("align_bam")) variantcaller = get_variantcaller(data) if isinstance(work_bam, list): work_bam = tuple(work_bam) key = (multi.get_batch_for_key(data), work_bam, variantcaller) try: by_bam[key].append(data) except KeyError: by_bam[key] = [data] out = [] for grouped_data in by_bam.values(): cur = grouped_data[0] cur.pop("region", None) region_bams = cur.pop("region_bams", None) if region_bams and len(region_bams[0]) > 1: cur.pop("work_bam", None) out.append([cur]) return out
def _submit_and_wait(cmd, cores, config, output_dir): """Submit command with batch script specified in configuration, wait until finished """ batch_script = "submit_bcl2fastq.sh" if not os.path.exists(batch_script + ".finished"): if os.path.exists(batch_script + ".failed"): os.remove(batch_script + ".failed") with open(batch_script, "w") as out_handle: out_handle.write(config["process"]["bcl2fastq_batch"].format( cores=cores, bcl2fastq_cmd=" ".join(cmd), batch_script=batch_script)) submit_cmd = utils.get_in(config, ("process", "submit_cmd")) subprocess.check_call(submit_cmd.format(batch_script=batch_script), shell=True) # wait until finished or failure checkpoint file while 1: if os.path.exists(batch_script + ".finished"): break if os.path.exists(batch_script + ".failed"): raise ValueError("bcl2fastq batch script failed: %s" % os.path.join(output_dir, batch_script)) time.sleep(5)
def _get_vcf(x, key): """Retrieve VCF file with the given key if it exists, handling bgzipped. """ out = [] fname = utils.get_in(x, key) if fname: if fname.endswith(".gz"): out.append({"path": fname, "type": "vcf.gz", "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) if utils.file_exists(fname + ".tbi"): out.append({"path": fname + ".tbi", "type": "vcf.gz.tbi", "index": True, "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) else: out.append({"path": fname, "type": "vcf", "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) return out
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read and discordant alignments from an input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file) or not utils.file_exists(dedup_file):
        with utils.curdir_tmpdir(data) as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def _mutect_call_prep(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Preparation work for MuTect. """ base_config = items[0]["config"] broad_runner = broad.runner_from_config(base_config, "mutect") _check_mutect_version(broad_runner) broad_runner.run_fn("picard_index_ref", ref_file) for x in align_bams: bam.index(x, base_config) paired = vcfutils.get_paired_bams(align_bams, items) params = ["-R", ref_file, "-T", "MuTect", "-U", "ALLOW_N_CIGAR_READS"] params += [ "--downsample_to_coverage", max( 200, get_in(paired.tumor_config, ("algorithm", "coverage_depth_max"), 10000)) ] params += [ "--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment" ] params += ["-I:tumor", paired.tumor_bam] params += ["--tumor_sample_name", paired.tumor_name] if paired.normal_bam is not None: params += ["-I:normal", paired.normal_bam] params += ["--normal_sample_name", paired.normal_name] if paired.normal_panel is not None: params += ["--normal_panel", paired.normal_panel] params += _config_params(base_config, assoc_files, region, out_file) return broad_runner, params
def _get_variant_file(x, key): """Retrieve VCF file with the given key if it exists, handling bgzipped. """ out = [] fname = utils.get_in(x, key) if fname: if fname.endswith(".vcf.gz"): out.append({"path": fname, "type": "vcf.gz", "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) if utils.file_exists(fname + ".tbi"): out.append({"path": fname + ".tbi", "type": "vcf.gz.tbi", "index": True, "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) elif fname.endswith((".vcf", ".bed", ".bedpe")): ftype = utils.splitext_plus(fname)[-1][1:] out.append({"path": fname, "type": ftype, "ext": x["variantcaller"], "variantcaller": x["variantcaller"]}) return out
def run(self, config, config_file, parallel, dirs, samples): with prun.start(_wres(parallel, ["picard", "AlienTrimmer"]), samples, config, dirs, "trimming") as run_parallel: with profile.report("adapter trimming", dirs): samples = run_parallel("process_lane", samples) samples = run_parallel("trim_lane", samples) with prun.start(_wres(parallel, ["aligner"], ensure_mem={"tophat": 8, "tophat2": 8, "star": 30}), samples, config, dirs, "multicore", multiplier=alignprep.parallel_multiplier(samples)) as run_parallel: with profile.report("alignment", dirs): samples = disambiguate.split(samples) samples = run_parallel("process_alignment", samples) with prun.start(_wres(parallel, ["samtools", "cufflinks"]), samples, config, dirs, "rnaseqcount") as run_parallel: with profile.report("disambiguation", dirs): samples = disambiguate.resolve(samples, run_parallel) with profile.report("estimate expression", dirs): samples = rnaseq.estimate_expression(samples, run_parallel) combined = combine_count_files([x[0].get("count_file") for x in samples]) gtf_file = utils.get_in(samples[0][0], ('genome_resources', 'rnaseq', 'transcripts'), None) annotated = annotate_combined_count_file(combined, gtf_file) for x in samples: x[0]["combined_counts"] = combined if annotated: x[0]["annotated_combined_counts"] = annotated with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc"]), samples, config, dirs, "persample") as run_parallel: with profile.report("quality control", dirs): samples = qcsummary.generate_parallel(samples, run_parallel) logger.info("Timing: finished") return samples
def _get_variant_file(x, key): """Retrieve VCF file with the given key if it exists, handling bgzipped. """ out = [] fname = utils.get_in(x, key) upload_key = list(key) upload_key[-1] = "do_upload" do_upload = tz.get_in(tuple(upload_key), x, True) if fname and do_upload: if fname.endswith(".vcf.gz"): out.append({ "path": fname, "type": "vcf.gz", "ext": x["variantcaller"], "variantcaller": x["variantcaller"] }) if utils.file_exists(fname + ".tbi"): out.append({ "path": fname + ".tbi", "type": "vcf.gz.tbi", "index": True, "ext": x["variantcaller"], "variantcaller": x["variantcaller"] }) elif fname.endswith((".vcf", ".bed", ".bedpe", ".bedgraph", ".cnr", ".cns", ".cnn", ".txt")): ftype = utils.splitext_plus(fname)[-1][1:] if ftype == "txt": ftype = fname.split("-")[-1] out.append({ "path": fname, "type": ftype, "ext": x["variantcaller"], "variantcaller": x["variantcaller"] }) return out
def run(items): """Perform detection of structural variations with lumpy, using bwa-mem alignment. """ if not all(utils.get_in(data, ("config", "algorithm", "aligner")) == "bwa" for data in items): raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection") work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "lumpy")) full_bams, sr_bams, disc_bams = [], [], [] for data in items: dedup_bam, sr_bam, disc_bam = _find_existing_inputs(data["align_bam"]) if not dedup_bam: dedup_bam, sr_bam, disc_bam = _extract_split_and_discordants(data["align_bam"], work_dir, data) full_bams.append(dedup_bam) sr_bams.append(sr_bam) disc_bams.append(disc_bam) pebed_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items) out = [] sample_config_file = _write_samples_to_ids(pebed_file, items) lumpy_vcf = _bedpe_to_vcf(pebed_file, sample_config_file, items) for i, data in enumerate(items): if "sv" not in data: data["sv"] = [] sample = tz.get_in(["rgnames", "sample"], data) sample_bedpe = _filter_by_support(_subset_to_sample(pebed_file, i, data), i) if lumpy_vcf: sample_vcf = utils.append_stem(lumpy_vcf, "-%s" % sample) sample_vcf = _filter_by_bedpe(vcfutils.select_sample(lumpy_vcf, sample, sample_vcf, data["config"]), sample_bedpe, data) else: sample_vcf = None data["sv"].append({"variantcaller": "lumpy", "vrn_file": sample_vcf, "bedpe_file": sample_bedpe, "sample_bed": sample_config_file}) out.append(data) return out
def _af_annotate_and_filter(paired, items, in_file, out_file): """Populating FORMAT/AF, and dropping variants with AF<min_allele_fraction Strelka2 doesn't report exact AF for a variant, however it can be calculated as alt_counts/dp from existing fields: somatic snps: GT:DP:FDP:SDP:SUBDP:AU:CU:GU:TU dp=DP {ALT}U[0] = alt_counts(tier1,tier2) indels: GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50 dp=DP TIR = alt_counts(tier1,tier2) germline snps: GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL(:PS) dp=sum(alt_counts) AD = ref_count,alt_counts indels: GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL(:PS) dp=sum(alt_counts) AD = ref_count,alt_counts """ data = paired.tumor_data if paired else items[0] min_freq = float( utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0 logger.debug( "Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq) ungz_out_file = "%s.vcf" % utils.splitext_plus(out_file)[0] if not utils.file_exists(ungz_out_file) and not utils.file_exists( ungz_out_file + ".gz"): with file_transaction(data, ungz_out_file) as tx_out_file: vcf = cyvcf2.VCF(in_file) vcf.add_format_to_header({ 'ID': 'AF', 'Description': 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), ' 'TIR/DPI (somatic indels)', 'Type': 'Float', 'Number': '.' }) vcf.add_filter_to_header({ 'ID': 'MinAF', 'Description': 'Allele frequency is lower than %s%% ' % (min_freq * 100) + ('(configured in bcbio as min_allele_fraction)' if utils.get_in(data["config"], ("algorithm", "min_allele_fraction")) else '(default threshold in bcbio; override with min_allele_fraction in the algorithm section)' ) }) w = cyvcf2.Writer(tx_out_file, vcf) tumor_index = vcf.samples.index(data['description']) for rec in vcf: if paired: # somatic? if rec.is_snp: # snps? alt_counts = rec.format( rec.ALT[0] + 'U')[:, 0] # {ALT}U=tier1_depth,tier2_depth else: # indels alt_counts = rec.format( 'TIR')[:, 0] # TIR=tier1_depth,tier2_depth dp = rec.format('DP')[:, 0] elif rec.format("AD") is not None: # germline? alt_counts = rec.format('AD')[:, 1:] # AD=REF,ALT1,ALT2,... dp = np.sum(rec.format('AD')[:, 0:], axis=1)[:, None] else: # germline gVCF record alt_counts, dp = (None, None) if dp is not None: with np.errstate( divide='ignore', invalid='ignore' ): # ignore division by zero and put AF=.0 af = np.true_divide(alt_counts, dp) af[~np.isfinite(af)] = .0 # -inf inf NaN -> .0 rec.set_format('AF', af) if paired and np.all(af[tumor_index] < min_freq): vcfutils.cyvcf_add_filter(rec, 'MinAF') w.write_record(rec) w.close() return vcfutils.bgzip_and_index(ungz_out_file, data["config"])
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts, var2vcf_opts = _vardict_options_from_config( items, config, out_file, target) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith( "gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl( ref_file, tx_out_file) cmd = ( "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} " "| {contig_cl} | bcftools filter -i 'QUAL >= 0' " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file
def _get_sort_order(in_bam, config): with open_samfile(in_bam) as bam_handle: header = bam_handle.header return utils.get_in(header, ("HD", "SO"), None)
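# Minimal illustration of the sort-order lookup above, assuming a recent pysam
# where header.to_dict() is available: the BAM @HD header line carries SO
# (sort order), surfacing as header["HD"]["SO"] with values such as
# "coordinate", "queryname" or "unsorted". The BAM path is hypothetical.
import pysam

def sort_order(in_bam):
    with pysam.AlignmentFile(in_bam, "rb") as bam_handle:
        return bam_handle.header.to_dict().get("HD", {}).get("SO")

# sort_order("sample-sorted.bam") -> "coordinate" for a coordinate-sorted BAM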
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions( items[0]), region, out_file, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf( tx_out_file, config, samples=[ x for x in [paired.tumor_name, paired.normal_name] if x ]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) strandbias = "testsomatic.R" var2vcf = "var2vcf_paired.pl" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts = " ".join( _vardict_options_from_config(items, config, out_file, target)) coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if highdepth.get_median_coverage( items[0]) > 5000 else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(( "config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ( "| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'" % os.path.join(os.path.dirname(sys.executable), "py")) freq_filter = ( "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), 0, dd.get_aligner(paired.tumor_data))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname( utils.Rscript_cmd()) cmd = ( "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| {strandbias} " "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "{freq_filter} " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
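# Small sketch of the tools_off check that toggles the somatic filter chain in
# _run_vardict_paired above: if any sample in the batch lists
# "vardict_somatic_filter" under algorithm: tools_off, the sed/bcftools somatic
# filtering is left out. `items` is a stand-in list of bcbio sample dictionaries.
import toolz as tz

def somatic_filter_enabled(items):
    return not any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                   for data in items)

items = [{"config": {"algorithm": {"tools_off": ["vardict_somatic_filter"]}}}]
print(somatic_filter_enabled(items))  # False -> filtering chain is skipped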
def _is_trim_set(samples): for sample in dd.sample_data_iterator(samples): return utils.get_in(sample, ["algorithm", "trim_reads"]) return None
def _get_strandedness(config): return get_in(config, ("algorithm", "strandedness"), "unstranded").lower()
def align(fastq_file, pair_file, ref_file, names, align_dir, data): if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] star_dirs = _get_star_dirnames(align_dir, data, names) if file_exists(star_dirs.final_out): data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) fastq_files = " ".join([fastq_file, pair_file ]) if pair_file else fastq_file num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) if ref_file.endswith("chrLength"): ref_file = os.path.dirname(ref_file) with file_transaction(data, align_dir) as tx_align_dir: tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names) tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames safe_makedir(tx_align_dir) safe_makedir(tx_out_dir) cmd = ( "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd SAM {srna_opts} " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else "" cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else "" cmd += _read_group_option(names) fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False) if fusion_mode: cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 " "--chimOutType WithinSAM ") strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data
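# Sketch of two small input normalizations used in the STAR align() above: the
# genome index pointer may reference the chrLength file inside the STAR
# directory (STAR itself wants the directory), and gzipped FASTQs get
# "--readFilesCommand zcat" so STAR can decompress on the fly. The endswith(".gz")
# test stands in for the is_gzipped helper; the path is hypothetical.
import os

def star_genome_dir(ref_file):
    return os.path.dirname(ref_file) if ref_file.endswith("chrLength") else ref_file

def read_files_command(fastq_file):
    return " --readFilesCommand zcat " if fastq_file.endswith(".gz") else ""

# "/genomes/GRCh37/star/chrLength" -> "/genomes/GRCh37/star"
print(star_genome_dir("/genomes/GRCh37/star/chrLength"))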
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file): """Run a paired VarScan analysis, also known as "somatic". """ max_read_depth = "1000" config = items[0]["config"] paired = get_paired_bams(align_bams, items) if not paired.normal_bam: affected_batch = items[0]["metadata"]["batch"] message = ("Batch {} requires both tumor and normal BAM files for" " VarScan cancer calling").format(affected_batch) raise ValueError(message) if not utils.file_exists(out_file): assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan" normal_mpileup_cl = samtools.prep_mpileup([paired.normal_bam], ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) tumor_mpileup_cl = samtools.prep_mpileup([paired.tumor_bam], ref_file, config, max_read_depth, target_regions=target_regions, want_bcf=False) base, ext = utils.splitext_plus(out_file) indel_file = base + "-indel.vcf" snp_file = base + "-snp.vcf" with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp): with tx_tmpdir(items[0]) as tmp_dir: jvm_opts = _get_varscan_opts(config, tmp_dir) remove_zerocoverage = r"ifne grep -v -P '\t0\t\t$'" varscan_cmd = ("varscan {jvm_opts} somatic " " <({normal_mpileup_cl} | {remove_zerocoverage}) " "<({tumor_mpileup_cl} | {remove_zerocoverage}) " "--output-snp {tx_snp} --output-indel {tx_indel} " " --output-vcf --min-coverage 5 --p-value 0.98 " "--strand-filter 1 ") # add minimum AF if "--min-var-freq" not in varscan_cmd: min_af = float(utils.get_in(paired.tumor_config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 varscan_cmd += "--min-var-freq {min_af} " do.run(varscan_cmd.format(**locals()), "Varscan", None, None) to_combine = [] for fname in [snp_file, indel_file]: if utils.file_exists(fname): fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(fname)[0]) with file_transaction(config, fix_file) as tx_fix_file: fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) py_cl = os.path.join(os.path.dirname(sys.executable), "py") normal_name = paired.normal_name tumor_name = paired.tumor_name cmd = ("cat {fname} | " "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x," """ "{normal_name}", "{tumor_name}")' | """ "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | " """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """ "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | " "bgzip -c > {tx_fix_file}") do.run(cmd.format(**locals()), "Varscan paired fix") to_combine.append(fix_file) if not to_combine: out_file = write_empty_vcf(out_file, config) else: out_file = combine_variant_files(to_combine, out_file, ref_file, config, region=target_regions) if os.path.getsize(out_file) == 0: write_empty_vcf(out_file) if out_file.endswith(".gz"): out_file = bgzip_and_index(out_file, config)
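# Sketch of the double-extension handling used in _varscan_paired above: the
# VarScan outputs are derived from a bgzipped name like "batch-paired.vcf.gz",
# so a plain os.path.splitext would leave ".vcf" attached to the base.
# splitext_plus_like is a hypothetical stand-in for utils.splitext_plus, which
# strips compound extensions in one step.
import os

def splitext_plus_like(fname):
    base, ext = os.path.splitext(fname)
    if ext in (".gz", ".bz2"):
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

base, ext = splitext_plus_like("batch-paired.vcf.gz")
print(base + "-snp.vcf", base + "-indel.vcf")  # batch-paired-snp.vcf batch-paired-indel.vcf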
def align(fastq_file, pair_file, ref_file, names, align_dir, data): if not ref_file: logger.error( "STAR index not found. We don't provide the STAR indexes " "by default because they are very large. You can install " "the index for your genome with: bcbio_nextgen.py upgrade " "--aligners star --genomes genome-build-name --data") sys.exit(1) max_hits = 10 srna = True if data["analysis"].lower().startswith( "smallrna-seq") else False srna_opts = "" if srna: max_hits = 1000 srna_opts = "--alignIntronMax 1" config = data["config"] star_dirs = _get_star_dirnames(align_dir, data, names) if file_exists(star_dirs.final_out): data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data star_path = config_utils.get_program("STAR", config) def _unpack_fastq(f): """Use process substitution instead of readFilesCommand for gzipped inputs. Prevents issues on shared filesystems that don't support FIFO: https://github.com/alexdobin/STAR/issues/143 """ if f and is_gzipped(f): return "<(gunzip -c %s)" % f else: return f fastq_files = (" ".join([ _unpack_fastq(fastq_file), _unpack_fastq(pair_file) ]) if pair_file else _unpack_fastq(fastq_file)) num_cores = dd.get_num_cores(data) gtf_file = dd.get_gtf_file(data) if ref_file.endswith("chrLength"): ref_file = os.path.dirname(ref_file) with file_transaction(data, align_dir) as tx_align_dir: tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names) tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames safe_makedir(tx_align_dir) safe_makedir(tx_out_dir) cmd = ( "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} " "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} " "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} " "--outStd BAM_Unsorted {srna_opts} " "--limitOutSJcollapsed 2000000 " "--outSAMtype BAM Unsorted " "--outSAMmapqUnique 60 " "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS)) cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else "" cmd += _read_group_option(names) if dd.get_fusion_caller(data): cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 " "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 " "--chimScoreSeparation 5 ") if "oncofuse" in dd.get_fusion_caller(data): cmd += "--chimOutType Junctions " else: cmd += "--chimOutType WithinBAM " strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"), "unstranded").lower() if strandedness == "unstranded" and not srna: cmd += " --outSAMstrandField intronMotif " if not srna: cmd += " --quantMode TranscriptomeSAM " resources = config_utils.get_resources("star", data["config"]) if resources.get("options", []): cmd += " " + " ".join( [str(x) for x in resources.get("options", [])]) cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out) cmd += " > {tx_final_out} " run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file) do.run(cmd.format(**locals()), run_message, None) data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data) return data
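# Usage illustration for the _unpack_fastq helper above: instead of the
# "--readFilesCommand zcat" approach in the earlier align() version, gzipped
# FASTQs are wrapped in bash process substitution, avoiding FIFO problems on
# some shared filesystems (see the STAR issue linked in the docstring). The
# endswith(".gz") test stands in for is_gzipped; file names are hypothetical.
def unpack_fastq(f):
    return "<(gunzip -c %s)" % f if f and f.endswith(".gz") else f

fq1, fq2 = "sample_R1.fastq.gz", "sample_R2.fastq.gz"
read_files = " ".join([unpack_fastq(fq1), unpack_fastq(fq2)])
# --readFilesIn <(gunzip -c sample_R1.fastq.gz) <(gunzip -c sample_R2.fastq.gz)
print("--readFilesIn " + read_files)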
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in itertools.izip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts = (" ".join( _vardict_options_from_config(items, config, out_file, target)) if _is_bed_file(target) else "") vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 coverage_interval = utils.get_in( config, ("algorithm", "coverage_interval"), "exome") # for deep targeted panels, require 50 worth of coverage var2vcf_opts = " -v 50 " if highdepth.get_median_coverage( items[0]) > 5000 else "" fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() jvm_opts = _get_jvm_opts(items[0], tx_out_file) r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname( utils.Rscript_cmd()) cmd = ( "{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config) if assoc_files.get("dbsnp") else out_file) return out_file
def _set_fusion_mode(options, config): fusion_mode = get_in(config, ("algorithm", "fusion_mode"), False) if fusion_mode: options["fusion-search"] = True return options
def mutect2_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Call variation with GATK's MuTect2. This requires the full non open-source version of GATK 3.5+. """ if out_file is None: out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0] if not utils.file_exists(out_file): paired = vcfutils.get_paired_bams(align_bams, items) broad_runner = broad.runner_from_config(items[0]["config"]) gatk_type = broad_runner.gatk_type() f1r2_file = None _prep_inputs(align_bams, ref_file, items) with file_transaction(items[0], out_file) as tx_out_file: params = [ "-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2", "--annotation", "ClippingRankSumTest", "--annotation", "DepthPerSampleHC" ] if gatk_type == "gatk4": params += ["--reference", ref_file] else: params += ["-R", ref_file] for a in annotation.get_gatk_annotations( items[0]["config"], include_baseqranksum=False): params += ["--annotation", a] # Avoid issues with BAM CIGAR reads that GATK doesn't like if gatk_type == "gatk4": params += ["--read-validation-stringency", "LENIENT"] params += _add_tumor_params(paired, items, gatk_type) params += _add_region_params(region, out_file, items, gatk_type) if all(is_paired(bam) for bam in align_bams) and ("mutect2_readmodel" in utils.get_in( items[0], ("config", "algorithm", "tools_on"), [])): orientation_filter = True else: orientation_filter = False if gatk_type == "gatk4" and orientation_filter: f1r2_file = "{}-f1r2.tar.gz".format( utils.splitext_plus(out_file)[0]) params += ["--f1r2-tar-gz", f1r2_file] # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm # Not yet clear how this helps or hurts in a general case. #params += _add_assoc_params(assoc_files) resources = config_utils.get_resources("mutect2", items[0]["config"]) if "options" in resources: params += [str(x) for x in resources.get("options", [])] assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \ "Require full version of GATK 3.5+ for mutect2 calling" broad_runner.new_resources("mutect2") gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file)) if gatk_type == "gatk4": tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus( out_file) tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus( tx_out_file) if orientation_filter: tx_f1r2_file = "{}-read-orientation-model.tar.gz" tx_f1r2_file = tx_f1r2_file.format( utils.splitext_plus(f1r2_file)[0]) tx_read_orient_cmd = _mutect2_read_filter( broad_runner, f1r2_file, tx_f1r2_file) filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file, tx_f1r2_file) else: filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file) if orientation_filter: cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {tx_read_orient_cmd} && {filter_cmd}" else: cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}" else: tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file) cmd = "{gatk_cmd} > {tx_raw_file}" do.run(cmd.format(**locals()), "MuTect2") out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file) return vcfutils.bgzip_and_index(out_file, items[0]["config"])
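# Rough outline of the GATK4 branch above, assuming standard GATK4 tool names;
# the real commands come from the _mutect2_read_filter and _mutect2_filter
# helpers, so this is only a sketch of the chained "call && learn && filter"
# flow for the read-orientation (f1r2) artifact model. File names are hypothetical.
raw_vcf = "tumor-raw.vcf.gz"
f1r2 = "tumor-f1r2.tar.gz"
priors = "tumor-read-orientation-model.tar.gz"
filtered_vcf = "tumor-raw-filt.vcf.gz"
steps = [
    "gatk Mutect2 ... --f1r2-tar-gz %s -O %s" % (f1r2, raw_vcf),
    "gatk LearnReadOrientationModel -I %s -O %s" % (f1r2, priors),
    "gatk FilterMutectCalls -V %s --orientation-bias-artifact-priors %s -O %s" % (raw_vcf, priors, filtered_vcf),
]
cmd = " && ".join(steps)  # mirrors the "{gatk_cmd} && {tx_read_orient_cmd} && {filter_cmd}" chain
print(cmd)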
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data, names=None): """ run alignment using Tophat v2 """ config = data["config"] options = get_in(config, ("resources", "tophat", "options"), {}) options = _set_fusion_mode(options, config) options = _set_quality_flag(options, config) options = _set_transcriptome_option(options, data, ref_file) options = _set_cores(options, config) options = _set_rg_options(options, names) options = _set_stranded_flag(options, config) ref_file, runner = _determine_aligner_and_reference(ref_file, config) # fusion search does not work properly with Bowtie2 if options.get("fusion-search", False): ref_file = ref_file.replace("/bowtie2", "/bowtie") if _tophat_major_version(config) == 1: raise NotImplementedError( "Tophat versions < 2.0 are not supported, please " "download the newest version of Tophat here: " "http://tophat.cbcb.umd.edu") if _ref_version(ref_file) == 1 or options.get("fusion-search", False): options["bowtie1"] = True out_dir = os.path.join(align_dir, "%s_tophat" % out_base) final_out = os.path.join(out_dir, "%s.sam" % out_base) if file_exists(final_out): return final_out out_file = os.path.join(out_dir, "accepted_hits.sam") unmapped = os.path.join(out_dir, "unmapped.bam") files = [ref_file, fastq_file] if not file_exists(out_file): with file_transaction(out_dir) as tx_out_dir: safe_makedir(tx_out_dir) if pair_file and not options.get("mate-inner-dist", None): d, d_stdev = _estimate_paired_innerdist( fastq_file, pair_file, ref_file, out_base, tx_out_dir, data) options["mate-inner-dist"] = d options["mate-std-dev"] = d_stdev files.append(pair_file) options["output-dir"] = tx_out_dir options["no-convert-bam"] = True options["no-coverage-search"] = True options["no-mixed"] = True tophat_runner = sh.Command( config_utils.get_program("tophat", config)) ready_options = {} for k, v in options.iteritems(): ready_options[k.replace("-", "_")] = v # tophat requires options before arguments, # otherwise it silently ignores them tophat_ready = tophat_runner.bake(**ready_options) cmd = str(tophat_ready.bake(*files)) do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file), None) _fix_empty_readnames(out_file) if pair_file and _has_alignments(out_file): fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.sam" % out_base), ref_file, config) else: fixed = out_file fixed = merge_unmapped(fixed, unmapped, config) fixed = _fix_unmapped(fixed, config, names) fixed = bam.sort(fixed, config) fixed = bam.bam_to_sam(fixed, config) if not file_exists(final_out): symlink_plus(fixed, final_out) return final_out
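# Sketch of the sh.Command option handling used in tophat_align above: Python
# keyword names cannot contain dashes, so options such as "no-coverage-search"
# are stored with underscores and the sh library converts them back to
# "--no-coverage-search". Options are baked before positional arguments because
# tophat silently ignores options that come after them. Assumes tophat is on PATH.
import sh

tophat = sh.Command("tophat")
ready_options = {"no_coverage_search": True, "num_threads": 8, "output_dir": "tophat_out"}
baked = tophat.bake(**ready_options).bake("bowtie_index", "sample_1.fastq", "sample_2.fastq")
print(str(baked))  # e.g. tophat --no-coverage-search --num-threads 8 --output-dir tophat_out bowtie_index ...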
def _get_output_dir(align_file, data, sample_dir=True): config = data["config"] name = data["rgnames"]["sample"] if sample_dir else "" return os.path.join(get_in(data, ("dirs", "work")), "cufflinks", name)
def _get_sv_exclude_file(items): """Retrieve SV file of regions to exclude. """ sv_bed = utils.get_in(items[0], ("genome_resources", "variation", "sv_repeat")) if sv_bed and os.path.exists(sv_bed): return sv_bed
def get_aligner(x, config): return utils.get_in(config, ("algorithm", "aligner"), "")
def run(items): """Perform detection of structural variations with lumpy, using bwa-mem alignment. """ if not all( utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", False, None] for data in items): raise ValueError( "Require bwa-mem alignment input for lumpy structural variation detection" ) paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items) work_dir = _sv_workdir( paired.tumor_data if paired and paired.tumor_data else items[0]) previous_evidence = {} full_bams, sr_bams, disc_bams = [], [], [] for data in items: sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir) full_bams.append(dd.get_align_bam(data)) sr_bams.append(sr_bam) disc_bams.append(disc_bam) cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir) previous_evidence[dd.get_sample_name(data)] = {} if cur_dels and utils.file_exists(cur_dels): previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels if cur_dups and utils.file_exists(cur_dups): previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items) gt_vcfs = {} for data in items: sample = dd.get_sample_name(data) sr_bam, _ = sshared.get_split_discordants(data, work_dir) sample_vcf = vcfutils.select_sample( lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample), data["config"]) if "bnd-genotype" in dd.get_tools_on(data): gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), sr_bam, exclude_file, data) else: std_vcf, bnd_vcf = _split_breakends(sample_vcf, data) std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), sr_bam, exclude_file, data) gt_vcf = vcfutils.concat_variant_files_bcftools( orig_files=[std_gt_vcf, bnd_vcf], out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0], config=data["config"]) gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data) if paired and paired.normal_name: gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data) out = [] for data in items: if "sv" not in data: data["sv"] = [] vcf_file = gt_vcfs[dd.get_sample_name(data)] if dd.get_svprioritize(data): effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff") else: effects_vcf = None data["sv"].append({ "variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file, "exclude_file": exclude_file }) out.append(data) return out
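# Minimal sketch of the breakend handling in the lumpy run() above, assuming
# cyvcf2 (already used elsewhere in these modules): standard SV records are
# genotyped with svtyper while BND records are set aside, so the per-sample VCF
# is split on INFO/SVTYPE before genotyping and re-concatenation. split_breakends
# here is a simplified stand-in for _split_breakends; file names are hypothetical.
import cyvcf2

def split_breakends(in_vcf, std_out, bnd_out):
    vcf = cyvcf2.VCF(in_vcf)
    std_w = cyvcf2.Writer(std_out, vcf)
    bnd_w = cyvcf2.Writer(bnd_out, vcf)
    for rec in vcf:
        (bnd_w if rec.INFO.get("SVTYPE") == "BND" else std_w).write_record(rec)
    std_w.close()
    bnd_w.close()

# split_breakends("sample-lumpy.vcf.gz", "sample-std.vcf", "sample-bnd.vcf")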