def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    When ``out_file`` does not exist yet, runs the ``picard_index`` step on the
    BAM, subsets the configured ``variant_regions`` to ``region`` and calls
    ``_run_cortex_on_region`` once per line of the resulting regions file. The
    per-region VCFs are then merged into ``out_file``. If the subset regions
    path is not a file, an empty VCF is written instead.

    Args:
        align_bam: Path to the aligned BAM file.
        ref_file: Reference file handed to the per-region caller and the merge.
        config: Configuration dictionary;
            ``config["algorithm"]["variant_regions"]`` must be set.
        dbsnp: Accepted for interface compatibility; not used in this function.
        region: Region passed to ``subset_variant_regions``.
        out_file: Output VCF path. Defaults to ``<align_bam stem>-cortex.vcf``.

    Returns:
        The path of the output VCF.

    Raises:
        ValueError: If no ``variant_regions`` are configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only regional variant calling with cortex_var is supported. Set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # Only the first three tab-separated columns of each line are
            # forwarded to the per-region caller.
            with open(target_regions) as in_handle:
                regional_vcfs = [
                    _run_cortex_on_region(line.strip().split("\t")[:3], align_bam,
                                          ref_file, out_file, config)
                    for line in in_handle
                ]
            # One VCF is produced per region line, so the list can get long.
            # Merge through the batching helper, which exists to avoid
            # problematic large command lines; for small inputs it performs
            # the same single combine_variant_files call as before.
            _combine_variants(regional_vcfs, out_file, ref_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Merge VCF files into ``out_file``, going through batches for long lists.

    With more than 500 inputs, the inputs are split with
    ``partition_all(500, in_vcfs)``; each batch is merged into
    ``<dir of out_file>/batch/<stem>-batch<N><ext>`` and those intermediate
    files are merged into ``out_file``. Shorter lists are merged directly.

    Args:
        in_vcfs: Paths of the VCF files to merge.
        out_file: Path of the merged VCF.
        ref_file: Reference file forwarded to ``combine_variant_files``.
        config: Configuration forwarded to ``combine_variant_files``.

    Raises:
        ValueError: If a batched input does not start with
            ``##fileformat=VCFv4``.
    """
    max_batch = 500
    if len(in_vcfs) > max_batch:
        # Pieces of the output name are the same for every batch.
        out_dir, out_name = os.path.split(out_file)
        stem, suffix = os.path.splitext(out_name)
        batched = []
        for batch_num, members in enumerate(partition_all(max_batch, in_vcfs)):
            batch_dir = safe_makedir(os.path.join(out_dir, "batch"))
            batch_out = os.path.join(batch_dir, "{0}-batch{1}{2}".format(stem, batch_num, suffix))
            # Check the header line of every input before merging the batch.
            for vcf in members:
                with open(vcf) as handle:
                    first_line = handle.readline()
                if not first_line.startswith("##fileformat=VCFv4"):
                    raise ValueError("Unexpected VCF file: %s" % vcf)
            combine_variant_files(members, batch_out, ref_file, config)
            batched.append(batch_out)
        in_vcfs = batched
    # A single level of batching is all that is supported.
    assert len(in_vcfs) <= max_batch
    combine_variant_files(in_vcfs, out_file, ref_file, config)
def _combine_variants(in_vcfs, out_file, ref_file, config):
    """Combine variant files, merging in batches when there are many inputs.

    Up to 500 inputs are combined in one call. Longer lists are split with
    ``partition_all``; every batch is combined into its own file inside a
    ``batch`` directory next to ``out_file`` and the batch outputs are then
    combined into ``out_file``.

    Args:
        in_vcfs: Paths of the VCF files to combine.
        out_file: Path of the final combined VCF.
        ref_file: Reference file passed on to ``combine_variant_files``.
        config: Configuration passed on to ``combine_variant_files``.

    Raises:
        ValueError: If the first line of a batched input does not start with
            ``##fileformat=VCFv4``.
    """
    max_batch = 500
    if len(in_vcfs) <= max_batch:
        to_merge = in_vcfs
    else:
        to_merge = []
        for index, chunk in enumerate(partition_all(max_batch, in_vcfs)):
            parent, filename = os.path.split(out_file)
            batch_dir = safe_makedir(os.path.join(parent, "batch"))
            root, extension = os.path.splitext(filename)
            chunk_out = os.path.join(batch_dir, "{0}-batch{1}{2}".format(root, index, extension))
            for candidate in chunk:
                # Refuse anything that does not announce itself as VCF v4.
                with open(candidate) as candidate_handle:
                    header = candidate_handle.readline()
                    if header.startswith("##fileformat=VCFv4"):
                        continue
                    raise ValueError("Unexpected VCF file: %s" % candidate)
            combine_variant_files(chunk, chunk_out, ref_file, config)
            to_merge.append(chunk_out)
    # Only one round of batching is performed.
    assert len(to_merge) <= max_batch
    combine_variant_files(to_merge, out_file, ref_file, config)
def run_cortex(align_bam, ref_file, config, dbsnp=None, region=None, out_file=None):
    """Top level entry to regional de-novo based variant calling with cortex_var.

    When ``out_file`` does not exist yet, runs the ``picard_index`` step on the
    BAM, subsets the configured ``variant_regions`` to ``region`` and calls
    ``_run_cortex_on_region`` once per line of the resulting regions file. The
    per-region VCFs are then merged into ``out_file``. If the subset regions
    path is not a file, an empty VCF is written instead.

    Args:
        align_bam: Path to the aligned BAM file.
        ref_file: Reference file handed to the per-region caller and the merge.
        config: Configuration dictionary;
            ``config["algorithm"]["variant_regions"]`` must be set.
        dbsnp: Accepted for interface compatibility; not used in this function.
        region: Region passed to ``subset_variant_regions``.
        out_file: Output VCF path. Defaults to ``<align_bam stem>-cortex.vcf``.

    Returns:
        The path of the output VCF.

    Raises:
        ValueError: If no ``variant_regions`` are configured.
    """
    broad_runner = broad.runner_from_config(config)
    if out_file is None:
        out_file = "%s-cortex.vcf" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        broad_runner.run_fn("picard_index", align_bam)
        variant_regions = config["algorithm"].get("variant_regions", None)
        if not variant_regions:
            raise ValueError("Only regional variant calling with cortex_var is supported. Set variant_regions")
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if os.path.isfile(target_regions):
            # Only the first three tab-separated columns of each line are
            # forwarded to the per-region caller.
            with open(target_regions) as in_handle:
                regional_vcfs = [
                    _run_cortex_on_region(line.strip().split("\t")[:3], align_bam,
                                          ref_file, out_file, config)
                    for line in in_handle
                ]
            # One VCF is produced per region line, so the list can get long.
            # Merge through the batching helper, which exists to avoid
            # problematic large command lines; for small inputs it performs
            # the same single combine_variant_files call as before.
            _combine_variants(regional_vcfs, out_file, ref_file, config)
        else:
            write_empty_vcf(out_file)
    return out_file
def prep_gemini_db(fnames, call_id, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database is written to ``<work dir>/gemini/<call_id joined by '-'>.db``
    and is only built when that file does not exist yet. Several input VCFs
    are first combined into one VCF next to the database; a single input is
    loaded as is.

    Args:
        fnames: List of input VCF paths; must contain at least one entry.
        call_id: Sequence of strings identifying the call set; joined with
            ``-`` to name the database.
        data: Dictionary providing ``data["dirs"]["work"]``,
            ``data["sam_ref"]`` and ``data["config"]``.

    Returns:
        ``[[call_id, gemini_db]]`` where ``gemini_db`` is the database path.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    if not utils.file_exists(gemini_db):
        if len(fnames) > 1:
            gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
            gemini_vcf = genotype.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                                        data["config"])
        else:
            gemini_vcf = fnames[0]
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            # Run the load exactly once. The previous code followed do.run with
            # subprocess.check_call on the same command, which loaded the VCF
            # into the transactional database a second time.
            # NOTE(review): assumes do.run executes cmd (and does not merely
            # log it) -- confirm against the do module.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, gemini_db]]
def prep_gemini_db(fnames, call_id, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database is written to ``<work dir>/gemini/<call_id joined by '-'>.db``
    and is only built when that file does not exist yet. Several input VCFs
    are first combined into one VCF next to the database; a single input is
    loaded as is.

    Args:
        fnames: List of input VCF paths; must contain at least one entry.
        call_id: Sequence of strings identifying the call set; joined with
            ``-`` to name the database.
        data: Dictionary providing ``data["dirs"]["work"]``,
            ``data["sam_ref"]`` and ``data["config"]``.

    Returns:
        ``[[call_id, gemini_db]]`` where ``gemini_db`` is the database path.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    if not utils.file_exists(gemini_db):
        if len(fnames) > 1:
            gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
            gemini_vcf = genotype.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                                        data["config"])
        else:
            gemini_vcf = fnames[0]
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            # Run the load exactly once. The previous code followed do.run with
            # subprocess.check_call on the same command, which loaded the VCF
            # into the transactional database a second time.
            # NOTE(review): assumes do.run executes cmd (and does not merely
            # log it) -- confirm against the do module.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, gemini_db]]
def combine_variant_files(*args):
    """Forward all positional arguments to ``genotype.combine_variant_files``.

    Returns:
        Whatever ``genotype.combine_variant_files`` returns for ``args``.
    """
    combined = genotype.combine_variant_files(*args)
    return combined
def main(config_file):
    """Run the fix / combine / VEP / gemini loading pipeline from a YAML config.

    Stages, in order: find ``Variations`` directories under
    ``config["dir"]["data"]``, run the ``illumina_fixer`` stage on their parent
    directories, combine the results into ``all_combined.vcf``, break that up
    by chromosome with the ``breakvcf`` stage, run the ``vep`` stage, sort the
    files that exist, combine them into ``all_combined.vep.vcf`` and pass the
    result to the ``geminiloader`` stage. The cluster is started before the
    first stage and stopped after sorting, also when a stage fails.

    Args:
        config_file: Path to the YAML configuration file.
    """
    # load yaml config file
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and PyYAML >= 6 requires the Loader argument.
    # Consider yaml.safe_load if the config only holds plain data.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger

    def sort_vcf(in_file):
        """Sort ``in_file`` with vcf_sort, reusing an existing sorted file."""
        # Everything this function needs is imported locally; presumably so it
        # stays self-contained when view.map runs it remotely -- TODO confirm.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh
        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    def combine_into(in_files, out_name):
        """Combine ``in_files`` into the named results file unless it exists."""
        out_file = os.path.join(config["dir"].get("results", "results"),
                                "geminiloader", out_name)
        logger.info("Combining files %s into %s." % (in_files, out_file))
        if file_exists(out_file):
            return [out_file]
        return [genotype.combine_variant_files(in_files, out_file,
                                               config["ref"]["fasta"], config)]

    # start cluster
    start_cluster(config)
    try:
        from bipy.cluster import view
        found = sh.find(config["dir"]["data"], "-name", "Variations")
        var_dirs = [str(x).strip() for x in found]
        logger.info("Var_dirs: %s" % (var_dirs))
        # A real list (not a lazy map object) so that it logs its contents
        # and can be reused on Python 3 as well.
        in_dirs = [os.path.dirname(x) for x in var_dirs]
        logger.info("in_dirs: %s" % (in_dirs))
        curr_files = in_dirs

        # run the illumina fixer
        logger.info("Running illumina fixer on %s." % (curr_files))
        illf_class = STAGE_LOOKUP.get("illumina_fixer")
        illf = illf_class(config)
        curr_files = view.map(illf, curr_files)

        # combine
        curr_files = combine_into(curr_files, "all_combined.vcf")

        # break the VCF files up by chromosome for speed
        logger.info("Breaking up %s by chromosome." % (curr_files))
        breakvcf_class = STAGE_LOOKUP.get("breakvcf")
        breakvcf = breakvcf_class(config)
        curr_files = view.map(breakvcf, curr_files)

        # run VEP on the separate files in parallel
        logger.info("Running VEP on %s." % (curr_files))
        vep_class = STAGE_LOOKUP.get("vep")
        vep = vep_class(config)
        curr_files = view.map(vep, list(flatten(curr_files)))
        curr_files = [x for x in curr_files if file_exists(x)]

        # sort the vcf files
        logger.info("Sorting %s." % (curr_files))
        curr_files = view.map(sort_vcf, curr_files)
    finally:
        # don't run the rest of this in parallel, so take the cluster down;
        # doing it in a finally block also stops it when a stage fails
        stop_cluster()

    curr_files = combine_into(curr_files, "all_combined.vep.vcf")

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # Explicit loop: a bare map() is lazy on Python 3 and would never call
    # the loader.
    for vcf_file in curr_files:
        geminiloader(vcf_file)
    logger.info("Run complete.")
def main(config_file):
    """Run the fix / combine / VEP / gemini loading pipeline from a YAML config.

    Stages, in order: find ``Variations`` directories under
    ``config["dir"]["data"]``, run the ``illumina_fixer`` stage on their parent
    directories, combine the results into ``all_combined.vcf``, break that up
    by chromosome with the ``breakvcf`` stage, run the ``vep`` stage, sort the
    files that exist, combine them into ``all_combined.vep.vcf`` and pass the
    result to the ``geminiloader`` stage. The cluster is started before the
    first stage and stopped after sorting, also when a stage fails.

    Args:
        config_file: Path to the YAML configuration file.
    """
    # load yaml config file
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects, and PyYAML >= 6 requires the Loader argument.
    # Consider yaml.safe_load if the config only holds plain data.
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger

    def sort_vcf(in_file):
        """Sort ``in_file`` with vcf_sort, reusing an existing sorted file."""
        # Everything this function needs is imported locally; presumably so it
        # stays self-contained when view.map runs it remotely -- TODO confirm.
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh
        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    def combine_into(in_files, out_name):
        """Combine ``in_files`` into the named results file unless it exists."""
        out_file = os.path.join(config["dir"].get("results", "results"),
                                "geminiloader", out_name)
        logger.info("Combining files %s into %s." % (in_files, out_file))
        if file_exists(out_file):
            return [out_file]
        return [genotype.combine_variant_files(in_files, out_file,
                                               config["ref"]["fasta"], config)]

    # start cluster
    start_cluster(config)
    try:
        from bipy.cluster import view
        found = sh.find(config["dir"]["data"], "-name", "Variations")
        var_dirs = [str(x).strip() for x in found]
        logger.info("Var_dirs: %s" % (var_dirs))
        # A real list (not a lazy map object) so that it logs its contents
        # and can be reused on Python 3 as well.
        in_dirs = [os.path.dirname(x) for x in var_dirs]
        logger.info("in_dirs: %s" % (in_dirs))
        curr_files = in_dirs

        # run the illumina fixer
        logger.info("Running illumina fixer on %s." % (curr_files))
        illf_class = STAGE_LOOKUP.get("illumina_fixer")
        illf = illf_class(config)
        curr_files = view.map(illf, curr_files)

        # combine
        curr_files = combine_into(curr_files, "all_combined.vcf")

        # break the VCF files up by chromosome for speed
        logger.info("Breaking up %s by chromosome." % (curr_files))
        breakvcf_class = STAGE_LOOKUP.get("breakvcf")
        breakvcf = breakvcf_class(config)
        curr_files = view.map(breakvcf, curr_files)

        # run VEP on the separate files in parallel
        logger.info("Running VEP on %s." % (curr_files))
        vep_class = STAGE_LOOKUP.get("vep")
        vep = vep_class(config)
        curr_files = view.map(vep, list(flatten(curr_files)))
        curr_files = [x for x in curr_files if file_exists(x)]

        # sort the vcf files
        logger.info("Sorting %s." % (curr_files))
        curr_files = view.map(sort_vcf, curr_files)
    finally:
        # don't run the rest of this in parallel, so take the cluster down;
        # doing it in a finally block also stops it when a stage fails
        stop_cluster()

    curr_files = combine_into(curr_files, "all_combined.vep.vcf")

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    # Explicit loop: a bare map() is lazy on Python 3 and would never call
    # the loader.
    for vcf_file in curr_files:
        geminiloader(vcf_file)
    logger.info("Run complete.")