def process_alignment(data, alt_input=None): """Do an alignment of fastq files, preparing a sorted BAM output file. """ fastq1, fastq2 = dd.get_input_sequence_files(data) if alt_input: fastq1, fastq2 = alt_input config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) data = _add_supplemental_bams(data) elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError( "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], data) elif sort_method: runner = broad.runner_from_config(config) out_file = os.path.join( data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = link_bam_file( fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"])) bam.index(out_bam, data["config"]) bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) bam.index(dedup_bam, data["config"]) data["work_bam"] = dedup_bam elif fastq1 and objectstore.file_exists_or_remote( fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and "vrn_file" in data: data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None elif not fastq1: raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data)) else: raise ValueError( "Could not process input file from sample configuration. \n" + fastq1 + "\nIs the path to the file correct?\n" + "If it is a fastq file (not pre-aligned BAM or CRAM), " "is an aligner specified in the input configuration?") return [[data]]
def find_annotations(data, retriever=None): """Find annotation configuration files for vcfanno, using pre-installed inputs. Creates absolute paths for user specified inputs and finds locally installed defaults. Default annotations: - gemini for variant pipelines - somatic for variant tumor pipelines - rnaedit for RNA-seq variant calling """ conf_files = dd.get_vcfanno(data) if not isinstance(conf_files, (list, tuple)): conf_files = [conf_files] for c in _default_conf_files(data, retriever): if c not in conf_files: conf_files.append(c) conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic} out = [] annodir = os.path.normpath( os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")) if not retriever: annodir = os.path.abspath(annodir) for conf_file in conf_files: if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)): conffn = conf_file elif not retriever: conffn = os.path.join(annodir, conf_file + ".conf") else: conffn = conf_file + ".conf" luafn = "%s.lua" % utils.splitext_plus(conffn)[0] if retriever: conffn, luafn = [ (x if objectstore.is_remote(x) else None) for x in retriever.add_remotes([conffn, luafn], data["config"]) ] if not conffn: pass elif conf_file in conf_checkers and not conf_checkers[conf_file]( data, retriever): logger.warn( "Skipping vcfanno configuration: %s. Not all input files found." % conf_file) elif not objectstore.file_exists_or_remote(conffn): build = dd.get_genome_build(data) CONF_NOT_FOUND = ( "The vcfanno configuration {conffn} was not found for {build}, skipping." ) logger.warn(CONF_NOT_FOUND.format(**locals())) else: out.append(conffn) if luafn and objectstore.file_exists_or_remote(luafn): out.append(luafn) return out
def process_alignment(data, alt_input=None): """Do an alignment of fastq files, preparing a sorted BAM output file. """ data = utils.to_single_data(data) fastq1, fastq2 = dd.get_input_sequence_files(data) if alt_input: fastq1, fastq2 = alt_input config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) data = _add_supplemental_bams(data) elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data) elif bamclean == "fixrg": out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data) elif sort_method: runner = broad.runner_from_path("picard", config) out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"])) bam.index(out_bam, data["config"]) bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) bam.index(dedup_bam, data["config"]) data["work_bam"] = dedup_bam elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and "vrn_file" in data: data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None elif not fastq1: raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data)) else: raise ValueError("Could not process input file from sample configuration. \n" + fastq1 + "\nIs the path to the file correct or is empty?\n" + "If it is a fastq file (not pre-aligned BAM or CRAM), " "is an aligner specified in the input configuration?") if data.get("work_bam"): # Add stable 'align_bam' target to use for retrieving raw alignment data["align_bam"] = data["work_bam"] data = _add_hla_files(data) return [[data]]
def find_annotations(data, retriever=None): """Find annotation configuration files for vcfanno, using pre-installed inputs. Creates absolute paths for user specified inputs and finds locally installed defaults. Default annotations: - gemini for variant pipelines - somatic for variant tumor pipelines - rnaedit for RNA-seq variant calling """ conf_files = dd.get_vcfanno(data) if not isinstance(conf_files, (list, tuple)): conf_files = [conf_files] for c in _default_conf_files(data, retriever): if c not in conf_files: conf_files.append(c) conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic} out = [] annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")) if not retriever: annodir = os.path.abspath(annodir) for conf_file in conf_files: if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)): conffn = conf_file elif not retriever: conffn = os.path.join(annodir, conf_file + ".conf") else: conffn = conf_file + ".conf" luafn = "%s.lua" % utils.splitext_plus(conffn)[0] if retriever: conffn, luafn = [(x if objectstore.is_remote(x) else None) for x in retriever.add_remotes([conffn, luafn], data["config"])] if not conffn: pass elif conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever): logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file) elif not objectstore.file_exists_or_remote(conffn): build = dd.get_genome_build(data) CONF_NOT_FOUND = ( "The vcfanno configuration {conffn} was not found for {build}, skipping.") logger.warn(CONF_NOT_FOUND.format(**locals())) else: out.append(conffn) if luafn and objectstore.file_exists_or_remote(luafn): out.append(luafn) return out
def annotate_gemini(data, retriever=None): """Annotate with population calls if have data installed. """ r = dd.get_variation_resources(data) return all([ r.get(k) and objectstore.file_exists_or_remote(r[k]) for k in ["exac", "gnomad_exome"] ])
def _annotate_somatic(data, retriever=None): """Annotate somatic calls if we have cosmic data installed. """ if is_human(data): paired = vcfutils.get_paired([data]) if paired: r = dd.get_variation_resources(data) if r.get("cosmic") and objectstore.file_exists_or_remote(r["cosmic"]): return True return False
def _annotate_somatic(data, retriever=None): """Annotate somatic calls if we have cosmic data installed. """ if is_human(data): paired = vcfutils.get_paired([data]) if paired: r = dd.get_variation_resources(data) if r.get("cosmic") and objectstore.file_exists_or_remote(r["cosmic"]): return True return False
def process_alignment(data, alt_input=None): """Do an alignment of fastq files, preparing a sorted BAM output file. """ data = cwlutils.normalize_missing(utils.to_single_data(data)) data = cwlutils.unpack_tarballs(data, data) fastq1, fastq2 = dd.get_input_sequence_files(data) if alt_input: fastq1, fastq2 = alt_input config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) if dd.get_correct_umis(data): data["work_bam"] = postalign.correct_umis(data) if dd.get_umi_consensus(data): data["umi_bam"] = dd.get_work_bam(data) if fastq2: f1, f2, avg_cov = postalign.umi_consensus(data) data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov del data["config"]["algorithm"]["umi_type"] data["config"]["algorithm"]["mark_duplicates"] = False data = align_to_sort_bam(f1, f2, aligner, data) else: raise ValueError( "Single fastq input for UMI processing; fgbio needs paired reads: %s" % dd.get_sample_name(data)) data = _add_supplemental_bams(data) elif fastq1 and objectstore.file_exists_or_remote( fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError( "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) ref_file = dd.get_ref_file(data) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"], data) elif bamclean == "fixrg": out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data) elif bamclean == "remove_extracontigs": out_bam = cleanbam.remove_extracontigs(fastq1, data) elif sort_method: runner = broad.runner_from_path("picard", config) out_file = os.path.join( data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) if not utils.file_exists(out_file): work_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data))) out_file = os.path.join( work_dir, "{}-sort.bam".format(dd.get_sample_name(data))) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = _link_bam_file( fastq1, os.path.join(dd.get_work_dir(data), "prealign", dd.get_sample_name(data)), data) bam.index(out_bam, data["config"]) bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) bam.index(dedup_bam, data["config"]) data["work_bam"] = dedup_bam elif fastq1 and objectstore.file_exists_or_remote( fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and not dd.get_aligner(data): data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None elif not fastq1: raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data)) elif "kraken" in config["algorithm"]: # kraken doesn's need bam pass else: raise ValueError( "Could not process input file from sample configuration. \n" + fastq1 + "\nIs the path to the file correct or is empty?\n" + "If it is a fastq file (not pre-aligned BAM or CRAM), " "is an aligner specified in the input configuration?") if data.get("work_bam"): # Add stable 'align_bam' target to use for retrieving raw alignment data["align_bam"] = data["work_bam"] data = _add_hla_files(data) return [[data]]
def annotate_gemini(data, retriever=None): """Annotate with population calls if have data installed. """ r = dd.get_variation_resources(data) return all([r.get(k) and objectstore.file_exists_or_remote(r[k]) for k in ["exac", "gnomad_exome"]])