def _make_dir(dir, label):
    if not os.path.exists(dir):
        if not options.dry_run:
            os.makedirs(dir)
            logger.info("Creating %s directory %s" % (label, dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
def _get_machine_info(parallel, sys_config, dirs, config):
    """Get machine resource information from the job scheduler via either the command line or the queue.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
            "torque": _torque_info,
            "sge": _sge_info
        }
        try:
            return sched_info_dict[parallel["scheduler"].lower()](parallel["queue"])
        except KeyError:
            logger.info("Resource query function not implemented for scheduler \"{0}\"; "
                        "submitting job to queue".format(parallel["scheduler"]))
        except:
            # If something goes wrong, just hit the queue
            logger.warn("Couldn't get machine information from resource query function for queue "
                        "'{0}' on scheduler \"{1}\"; "
                        "submitting job to queue".format(parallel["queue"], parallel["scheduler"]))
    from bcbio.distributed import prun
    with prun.start(parallel, [[sys_config]], config, dirs) as run_parallel:
        return run_parallel("machine_info", [[sys_config]])
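# A minimal, self-contained sketch of the "dictionary as switch statement"
# dispatch used in _get_machine_info: scheduler names map to query functions,
# and a KeyError falls through to a default. The scheduler functions and
# resource numbers here are hypothetical stand-ins, not the real bcbio
# implementations.
def _slurm_info_demo(queue):
    return [{"name": queue, "cores": 16, "memory": 64.0}]

def _sge_info_demo(queue):
    return [{"name": queue, "cores": 8, "memory": 32.0}]

SCHED_INFO_DEMO = {"slurm": _slurm_info_demo, "sge": _sge_info_demo}

def machine_info_for(scheduler, queue):
    try:
        return SCHED_INFO_DEMO[scheduler.lower()](queue)
    except KeyError:
        return None  # caller falls back to querying the queue itself

assert machine_info_for("SLURM", "core")[0]["cores"] == 16
assert machine_info_for("torque", "batch") is None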
def run_vcfanno(vcf, anno_type, data):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key in the
    algorithm section of the data dict, skipping if the files cannot be found.
    """
    UNSUPPORTED_TYPE_MESSAGE = (
        "{anno_type} is not a supported vcf annotation type with vcfanno. "
        "Supported types are {SUPPORTED_ANNOTATION_TYPES}")
    if anno_type not in SUPPORTED_ANNOTATION_TYPES:
        logger.warn(UNSUPPORTED_TYPE_MESSAGE.format(**locals()))
        return vcf
    build = dd.get_genome_build(data)
    annodir = os.path.dirname(dd.get_ref_file(data))
    annodir = os.path.abspath(os.path.join(annodir, os.pardir, "vcfanno"))
    annostem = os.path.join(annodir, build + "-")
    conffn = annostem + anno_type + ".conf"
    luafn = annostem + anno_type + ".lua"
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf
    base = os.path.splitext(vcf)[0]
    out_file = base + anno_type + "-annotated.vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    basepath = annodir
    out_file = vcfanno(vcf, out_file, conffn, data, basepath, luafn)
    return out_file
def calling(data):
    """Main function to parallelize peak calling."""
    method = dd.get_chip_method(data)
    caller_fn = get_callers()[data["peak_fn"]]
    if method == "chip":
        chip_bam = data.get("work_bam")
        input_bam = data.get("work_bam_input", None)
        name = dd.get_sample_name(data)
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
        out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
                              dd.get_chip_method(data), data["resources"], data)
        greylistdir = greylisting(data)
        data.update({"peaks_files": out_files})
        if greylistdir:
            data["greylist"] = greylistdir
    if method == "atac":
        fractions = list(ATACRanges.keys()) + ["full"]
        for fraction in fractions:
            MIN_READS_TO_CALL = 1000
            chip_bam = tz.get_in(("atac", "align", fraction), data)
            if not bam.has_nalignments(chip_bam, MIN_READS_TO_CALL, data):
                logger.warn(f"{chip_bam} has fewer than {MIN_READS_TO_CALL} reads; "
                            f"peak calling would fail, so skipping this fraction.")
                continue
            logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
            name = dd.get_sample_name(data) + f"-{fraction}"
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
            out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
                                  dd.get_chip_method(data), data["resources"], data)
            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
    return [[data]]
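# A minimal sketch (not part of the pipeline) of the toolz access pattern used
# above: tz.get_in reads a nested key path with a default, and tz.assoc_in
# returns a copy of the dict with a value set at a nested path, creating
# intermediate dicts as needed. The sample data below is hypothetical.
import toolz as tz

data = {"atac": {"align": {"NF": "sample-NF.bam"}}}
assert tz.get_in(("atac", "align", "NF"), data) == "sample-NF.bam"
assert tz.get_in(("atac", "align", "MN"), data, default=None) is None
data = tz.assoc_in(data, ("peaks_files", "NF"), ["peaks.narrowPeak"])
assert data["peaks_files"]["NF"] == ["peaks.narrowPeak"]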
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations if not specified:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not conf_files:
        conf_files = _default_conf_files(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    out = []
    annodir = os.path.normpath(os.path.abspath(
        os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired or not paired.normal_bam:
        logger.warn("Battenberg only works on paired tumor/normal inputs, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    elif not tz.get_in(["genome_resources", "aliases", "human"], paired.tumor_data):
        logger.warn("Battenberg only works on human data, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
def _make_dir(dir):
    if options.dry_run:
        return
    if not os.path.exists(dir):
        os.makedirs(dir)
        logger.info("Creating delivery directory %s" % (dir))
    else:
        logger.warn("%s already exists: not creating new directory" % (dir))
def find_annotations(data, retriever=None):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling
    """
    conf_files = dd.get_vcfanno(data)
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    for c in _default_conf_files(data, retriever):
        if c not in conf_files:
            conf_files.append(c)
    conf_checkers = {"gemini": annotate_gemini, "somatic": _annotate_somatic}
    out = []
    annodir = os.path.normpath(os.path.join(os.path.dirname(dd.get_ref_file(data)),
                                            os.pardir, "config", "vcfanno"))
    if not retriever:
        annodir = os.path.abspath(annodir)
    for conf_file in conf_files:
        if objectstore.is_remote(conf_file) or (os.path.exists(conf_file) and os.path.isfile(conf_file)):
            conffn = conf_file
        elif not retriever:
            conffn = os.path.join(annodir, conf_file + ".conf")
        else:
            conffn = conf_file + ".conf"
        luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        if retriever:
            conffn, luafn = [(x if objectstore.is_remote(x) else None)
                             for x in retriever.add_remotes([conffn, luafn], data["config"])]
        if not conffn:
            pass
        elif conf_file in conf_checkers and not conf_checkers[conf_file](data, retriever):
            logger.warn("Skipping vcfanno configuration: %s. Not all input files found." % conf_file)
        elif not objectstore.file_exists_or_remote(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            out.append(conffn)
            if luafn and objectstore.file_exists_or_remote(luafn):
                out.append(luafn)
    return out
def run(vrn_info, cnvs_by_name, somatic_info):
    """Run PhyloWGS given variant calls, CNVs and tumor/normal information.
    """
    config = {"sample_size": 5000}
    work_dir = _cur_workdir(somatic_info.tumor_data)
    if "battenberg" not in cnvs_by_name:
        logger.warn("PhyloWGS requires Battenberg CNV calls, skipping %s"
                    % dd.get_sample_name(somatic_info.tumor_data))
    else:
        ssm_file, cnv_file = _prep_inputs(vrn_info, cnvs_by_name["battenberg"],
                                          somatic_info, work_dir, config)
        evolve_file = _run_evolve(ssm_file, cnv_file, work_dir, somatic_info.tumor_data)
        print evolve_file, ssm_file, cnv_file
def run(vrn_info, cnvs_by_name, somatic_info):
    """Run PhyloWGS given variant calls, CNVs and tumor/normal information.
    """
    config = {"sample_size": 5000}
    work_dir = _cur_workdir(somatic_info.tumor_data)
    if "battenberg" not in cnvs_by_name:
        logger.warn("PhyloWGS requires Battenberg CNV calls, skipping %s"
                    % dd.get_sample_name(somatic_info.tumor_data))
    else:
        ssm_file, cnv_file = _prep_inputs(vrn_info, cnvs_by_name["battenberg"],
                                          somatic_info, work_dir, config)
        evolve_file = _run_evolve(ssm_file, cnv_file, work_dir, somatic_info.tumor_data)
        summary_file = _prepare_summary(evolve_file, ssm_file, cnv_file, work_dir, somatic_info)
        print(summary_file)
def run(items, background=None):
    """Detect copy number variations from batched set of samples using GATK4 CNV calling.

    TODO: implement germline calling with DetermineGermlineContigPloidy and GermlineCNVCaller
    """
    if not background:
        background = []
    paired = vcfutils.get_paired(items + background)
    if paired:
        out = _run_paired(paired)
    else:
        out = items
        logger.warn("GATK4 CNV calling currently only available for somatic samples: %s"
                    % ", ".join([dd.get_sample_name(d) for d in items + background]))
    return out
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["aligner", "picard", "samtools"],
                          ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}),
                    samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)
    with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config, dirs,
                    "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)
    samples = rnaseq.combine_files(samples)
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config, dirs,
                    "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)
    with prun.start(_wres(parallel, ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"],
                          ensure_mem={"qualimap": 4}),
                    samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on:
                if len(samples) < 3:
                    logger.warn("bcbioRNASeq needs at least three samples total, skipping.")
                elif len(samples) > 100:
                    logger.warn("Over 100 samples, skipping bcbioRNASeq.")
                else:
                    run_parallel("run_bcbiornaseqload", [sample])
    logger.info("Timing: finished")
    return samples
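# A minimal sketch of the `with profile.report(...)` timing pattern the
# pipeline above wraps around each stage, built only on the standard library.
# The real bcbio profile.report presumably also records timings through the
# pipeline logger and provenance machinery; this stand-in just prints.
import time
from contextlib import contextmanager

@contextmanager
def report_demo(label):
    start = time.time()
    try:
        yield
    finally:
        print("Timing: %s took %.1fs" % (label, time.time() - start))

with report_demo("alignment"):
    time.sleep(0.1)  # stand-in for the real pipeline stage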
def get_version(config):
    # program_name and jar_name come from the enclosing factory function
    try:
        pdir = config_utils.get_program(program_name, config, "dir")
    # not configured
    except ValueError:
        return ""
    jar = os.path.basename(config_utils.get_jar(jar_name, pdir))
    for to_remove in [jar_name, ".jar", "-standalone"]:
        jar = jar.replace(to_remove, "")
    if jar.startswith(("-", ".")):
        jar = jar[1:]
    if not jar:
        logger.warn("Unable to determine version for program '{}' from jar file {}".format(
            program_name, config_utils.get_jar(jar_name, pdir)))
    return jar
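# Worked example of the jar-name stripping above, runnable on its own; the
# jar file name is hypothetical. Stripping the program name, ".jar" and
# "-standalone", then a leading separator, leaves the version string.
jar_name = "GenomeAnalysisTK"
jar = "GenomeAnalysisTK-2.7-2.jar"
for to_remove in [jar_name, ".jar", "-standalone"]:
    jar = jar.replace(to_remove, "")
if jar.startswith(("-", ".")):
    jar = jar[1:]
assert jar == "2.7-2"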
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name, config=config)
    # Filter phiX
    custom_config = _update_config_w_custom(config, lane_items[0])
    if custom_config["algorithm"].get("filter_phix", False):
        # If we are starting from demultiplexed material, we will skip a lane-wise screening
        # Screening will be performed on a sample basis
        if custom_config["algorithm"].get("demultiplexed", False):
            logger.warn("Will not filter phiX lane-wise on already demultiplexed files. "
                        "You will have to specify the genomes_filter_out option for each sample.")
        else:
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix", "description": lane_name}
            processed = remove_contaminants(full_fastq1, full_fastq2, info, lane_name,
                                            info["description"], dirs, custom_config)
            (full_fastq1, full_fastq2, _, lane_name) = processed[0][0:4]
    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items, lane_name, dirs, config)
    out = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        if item["barcode_id"] in bc_files:
            fastq1, fastq2 = bc_files[item["barcode_id"]]
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", "") and config["algorithm"].get("include_short_name", True):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])
            if config["algorithm"].get("trim_reads", False):
                trim_info = brun_trim_fastq([x for x in [fastq1, fastq2] if x is not None],
                                            dirs, config)
                fastq1 = trim_info[0]
                if fastq2 is not None:
                    fastq2 = trim_info[1]
            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc, dirs, config))
    return out
def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
        elif aligner == "bwa":
            filterer = bwa.filter_multimappers
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn("When a BAM file is given as input, bcbio skips removal of "
                    "multimappers.")
    return unique_bam
def _deliver_file(src, tgt):
    if options.move:
        deliver_fn = shutil.move
    else:
        deliver_fn = shutil.copyfile
    if src is None:
        return
    if not os.path.exists(src):
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" % (tgt))
        return
    if options.dry_run:
        print "DRY_RUN: %s file %s to %s" % (deliver_fn.__name__, src, tgt)
    else:
        logger.info("%s file %s to %s" % (deliver_fn.__name__, src, tgt))
        deliver_fn(src, tgt)
def _handle_data(src, tgt, f=shutil.copyfile, f2=None):
    if options.only_run_info:
        return
    if src is None:
        return
    if os.path.exists(tgt):
        logger.warn("%s already exists: not doing anything!" % (tgt))
        return
    if options.dry_run:
        print "DRY_RUN: %s file %s to %s" % (f.__name__, src, tgt)
        if f2 is not None:
            print "DRY_RUN: %s file %s to %s" % (f2.__name__, tgt, src)
    else:
        logger.info("%s file %s to %s" % (f.__name__, src, tgt))
        f(src, tgt)
        if f2 is not None:
            logger.info("%s file %s to %s" % (f2.__name__, tgt, src))
            f2(tgt, src)
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key in the
    algorithm section of the data dict, skipping if the files cannot be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if not conf_fns:
        return vcf
    if not anno_type:
        anno_type = "gemini"
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
    return out_file
def _get_machine_info(parallel, run_parallel, sys_config):
    """Get machine resource information from the job scheduler via either the command line or the queue.
    """
    if parallel.get("queue") and parallel.get("scheduler"):
        # dictionary as switch statement; can add new scheduler implementation functions as (lowercase) keys
        sched_info_dict = {
            "slurm": _slurm_info,
        }
        try:
            return sched_info_dict[parallel["scheduler"].lower()](parallel["queue"])
        except KeyError:
            logger.info("Resource query function not implemented for scheduler \"{0}\"; "
                        "submitting job to queue".format(parallel["scheduler"]))
        except:
            # If something goes wrong, just hit the queue
            logger.warn("Couldn't get machine information from resource query function for queue "
                        "'{0}' on scheduler \"{1}\"; "
                        "submitting job to queue".format(parallel["queue"], parallel["scheduler"]))
    return run_parallel("machine_info", [[sys_config]])
def run(items, background=None):
    """Detect copy number variations from tumor/normal samples using Battenberg.
    """
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    if not paired or not paired.normal_bam:
        logger.warn("Battenberg only works on paired tumor/normal inputs, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    elif not tz.get_in(["genome_resources", "aliases", "human"], paired.tumor_data):
        logger.warn("Battenberg only works on human data, skipping %s"
                    % dd.get_sample_name(items[0]))
        batout = None
    else:
        batout = _do_run(paired)
        batout["variantcaller"] = "battenberg"
    out = []
    for data in items:
        if batout and dd.get_sample_name(data) == paired.tumor_name:
            if "sv" not in data:
                data["sv"] = []
            data["sv"].append(batout)
        out.append(data)
    return out
def make_quality_report(data):
    """Create and render the bcbioRNASeq quality report.
    """
    MAX_SAMPLES = 100
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    nsamples = len(list(Path(upload_dir).rglob('quant.sf')))
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    safe_makedir(report_dir)
    quality_rmd = os.path.join(report_dir, "quality_control.Rmd")
    quality_html = os.path.join(report_dir, "quality_control.html")
    quality_rmd = rmarkdown_draft(quality_rmd, "01-quality-control", "bcbioRNASeq")
    if nsamples > MAX_SAMPLES and not groups:
        logger.warn(f"{nsamples} samples detected, disabling a few bcbioRNASeq plots which "
                    f"break with many samples. Setting `interesting_groups` would allow these "
                    f"plots to be created.")
        quality_rmd = many_samples_workaround(quality_rmd)
    if not file_exists(quality_html):
        render_rmarkdown_file(quality_rmd)
    return data
def run_vcfanno(vcf, conf_files, data, data_basepath=None):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key in the
    algorithm section of the data dict, skipping if the files cannot be found.
    """
    if not isinstance(conf_files, (list, tuple)):
        conf_files = [conf_files]
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conf_fns = []
    lua_fns = []
    anno_type = None
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
        else:
            anno_type = os.path.basename(conf_file)
            conffn = os.path.join(annodir, anno_type + ".conf")
            luafn = os.path.join(annodir, anno_type + ".lua")
        if not utils.file_exists(conffn):
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping.")
            logger.warn(CONF_NOT_FOUND.format(**locals()))
        else:
            conf_fns.append(conffn)
            lua_fns.append(luafn)
    if conf_fns:
        if not anno_type:
            anno_type = "gemini"
        out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
        if not utils.file_exists(out_file):
            out_file = vcfanno(vcf, out_file, conf_fns, data, data_basepath or basepath, lua_fns)
        return vcfutils.bgzip_and_index(out_file, data["config"])
    # no usable configurations found: skip annotation and hand back the input VCF
    return vcf
def run_vcfanno(vcf, anno_type, data, data_basepath=None):
    """Annotate a VCF file using vcfanno.

    Looks up the proper config/lua scripts under the `vcfanno` key in the
    algorithm section of the data dict, skipping if the files cannot be found.
    """
    build = dd.get_genome_build(data)
    basepath = os.path.abspath(os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir))
    annodir = os.path.abspath(os.path.join(basepath, "config", "vcfanno"))
    conffn = os.path.join(annodir, anno_type + ".conf")
    luafn = os.path.join(annodir, anno_type + ".lua")
    CONF_NOT_FOUND = (
        "The vcfanno configuration {conffn} was not found for {build}, skipping.")
    if not utils.file_exists(conffn):
        logger.warn(CONF_NOT_FOUND.format(**locals()))
        return vcf
    out_file = utils.splitext_plus(vcf)[0] + "-annotated-" + anno_type + ".vcf.gz"
    if utils.file_exists(out_file):
        return out_file
    out_file = vcfanno(vcf, out_file, conffn, data, data_basepath or basepath, luafn)
    return out_file
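# A minimal sketch of the double-extension splitting the run_vcfanno variants
# above rely on from utils.splitext_plus, so "sample.vcf.gz" yields "sample"
# rather than "sample.vcf". This is an illustrative stand-in, not the bcbio
# implementation.
import os

def splitext_plus_demo(fname):
    base, ext = os.path.splitext(fname)
    if ext in [".gz", ".bz2", ".zip"]:
        base, ext2 = os.path.splitext(base)
        ext = ext2 + ext
    return base, ext

assert splitext_plus_demo("sample.vcf.gz") == ("sample", ".vcf.gz")
assert splitext_plus_demo("sample.vcf") == ("sample", ".vcf")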
def _check_mutect_version(broad_runner):
    mutect_version = broad_runner.get_mutect_version()
    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message = ("MuTect 1.1.4 and lower is known to have incompatibilities "
                       "with Java < 7, and this may lead to problems in analyses. "
                       "Please use MuTect 1.1.5 or higher (note that it requires "
                       "Java 7).")
            raise ValueError(message)
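# A quick, runnable illustration of the LooseVersion comparison used above:
# it parses dotted version strings into comparable components, which matters
# for multi-digit parts like "1.1.10" where a plain string compare goes wrong.
# (distutils is deprecated in newer Pythons; packaging.version is the modern
# replacement.)
from distutils.version import LooseVersion

assert LooseVersion("1.1.5") >= LooseVersion("1.1.5")
assert LooseVersion("1.1.10") > LooseVersion("1.1.5")  # "1.1.10" > "1.1.5" is False for strings
assert not (LooseVersion("1.1.4") >= LooseVersion("1.1.5"))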
def main(run, project_id, sample_names, config_file, Map_Stat, Read_Dist, FPKM, rRNA_table):
    TEMPLATE = """\
RNA-seq analysis report for ${project_id}
=========================================

${latex_opt}

Summary
-------------------------
**Project name:** ${project_id} (UPPMAX project ${uppnex})

**Samples:** ${samplenames}

**Run name:** ${runname}

**Mapping:** ${mapping}

**Duplicate removal:** ${dup_rem}

**Read count:** ${read_count}

**RPKM/FPKM values:** ${quantifyer}

**Result directories on UPPMAX:** /proj/${uppnex}/INBOX/${project_id}/analysis/alignments (BAM files), /proj/${uppnex}/INBOX/${project_id}/analysis/quantification (FPKM files)

.. raw:: latex

   \clearpage

Results
-------------------------"""
    if Map_Stat:
        TEMPLATE = TEMPLATE + """
Mapping statistics
^^^^^^^^^^^^^^^^^^

${Mapping_statistics}

Comments
~~~~~~~~

**tot # read pairs:** The total number of read pairs indicates the total number of sequenced paired-end reads. Since a paired-end read is made up of two sequenced fragments (mates), the total number of sequenced 100-bp regions is twice the number shown in this column.

**% mapped reads:** The number of fragments that are mapped relative to the total number of sequenced fragments.

**% reads left after dup rem:** We remove duplicate reads (paired-end reads where both mates map to the same loci as both mates in a different paired-end read) because these are likely to be artifacts caused by PCR amplification or over-sequencing. Aligned files in BAM format with duplicates removed can be found in /proj/${uppnex}/INBOX/${project_id}/analysis/alignments.

.. raw:: latex

   \clearpage

"""
    TEMPLATE = TEMPLATE + """
Expression values
^^^^^^^^^^^^^^^^^

The /proj/${uppnex}/INBOX/${project_id}/analysis/quantification folder contains FPKM values calculated using the Cufflinks program using ENSEMBL annotation of genes and transcripts for each sample. These files also contain the upper and lower limits of the confidence interval for the FPKM estimate.

FPKM values are the paired-end equivalent of RPKM (Reads Per Kilobase per Million mapped reads; the standard measure for gene expression in RNA-seq). There is also a single fpkm_table.txt file, which contains all of the FPKM values. This can be opened in Excel or a regular text processing application.

For analyzing differential expression of genes or transcripts, it may be useful to have the raw read counts (the number of sequences that map to each gene/transcript) as well. These are calculated using the HTSeq software and are collected into a table called count_table.txt.

.. raw:: latex

   \clearpage

"""
    if FPKM:
        TEMPLATE = TEMPLATE + """
FPKM heatmap
^^^^^^^^^^^^

This heatmap shows the (Pearson) correlation between FPKM values of samples.

${FPKM_heatmap}

.. raw:: latex

   \clearpage

FPKM PCA
^^^^^^^^

This PCA (principal component analysis) score plot has the samples plotted according to their scores for the two principal components that explain the largest amount of variance in the FPKM data table. The number after 'expl var' in the axis labels tells you how much of the variance is explained by each component. Similar samples should, in theory, cluster together in the PCA plot. PCA is a way to compress the information in your high-dimensional data matrix so that it can be plotted in two dimensions.

${FPKM_PCAplot}

.. raw:: latex

   \clearpage

"""
    if Read_Dist:
        TEMPLATE = TEMPLATE + """
Read distribution
^^^^^^^^^^^^^^^^^

This table contains information about the extent to which sequences from each sample mapped to different structural parts of genes, like coding exons, untranslated regions, and transcription start sites. The actual number itself is less important than the relative values for the different kinds of regions. For a normal RNA-seq experiment you should have a higher value in the CDS Exon column than in the others, for example. "CDS Exon" means "coding sequence exons", "UTR" stands for "untranslated region", "TES" stands for "transcription end site", "TSS" stands for "transcription start site". "Intronic regions" should be interpreted as "intronic or intergenic regions".

Perhaps the most easily interpretable column is the final column, mRNA fraction, which gives the fraction [0-1] of sequences that mapped to ENSEMBL-annotated mRNA (including coding regions and UTRs). While this fraction is not completely accurate (because ENSEMBL does not completely describe the transcriptome), it is a useful summary statistic which should be relatively high for an mRNA-seq experiment, typically above 0.8.

${Read_Distribution}

.. raw:: latex

   \clearpage

"""
    if rRNA_table:
        TEMPLATE = TEMPLATE + """
Quantification of rRNA present in the samples
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

${rRNA_table}

.. raw:: latex

   \clearpage

"""
    sphinx_defs = []
    if config_file:
        config = load_config(config_file)
    else:
        config = {}
    sphinx_defs.append("('%s', '%s_analysis.tex', 'RNA-seq Analysis Report', u'SciLifeLab Stockholm', 'howto'),\n"
                       % (project_id, project_id))
    projectfile = "%s.mako" % (project_id)
    fp = open(projectfile, "w")
    fp.write(TEMPLATE)
    fp.close()
    mylookup = TemplateLookup(directories=['./'])
    tmpl = Template(filename=projectfile, lookup=mylookup)
    proj_conf = {
        'id': project_id,
        'run': run,
        'config': config,
        'samples': sample_names.split(',')
    }
    d = generate_report(proj_conf)
    rstfile = "%s.rst" % (project_id)
    fp = open(rstfile, "w")
    fp.write(tmpl.render(**d))
    fp.close()
    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        logger.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if sd not in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i + 3] + sdout + lines[i + 3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
def main(run, project_id, sample_names, single_end, config_file, Map_Stat, Read_Dist, FPKM, rRNA_table):
    TEMPLATE = """\
RNA-seq analysis report for ${project_id}
=========================================

${latex_opt}

Summary
-------------------------
**Project name:** ${project_id} (UPPMAX project ${uppnex})

**Samples:** ${samplenames}

**Run name:** ${runname}

**Mapping:** ${mapping}

**Duplicate removal:** ${dup_rem}

**Read count:** ${read_count}

**RPKM/FPKM values:** ${quantifyer}

**Result directories on UPPMAX:** /proj/${uppnex}/INBOX/${project_id}/analysis/alignments (BAM files), /proj/${uppnex}/INBOX/${project_id}/analysis/quantification (FPKM files)

.. raw:: latex

   \clearpage

Results
-------------------------"""
    if Map_Stat:
        TEMPLATE = TEMPLATE + """
Mapping statistics
^^^^^^^^^^^^^^^^^^

${Mapping_statistics}

Comments
~~~~~~~~

**Tot # reads:** If paired-end reads, the total number of reads indicates the total number of sequenced paired-end reads. Since a paired-end read is made up of two sequenced fragments (mates), the total number of sequenced 100-bp regions is twice the number shown in this column. If single-end reads, this column reflects the total number of sequences.

**% mapped reads:** The number of fragments that are mapped relative to the total number of sequenced fragments.

**% reads left after dup rem:** We remove duplicate reads (if paired-end, duplicates are defined as paired-end reads where both mates map to the same loci as both mates in a different paired-end read) because these are likely to be artifacts caused by PCR amplification or over-sequencing. Aligned files in BAM format with duplicates removed can be found in /proj/${uppnex}/INBOX/${project_id}/analysis/alignments.

.. raw:: latex

   \clearpage

"""
    TEMPLATE = TEMPLATE + """
Expression values
^^^^^^^^^^^^^^^^^

The /proj/${uppnex}/INBOX/${project_id}/analysis/quantification folder contains FPKM values calculated using the Cufflinks program using ENSEMBL annotation of genes and transcripts for each sample. These files also contain the upper and lower limits of the confidence interval for the FPKM estimate.

FPKM values are the paired-end equivalent of RPKM (Reads Per Kilobase per Million mapped reads; the standard measure for gene expression in RNA-seq). There is also a single fpkm_table.txt file, which contains all of the FPKM values. This can be opened in Excel or a regular text processing application.

For analyzing differential expression of genes or transcripts, it may be useful to have the raw read counts (the number of sequences that map to each gene/transcript) as well. These are calculated using the HTSeq software and are collected into a table called count_table.txt.

.. raw:: latex

   \clearpage

"""
    if FPKM:
        TEMPLATE = TEMPLATE + """
FPKM heatmap
^^^^^^^^^^^^

This heatmap shows the (Pearson) correlation between FPKM values of samples.

${FPKM_heatmap}

.. raw:: latex

   \clearpage

FPKM PCA
^^^^^^^^

This PCA (principal component analysis) score plot has the samples plotted according to their scores for the two principal components that explain the largest amount of variance in the FPKM data table. The number after 'expl var' in the axis labels tells you how much of the variance is explained by each component. Similar samples should, in theory, cluster together in the PCA plot. PCA is a way to compress the information in your high-dimensional data matrix so that it can be plotted in two dimensions.

${FPKM_PCAplot}

.. raw:: latex

   \clearpage

"""
    if Read_Dist:
        TEMPLATE = TEMPLATE + """
Read distribution
^^^^^^^^^^^^^^^^^

This table contains information about the extent to which sequences from each sample mapped to different structural parts of genes, like coding exons, untranslated regions, and transcription start sites. The actual number itself is less important than the relative values for the different kinds of regions. For a normal RNA-seq experiment you should have a higher value in the CDS Exon column than in the others, for example. "CDS Exon" means "coding sequence exons", "UTR" stands for "untranslated region", "TES" stands for "transcription end site", "TSS" stands for "transcription start site". "Intronic regions" should be interpreted as "intronic or intergenic regions".

Perhaps the most easily interpretable column is the final column, mRNA fraction, which gives the fraction [0-1] of sequences that mapped to ENSEMBL-annotated mRNA (including coding regions and UTRs). While this fraction is not completely accurate (because ENSEMBL does not completely describe the transcriptome), it is a useful summary statistic which should be relatively high for an mRNA-seq experiment, typically above 0.8.

${Read_Distribution}

.. raw:: latex

   \clearpage

"""
    if rRNA_table:
        TEMPLATE = TEMPLATE + """
Quantification of rRNA present in the samples
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

${rRNA_table}

.. raw:: latex

   \clearpage

"""
    sphinx_defs = []
    if config_file:
        config = load_config(config_file)
    else:
        config = {}
    sphinx_defs.append("('%s', '%s_analysis.tex', 'RNA-seq Analysis Report', u'SciLifeLab Stockholm', 'howto'),\n"
                       % (project_id, project_id))
    projectfile = "%s.mako" % (project_id)
    fp = open(projectfile, "w")
    fp.write(TEMPLATE)
    fp.close()
    mylookup = TemplateLookup(directories=['./'])
    tmpl = Template(filename=projectfile, lookup=mylookup)
    proj_conf = {
        'id': project_id,
        'run': run,
        'config': config,
        'samples': sample_names.split(',')
    }
    d = generate_report(proj_conf, single_end)
    rstfile = "%s.rst" % (project_id)
    fp = open(rstfile, "w")
    fp.write(tmpl.render(**d))
    fp.close()
    sphinxconf = os.path.join(os.getcwd(), "conf.py")
    if not os.path.exists(sphinxconf):
        logger.warn("no sphinx configuration file conf.py found: you have to edit conf.py yourself!")
    else:
        fp = open(sphinxconf)
        lines = fp.readlines()
        fp.close()
        sdout = []
        modify_conf = False
        for sd in sphinx_defs:
            if sd not in lines:
                sdout.append(sd)
                modify_conf = True
        if modify_conf:
            i = lines.index("latex_documents = [\n")
            newconf = lines[:i + 3] + sdout + lines[i + 3:]
            fp = open("conf.py", "w")
            fp.write("".join(newconf))
            fp.close()
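# A self-contained sketch of the Mako flow the two report scripts above use:
# render a template string against a value dict to get reST text back. The
# field names and values here are illustrative, not the report's real keys.
from mako.template import Template

tmpl = Template("Report for ${project_id}\n**Samples:** ${samplenames}\n")
print(tmpl.render(project_id="J.Doe_11_01", samplenames="S1, S2"))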
def _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Preparation work for MuTect.
    """
    #FIXME: We assume all other bits in the config are shared
    base_config = items[0]["config"]
    dbsnp = assoc_files["dbsnp"]
    cosmic = assoc_files.get("cosmic")
    broad_runner = broad.runner_from_config(base_config, "mutect")
    mutect_version = broad_runner.get_mutect_version()
    try:
        assert mutect_version is not None
    except AssertionError:
        logger.warn("WARNING")
        logger.warn("MuTect version could not be determined from jar file. "
                    "Please ensure you are using at least version 1.1.5, "
                    "as versions 1.1.4 and lower have known issues.")
        logger.warn("Proceeding but assuming correct version 1.1.5.")
    else:
        try:
            assert LooseVersion(mutect_version) >= LooseVersion("1.1.5")
        except AssertionError:
            message = ("MuTect 1.1.4 and lower is known to have incompatibilities "
                       "with Java < 7, and this may lead to problems in analyses. "
                       "Please use MuTect 1.1.5 or higher (note that it requires "
                       "Java 7).")
            raise ValueError(message)
    broad_runner.run_fn("picard_index_ref", ref_file)
    for x in align_bams:
        bam.index(x, base_config)
    variant_regions = base_config["algorithm"].get("variant_regions", None)
    contamination = base_config["algorithm"].get("fraction_contamination", 0)
    region = subset_variant_regions(variant_regions, region, out_file)
    #FIXME: Add more parameters like fraction contamination etc
    params = ["-R", ref_file, "-T", "MuTect"]
    params += ["--dbsnp", dbsnp]
    tumor_bam = None
    normal_bam = None
    for bamfile, item in itertools.izip(align_bams, items):
        metadata = item["metadata"]
        if metadata["phenotype"] == "normal":
            normal_bam = bamfile
            normal_sample_name = item["name"][1]
        elif metadata["phenotype"] == "tumor":
            tumor_bam = bamfile
            tumor_sample_name = item["name"][1]
    if tumor_bam is None or normal_bam is None:
        raise ValueError("Missing phenotype definition (tumor or normal) "
                         "in samples")
    params += ["-I:normal", normal_bam]
    params += ["-I:tumor", tumor_bam]
    params += ["--tumor_sample_name", tumor_sample_name]
    params += ["--normal_sample_name", normal_sample_name]
    params += ["--fraction_contamination", contamination]
    if cosmic is not None:
        params += ["--cosmic", cosmic]
    if region:
        params += ["-L", bamprep.region_to_gatk(region), "--interval_set_rule",
                   "INTERSECTION"]
    return broad_runner, params