def run(self):
    """Set up and start the bcbb pipeline for the samples of a project.

    Requires the ``project`` argument.  Sample configuration files are
    collected below ``<root_path>/<path_id>`` (optionally after converting
    sample sheets and setting up merged samples), the user is asked for
    confirmation, every sample is set up with ``setup_sample`` and, unless
    ``only_setup`` is given, one pipeline command is launched per sample.

    :returns: None; the method works through side effects only
    """
    if not self._check_pargs(["project"]):
        return
    if self.pargs.post_process:
        self.pargs.post_process = os.path.abspath(self.pargs.post_process)
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    if self.pargs.from_ssheet:
        # Plain loop: the conversion is run for its side effects only
        for fn in find_samples(basedir, pattern="SampleSheet.csv$",
                               **vars(self.pargs)):
            samplesheet_csv_to_yaml(fn)
    flist = find_samples(basedir, **vars(self.pargs))
    # Add filtering on flowcell if necessary
    self._meta.pattern = ".*"
    flist = [x for x in flist if self._filter_fn(x)]
    if self.pargs.merged:
        ## Setup merged samples and append to flist if new list longer
        flist = setup_merged_samples(flist, **vars(self.pargs))
    if not flist:
        self.log.info("No sample configuration files found")
        return
    if not query_yes_no(
            "Going to start {} jobs... Are you sure you want to continue?".format(len(flist)),
            force=self.pargs.force):
        return
    # Make absolutely sure analysis directory is a *subdirectory* of the working directory
    validate_sample_directories(flist, basedir)
    orig_dir = os.path.abspath(os.getcwd())
    for run_info in flist:
        os.chdir(os.path.abspath(os.path.dirname(run_info)))
        try:
            setup_sample(run_info, **vars(self.pargs))
        finally:
            # Restore the working directory even if the setup fails
            os.chdir(orig_dir)
    if self.pargs.only_setup:
        return
    if self.pargs.only_failed:
        # Evaluate the status once per sample and reuse it for the filter
        status = {x: self._sample_status(x) for x in flist}
        flist = [x for x in flist if status[x] == "FAIL"]
    ## Here process files again, removing if requested, and running the pipeline
    for run_info in flist:
        work_dir = os.path.dirname(run_info)
        self.app.log.info("Running analysis defined by config file {}".format(run_info))
        os.chdir(os.path.abspath(work_dir))
        try:
            if self.app.cmd.monitor(work_dir=work_dir):
                self.app.log.warn("Not running job")
                continue
            if self.pargs.restart:
                self.app.log.info("Removing old analysis files in {}".format(work_dir))
                remove_files(run_info, **vars(self.pargs))
            (cl, platform_args) = run_bcbb_command(run_info, **vars(self.pargs))
            self.app.cmd.command(
                cl,
                **{"platform_args": platform_args,
                   "saveJobId": True,
                   "workingDirectory": work_dir})
        finally:
            # Also reached on ``continue``: a skipped job must not leave the
            # process inside the sample directory, otherwise relative
            # config paths of later samples would resolve wrongly
            os.chdir(orig_dir)
def vcf_summary(self):
    """Compress, index and merge the vcf files of a project.

    Requires the ``project`` argument.  Every vcf file reported by
    ``get_vcf_files`` is bgzip-compressed (unless it already ends in
    ``.gz``) and tabix-indexed; files that already have a ``.tbi`` index
    are left untouched.  The files are then merged with ``vcf-merge`` into
    ``intermediate/results/vcf/all-variants.vcf`` below the project
    directory, and the merged file is compressed and indexed as well.

    :returns: None
    """
    if not self._check_pargs(["project"]):
        return
    flist = find_samples(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id)),
        **vars(self.pargs))
    vcf_d = get_vcf_files(flist, **vars(self.pargs))
    ## Traverse files, copy to result directory, run bgzip and tabix, and merge vcfs to one file
    outdir = os.path.join(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id,
                                     "intermediate", "results", "vcf")))
    if not os.path.exists(outdir):
        self.app.cmd.safe_makedir(outdir)
    # Paths that exist after compression; these are what vcf-merge must read
    merge_input = []
    for v in vcf_d.values():
        # FIXME: this should be memoized
        if os.path.exists("{}.tbi".format(v)):
            self.app.log.info("{}.tbi exists; skipping bgzip and tabix operations".format(v))
            merge_input.append(v)
            continue
        if v.endswith(".gz"):
            # Already compressed: index the file itself, not "<v>.gz"
            gz_file = v
        else:
            ## bgzip
            self.app.log.info("Running bgzip on {}".format(v))
            self.app.cmd.command(["bgzip", v])
            gz_file = "{}.gz".format(v)
        merge_input.append(gz_file)
        # tabix
        self.app.log.info("Running tabix on {}".format(gz_file))
        self.app.cmd.command(["tabix", "-f", "-p", "vcf", gz_file])
    # Make all-variants file
    all_variants = os.path.join(outdir, "all-variants.vcf")
    cl = ['vcf-merge'] + merge_input
    if not os.path.exists(all_variants):
        self.app.log.info("Merging vcf files {} to {}".format(merge_input, all_variants))
        output = self.app.cmd.command(cl)
        with open(all_variants, "w") as fh:
            fh.write(output)
        self.app.cmd.command(['bgzip', all_variants])
        self.app.cmd.command(['tabix', "-f", "-p", "vcf", "{}.gz".format(all_variants)])
def test_sample_table(self):
    """Test that the sample table has the expected number of entries per sample"""
    table = sample_table(find_samples(j_doe_00_01))
    groups = table.groupby("sample").groups
    # (sample name, expected number of entries in its group)
    expected = (("P001_101_index3", 2), ("P001_102_index6", 1))
    for sample, count in expected:
        self.assertEqual(len(groups[sample]), count)
def test_setup_merged_samples(self):
    """Test that the merged sample configuration points to a fastq file in TOTAL"""
    merged_dir = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
    setup_merged_samples(find_samples(j_doe_00_05), dry_run=False)
    config_file = os.path.join(merged_dir, "P001_101_index3-bcbb-config.yaml")
    with open(config_file) as fh:
        conf = yaml.load(fh)
    first_file = conf["details"][0]["files"][0]
    expected = os.path.join(
        merged_dir, "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz")
    self.assertEqual(first_file, expected)
def compile_qc(path, application="seqcap", **kw):
    """Collect QC information for samples without access to statusdb.

    Every sample configuration file returned by ``find_samples`` is parsed
    and turned into ``SampleRunMetrics`` objects: one per multiplexed
    sample, or one per ``details`` entry when the entry has no multiplex
    section.  The objects are converted with ``_srm_to_qc`` and the result
    of ``assess_qc`` is written, one line per object, to the ``stdout``
    buffer.

    :param path: directory handed to ``find_samples``
    :param application: application name passed to ``_qc_info_header`` and ``assess_qc``
    :param **kw: keyword arguments forwarded to ``find_samples``; ``project`` is also used as the sample project name

    :returns: the output dictionary as returned by ``_qc_info_header``
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    project = kw.get("project")
    ### find_samples excruciatingly slow for multi-sample projects where we can have > 100k files...
    flist = find_samples(path, **kw)
    srm_l = []
    for config_file in flist:
        LOG.debug("Opening config file {}".format(config_file))
        # NOTE(review): yaml.load can construct arbitrary objects; only
        # use on trusted configuration files
        with open(config_file) as fh:
            runinfo_yaml = yaml.load(fh)
        for info in runinfo_yaml['details']:
            # Arguments shared by all samples of this entry
            common = dict(path=os.path.dirname(config_file),
                          flowcell=runinfo_yaml.get("fc_name"),
                          date=runinfo_yaml.get("fc_date"),
                          lane=info.get("lane"),
                          sample_prj=project)
            multiplex = info.get("multiplex")
            if multiplex:
                for mp in multiplex:
                    srm_l.append(SampleRunMetrics(
                        barcode_name=mp.get("name"),
                        barcode_id=mp.get('barcode_id'),
                        sequence=mp.get('sequence'),
                        **common))
            else:
                obj = SampleRunMetrics(
                    barcode_name=info.get("description"),
                    barcode_id=None,
                    sequence=None,
                    **common)
                # NOTE(review): picard metrics are only read in this
                # non-multiplexed branch - confirm that is intended
                obj.read_picard_metrics()
                srm_l.append(obj)
    output_data = _qc_info_header(project, application, output_data)
    qcdata = [_srm_to_qc(s) for s in srm_l]
    for v in qcdata:
        fields = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(fields) + "\n")
    return output_data
def test_setup_samples(self):
    """Test that sample setup writes genome rn4 and the requested options to the config files"""
    def read_yaml(fname):
        # Parse a yaml file and return its content
        with open(fname) as fh:
            return yaml.load(fh)

    def assert_genome_is_rn4(config_file):
        # The genome build is stored per multiplexed sample if present,
        # otherwise directly on the first details entry
        details = read_yaml(config_file)["details"][0]
        if details.get("multiplex", None):
            self.assertEqual(details["multiplex"][0]["genome_build"], "rn4")
        else:
            self.assertEqual(details["genome_build"], "rn4")

    def post_process_config(config_file):
        return read_yaml(config_file.replace("-bcbb-config.yaml", "-post_process.yaml"))

    common = {'genome_build': 'rn4',
              'dry_run': False,
              'baits': 'rat_baits.interval_list',
              'targets': 'rat_targets.interval_list',
              'num_cores': 8,
              'distributed': False}
    seqcap_opts = dict(common, analysis='Align_standard_seqcap')
    amplicon_opts = dict(common, analysis=ANALYSIS_TYPE, no_only_run=True,
                         google_report=True, amplicon=True)

    flist = find_samples(j_doe_00_05)
    # First pass: set up all samples, then verify every configuration
    for f in flist:
        setup_sample(f, **seqcap_opts)
    for f in flist:
        assert_genome_is_rn4(f)
        config = post_process_config(f)
        custom = config["custom_algorithms"][ANALYSIS_TYPE]
        self.assertEqual(custom["hybrid_bait"], 'rat_baits.interval_list')
        self.assertEqual(custom["hybrid_target"], 'rat_targets.interval_list')
        self.assertEqual(config["algorithm"]["num_cores"], 8)
    # Second pass: amplicon setup, verified sample by sample
    for f in flist:
        setup_sample(f, **amplicon_opts)
        assert_genome_is_rn4(f)
        config = post_process_config(f)
        self.assertEqual(config["algorithm"]["mark_duplicates"], False)
        self.assertEqual(config["custom_algorithms"][ANALYSIS_TYPE]["mark_duplicates"], False)
def bpreport(self):
    """Write a best practice report for a project.

    Requires the ``project`` argument; ``statusdb_project_name`` defaults
    to the project name.  Unless ``no_statusdb`` is set, a sample name map
    is fetched with ``get_scilife_to_customer_name``; if that lookup raises
    ``ValueError`` a warning is logged and the report is written without a
    name map.  The output buffers returned by ``best_practice_note`` are
    appended to the application output.

    :returns: None
    """
    if not self._check_pargs(["project"]):
        return
    if not self.pargs.statusdb_project_name:
        self.pargs.statusdb_project_name = self.pargs.project
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if not flist:
        self.log.info("No samples/sample configuration files found")
        return
    sample_name_map = None
    if not self.pargs.no_statusdb:
        p_con = ProjectSummaryConnection(
            dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        s_con = SampleRunMetricsConnection(
            dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        try:
            sample_name_map = get_scilife_to_customer_name(
                self.pargs.statusdb_project_name, p_con, s_con, get_barcode_seq=True)
        except ValueError as e:
            self.log.warn(str(e))
            # Report the name that was actually looked up
            self.log.warn(
                "No such project {} defined in statusdb; try using option --statusdb_project_name".format(
                    self.pargs.statusdb_project_name))
            sample_name_map = None
    # vars() returns the namespace's own dictionary; build a copy so the
    # report keys are not written back into the parsed arguments
    kw = dict(vars(self.pargs),
              project_name=self.pargs.project,
              flist=flist,
              basedir=basedir,
              sample_name_map=sample_name_map)
    out_data = best_practice_note(**kw)
    self.log.info("Wrote report to directory {}; use Makefile to generate pdf report".format(basedir))
    self.app._output_data['stdout'].write(out_data['stdout'].getvalue())
    self.app._output_data['stderr'].write(out_data['stderr'].getvalue())
    self.app._output_data['debug'].write(out_data['debug'].getvalue())
def hs_metrics(self):
    """Run picard CalculateHsMetrics on the bam files of a project or flowcell.

    Requires the ``project`` and ``targets`` arguments; ``baits`` defaults
    to ``targets``.  Bam files whose name matches ``<hs_file_type>.bam`` are
    collected from the sample directories and, after confirmation, one
    java command is run per file.  Non-empty command output is appended
    to the application's ``stdout`` buffer.

    :returns: None
    """
    if not self._check_pargs(["project", "targets"]):
        return
    if not self.pargs.baits:
        self.pargs.baits = self.pargs.targets
    self.log.info("hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools")
    pattern = "{}.bam$".format(self.pargs.hs_file_type)

    def filter_fn(f):
        return re.search(pattern, f) is not None

    ### FIX ME: this isn't caught by _process_args
    path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    samples = find_samples(basedir, **vars(self.pargs))
    inc_dirs = [os.path.dirname(x) for x in samples]
    flist = filtered_walk(
        os.path.join(self.config.get(self.app.controller._meta.label, "root"), path),
        filter_fn=filter_fn,
        exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'],
        include_dirs=inc_dirs)
    if not query_yes_no(
            "Going to run hs_metrics on {} files. Are you sure you want to continue?".format(len(flist)),
            force=self.pargs.force):
        return
    # Loop-invariant parts of the command line
    jar_file = "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))
    targets = os.path.abspath(self.pargs.targets)
    baits = os.path.abspath(self.pargs.baits)
    for f in flist:
        self.log.info("running CalculateHsMetrics on {}".format(f))
        # Only swap the file suffix; a plain str.replace would also rewrite
        # any ".bam" occurring in a directory name of the path
        if f.endswith(".bam"):
            metrics_file = f[:-len(".bam")] + ".hs_metrics"
        else:
            metrics_file = f.replace(".bam", ".hs_metrics")
        ### Issue with calling java from
        ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
        ### Actually not an issue: command line arguments have to be done the right way
        cl = ["java",
              "-{}".format(self.pargs.java_opts),
              "-jar", jar_file,
              "INPUT={}".format(f),
              "TARGET_INTERVALS={}".format(targets),
              "BAIT_INTERVALS={}".format(baits),
              "OUTPUT={}".format(metrics_file),
              "VALIDATION_STRINGENCY=SILENT"]
        out = self.app.cmd.command(cl)
        if out:
            self.app._output_data["stdout"].write(out.rstrip())
def bpreport(self):
    """Write a best practice report for a project.

    Requires the ``project`` argument.  Unless ``no_statusdb`` is set,
    ``statusdb_project_name`` is required as well and a sample name map is
    fetched with ``get_scilife_to_customer_name``.  The output buffers
    returned by ``best_practice_note`` are appended to the application
    output.

    :returns: None
    """
    if not self._check_pargs(["project"]):
        return
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    flist = find_samples(basedir, **vars(self.pargs))
    if not flist:
        self.log.info("No samples/sample configuration files found")
        return
    sample_name_map = None
    if not self.pargs.no_statusdb:
        if not self._check_pargs(["statusdb_project_name"]):
            return
        p_con = ProjectSummaryConnection(
            dbname=self.app.config.get("db", "projects"), **vars(self.app.pargs))
        s_con = SampleRunMetricsConnection(
            dbname=self.app.config.get("db", "samples"), **vars(self.app.pargs))
        sample_name_map = get_scilife_to_customer_name(
            self.pargs.statusdb_project_name, p_con, s_con)
    # vars() returns the namespace's own dictionary; build a copy so the
    # report keys are not written back into the parsed arguments
    kw = dict(vars(self.pargs),
              project_name=self.pargs.project,
              flist=flist,
              basedir=basedir,
              sample_name_map=sample_name_map)
    out_data = best_practice_note(**kw)
    self.log.info(
        "Wrote report to directory {}; use Makefile to generate pdf report".format(basedir))
    self.app._output_data['stdout'].write(out_data['stdout'].getvalue())
    self.app._output_data['stderr'].write(out_data['stderr'].getvalue())
    self.app._output_data['debug'].write(out_data['debug'].getvalue())
def best_practice(self):
    """Deliver best practice results to the INBOX of an uppmax project.

    Requires the ``project`` and ``uppmax_project`` arguments.  Files in
    the sample directories that match the delivery patterns are handed to
    ``self._transfer_files`` with targets below
    ``/proj/<uppmax_project>/INBOX/<name>``, where ``<name>`` is the first
    non-empty value of ``outdir``, ``statusdb_project_name`` and
    ``project``.  With ``size`` set, the summed size of the source files is
    written to the ``stderr`` buffer.

    :returns: None
    """
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    pargs = self.pargs
    project_path = os.path.normpath(os.path.join("/proj", pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(pargs.uppmax_project))
        return
    delivery_name = pargs.outdir or pargs.statusdb_project_name or pargs.project
    outpath = os.path.join(project_path, "INBOX", delivery_name)
    if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(pargs))
    if len(flist) <= 0:
        self.log.info("No samples/sample configuration files found")
        return

    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not pargs.no_bam:
        plist += [".*-{}.bam$".format(pargs.bam_file_type),
                  ".*-{}.bam.bai$".format(pargs.bam_file_type)]
    if not pargs.no_vcf:
        plist += [".*.vcf$", ".*.vcf.gz$", ".*.tbi$", ".*.tsv$"]
    pattern = "|".join(plist)

    def filter_fn(f):
        # Select a file when any of the delivery patterns matches
        if not pattern:
            return
        return re.search(pattern, f) is not None

    size = 0
    for config_file in flist:
        sources = filtered_walk(os.path.dirname(config_file),
                                filter_fn=filter_fn,
                                exclude_dirs=BCBIO_EXCLUDE_DIRS)
        # Mirror the source layout below the delivery directory
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if pargs.size:
            size = size + sum([os.stat(src).st_size for src in sources])
    if pargs.size:
        message = "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
        self.app._output_data['stderr'].write(message.format(size / 1e9))
def test_find_samples_from_file(self):
    """Find samples defined in file with empty lines and erroneous names"""
    # Configuration file directly under the project root, containing only a newline
    stray_config = os.path.join(j_doe_00_05, "P001_101_index3-bcbb-config.yaml")
    with open(stray_config, "w") as fh:
        fh.write("\n")
    try:
        flist = find_samples(j_doe_00_05, sample=os.path.join(j_doe_00_05, "samples.txt"))
        validate_sample_directories(flist, j_doe_00_05)
        self.assertEqual(len(flist), 2)
    finally:
        # Remove the stray file even when a check above fails, so that it
        # cannot influence other tests using the same project directory
        os.unlink(stray_config)
def test_merge_sample_config(self):
    """Test that merging sample configurations places the fastq files of both flowcells in TOTAL"""
    sample = "P001_101_index3"
    grouped = _group_samples(find_samples(j_doe_00_05))
    out_d = os.path.join(j_doe_00_05, sample, "TOTAL")
    if not os.path.exists(out_d):
        os.makedirs(out_d)
    merge_sample_config(grouped[sample].values(), sample, out_d=out_d, dry_run=False)
    expected_files = (
        "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz",
        "P001_101_index3_C003CCCXX_TGACCA_L001_R1_001.fastq.gz",
    )
    for fastq in expected_files:
        self.assertTrue(os.path.exists(os.path.join(out_d, fastq)))
def test_remove_dirs(self):
    """Test that the fastqc directory is selected for removal before a rerun"""
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$",
                  "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
                  "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$"]
    keep_re = re.compile("|".join(keep_files))

    def remove_filter_fn(f):
        # Removal candidate only if none of the keep patterns matches
        return keep_re.search(f) is None

    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        candidates = filtered_walk(os.path.dirname(config_file),
                                   remove_filter_fn, get_dirs=True)
        basenames = [os.path.basename(x) for x in candidates]
        self.assertIn("fastqc", basenames)
def test_remove_files(self):
    """Test that files matching a keep pattern are not selected for removal"""
    keep_files = ["-post_process.yaml$", "-post_process.yaml.bak$",
                  "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
                  "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
                  "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
                  "^[0-9][0-9]_.*.txt$"]
    keep_re = re.compile("|".join(keep_files))

    def remove_filter_fn(f):
        # Removal candidate only if none of the keep patterns matches
        return keep_re.search(f) is None

    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        candidates = filtered_walk(os.path.dirname(config_file), remove_filter_fn)
        basenames = [os.path.basename(x) for x in candidates]
        self.assertNotIn("01_analysis_start.txt", basenames)
def vcf_summary(self):
    """Compress, index and merge the vcf files of a project.

    Requires the ``project`` argument.  Every vcf file reported by
    ``get_vcf_files`` is bgzip-compressed (unless it already ends in
    ``.gz``) and tabix-indexed; files that already have a ``.tbi`` index
    are left untouched.  The resulting files are merged with ``vcf-merge``
    into ``intermediate/results/vcf/all-variants.vcf`` below the project
    directory, and the merged file is compressed and indexed as well.

    :returns: None
    """
    if not self._check_pargs(["project"]):
        return
    flist = find_samples(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id)),
        **vars(self.pargs))
    vcf_d = get_vcf_files(flist, **vars(self.pargs))
    ## Traverse files, copy to result directory, run bgzip and tabix, and merge vcfs to one file
    outdir = os.path.join(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id,
                                     "intermediate", "results", "vcf")))
    # Paths handed to vcf-merge
    vcf_out = []
    if not os.path.exists(outdir):
        self.app.cmd.safe_makedir(outdir)
    for v in vcf_d.values():
        # FIXME: this should be memoized
        if os.path.exists("{}.tbi".format(v)):
            self.app.log.info("{}.tbi exists; skipping bgzip and tabix operations".format(v))
            vcf_out.append(v)
            continue
        if v.endswith(".gz"):
            # Already compressed: index the file itself, not "<v>.gz"
            gz_file = v
        else:
            ## bgzip
            self.app.log.info("Running bgzip on {}".format(v))
            self.app.cmd.command(["bgzip", v])
            gz_file = "{}.gz".format(v)
        vcf_out.append(gz_file)
        # tabix
        self.app.log.info("Running tabix on {}".format(gz_file))
        self.app.cmd.command(["tabix", "-f", "-p", "vcf", gz_file])
    # Make all-variants file
    all_variants = os.path.join(outdir, "all-variants.vcf")
    cl = ['vcf-merge'] + vcf_out
    if not os.path.exists(all_variants):
        self.app.log.debug("Merging vcf files {} to {}".format(vcf_out, all_variants))
        self.app.log.info("Merging {} vcf files to {}".format(len(vcf_out), all_variants))
        output = self.app.cmd.command(cl)
        with open(all_variants, "w") as fh:
            fh.write(output)
        self.app.cmd.command(['bgzip', all_variants])
        self.app.cmd.command(['tabix', "-f", "-p", "vcf", "{}.gz".format(all_variants)])
def test_global_post_process(self):
    """Test that with a "global" post_process file the error file
    argument is set relative to each sample configuration file.
    """
    flist = find_samples(j_doe_00_05)
    global_pp = os.path.join(j_doe_00_01, SAMPLES[1], FLOWCELL,
                             "{}-post_process.yaml".format(SAMPLES[1]))
    # Parsing the file makes sure it exists and is readable yaml; the
    # content itself is not inspected
    with open(global_pp) as fh:
        yaml.load(fh)
    for config_file in flist:
        _, platform_args = run_bcbb_command(config_file, global_pp)
        self.assertIn("--error", platform_args)
        error_file = platform_args[platform_args.index("--error") + 1]
        self.assertEqual(error_file,
                         config_file.replace("-bcbb-config.yaml", "-bcbb.err"))
def test_find_samples_from_file(self):
    """Find samples defined in file with empty lines and erroneous names"""
    # Configuration file directly under the project root, containing only a newline
    stray_config = os.path.join(j_doe_00_05, "P001_101_index3-bcbb-config.yaml")
    with open(stray_config, "w") as fh:
        fh.write("\n")
    try:
        flist = find_samples(j_doe_00_05,
                             sample=os.path.join(j_doe_00_05, "samples.txt"))
        validate_sample_directories(flist, j_doe_00_05)
        self.assertEqual(len(flist), 2)
    finally:
        # Remove the stray file even when a check above fails, so that it
        # cannot influence other tests using the same project directory
        os.unlink(stray_config)
def test_setup_merged_samples(self):
    """Test that the merged sample configuration points to a fastq file in TOTAL"""
    merged_dir = os.path.join(j_doe_00_05, "P001_101_index3", "TOTAL")
    setup_merged_samples(find_samples(j_doe_00_05), dry_run=False)
    config_file = os.path.join(merged_dir, "P001_101_index3-bcbb-config.yaml")
    with open(config_file) as fh:
        conf = yaml.load(fh)
    first_file = conf["details"][0]["files"][0]
    expected = os.path.join(
        merged_dir, "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz")
    self.assertEqual(first_file, expected)
def hs_metrics(self):
    """Run picard CalculateHsMetrics on the bam files of a project or flowcell.

    Requires the ``project`` and ``targets`` arguments; ``baits`` defaults
    to ``targets``.  Bam files whose name matches ``<hs_file_type>.bam`` are
    collected from the sample directories and, after confirmation, one
    java command is run per file.  Non-empty command output is appended
    to the application's ``stdout`` buffer.

    :returns: None
    """
    if not self._check_pargs(["project", "targets"]):
        return
    if not self.pargs.baits:
        self.pargs.baits = self.pargs.targets
    self.log.info(
        "hs_metrics: This is a temporary solution for calculating hs metrics for samples using picard tools"
    )
    pattern = "{}.bam$".format(self.pargs.hs_file_type)

    def filter_fn(f):
        return re.search(pattern, f) is not None

    ### FIX ME: this isn't caught by _process_args
    path = self.pargs.flowcell if self.pargs.flowcell else self.pargs.project
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    samples = find_samples(basedir, **vars(self.pargs))
    inc_dirs = [os.path.dirname(x) for x in samples]
    flist = filtered_walk(
        os.path.join(
            self.config.get(self.app.controller._meta.label, "root"), path),
        filter_fn=filter_fn,
        exclude_dirs=['nophix', 'alignments', 'fastqc', 'fastq_screen'],
        include_dirs=inc_dirs)
    if not query_yes_no(
            "Going to run hs_metrics on {} files. Are you sure you want to continue?"
            .format(len(flist)),
            force=self.pargs.force):
        return
    # Loop-invariant parts of the command line
    jar_file = "{}/CalculateHsMetrics.jar".format(os.getenv("PICARD_HOME"))
    targets = os.path.abspath(self.pargs.targets)
    baits = os.path.abspath(self.pargs.baits)
    for f in flist:
        self.log.info("running CalculateHsMetrics on {}".format(f))
        # Only swap the file suffix; a plain str.replace would also rewrite
        # any ".bam" occurring in a directory name of the path
        if f.endswith(".bam"):
            metrics_file = f[:-len(".bam")] + ".hs_metrics"
        else:
            metrics_file = f.replace(".bam", ".hs_metrics")
        ### Issue with calling java from
        ### subprocess:http://stackoverflow.com/questions/9795249/issues-with-wrapping-java-program-with-pythons-subprocess-module
        ### Actually not an issue: command line arguments have to be done the right way
        cl = ["java",
              "-{}".format(self.pargs.java_opts),
              "-jar", jar_file,
              "INPUT={}".format(f),
              "TARGET_INTERVALS={}".format(targets),
              "BAIT_INTERVALS={}".format(baits),
              "OUTPUT={}".format(metrics_file),
              "VALIDATION_STRINGENCY=SILENT"]
        out = self.app.cmd.command(cl)
        if out:
            self.app._output_data["stdout"].write(out.rstrip())
def best_practice(self):
    """Deliver best practice results to the INBOX of an uppmax project.

    Requires the ``project`` and ``uppmax_project`` arguments.  Files in
    the sample directories (restricted to the given ``flowcell``, if any)
    that match the delivery patterns are handed to
    ``self._transfer_files`` with targets below
    ``/proj/<uppmax_project>/INBOX/<name>``, where ``<name>`` is the first
    non-empty value of ``outdir``, ``statusdb_project_name`` and
    ``project``.  With ``size`` set, the summed size of the source files is
    written to the ``stderr`` buffer.

    :returns: None
    """
    if not self._check_pargs(["project", "uppmax_project"]):
        return
    pargs = self.pargs
    project_path = os.path.normpath(os.path.join("/proj", pargs.uppmax_project))
    if not os.path.exists(project_path):
        self.log.warn("No such project {}; skipping".format(pargs.uppmax_project))
        return
    delivery_name = pargs.outdir or pargs.statusdb_project_name or pargs.project
    outpath = os.path.join(project_path, "INBOX", delivery_name)
    if not query_yes_no("Going to deliver data to {}; continue?".format(outpath)):
        return
    if not os.path.exists(outpath):
        self.app.cmd.safe_makedir(outpath)
    basedir = os.path.abspath(os.path.join(self._meta.root_path, self._meta.path_id))
    flist = find_samples(basedir, **vars(pargs))
    if pargs.flowcell:
        # Keep only configuration files whose parent directory is the flowcell
        flist = [fl for fl in flist
                 if os.path.basename(os.path.dirname(fl)) == pargs.flowcell]
    if len(flist) <= 0:
        self.log.info("No samples/sample configuration files found")
        return

    # Setup pattern
    plist = [".*.yaml$", ".*.metrics$"]
    if not pargs.no_bam:
        plist += [".*-{}.bam$".format(pargs.bam_file_type),
                  ".*-{}.bam.bai$".format(pargs.bam_file_type)]
    if not pargs.no_vcf:
        plist += [".*.vcf$", ".*.vcf.gz$", ".*.tbi$", ".*.tsv$"]
    pattern = "|".join(plist)

    def filter_fn(f):
        # Select a file when any of the delivery patterns matches
        if not pattern:
            return
        return re.search(pattern, f) is not None

    size = 0
    for config_file in flist:
        sources = filtered_walk(os.path.dirname(config_file),
                                filter_fn=filter_fn,
                                exclude_dirs=BCBIO_EXCLUDE_DIRS)
        # Mirror the source layout below the delivery directory
        targets = [src.replace(basedir, outpath) for src in sources]
        self._transfer_files(sources, targets)
        if pargs.size:
            size = size + sum([os.stat(src).st_size for src in sources])
    if pargs.size:
        message = "\n********************************\nEstimated delivery size: {:.1f}G\n********************************"
        self.app._output_data['stderr'].write(message.format(size / 1e9))
def compile_qc(path, application="seqcap", **kw):
    """Collect QC information for samples without access to statusdb.

    Every sample configuration file returned by ``find_samples`` is parsed
    and turned into ``SampleRunMetrics`` objects: one per multiplexed
    sample, or one per ``details`` entry when the entry has no multiplex
    section.  The objects are converted with ``_srm_to_qc`` and the result
    of ``assess_qc`` is written, one line per object, to the ``stdout``
    buffer.

    :param path: directory handed to ``find_samples``
    :param application: application name passed to ``_qc_info_header`` and ``assess_qc``
    :param **kw: keyword arguments forwarded to ``find_samples``; ``project`` is also used as the sample project name

    :returns: the output dictionary as returned by ``_qc_info_header``
    """
    output_data = {'stdout': StringIO(), 'stderr': StringIO()}
    project = kw.get("project")
    ### find_samples excruciatingly slow for multi-sample projects where we can have > 100k files...
    flist = find_samples(path, **kw)
    srm_l = []
    for config_file in flist:
        LOG.debug("Opening config file {}".format(config_file))
        # NOTE(review): yaml.load can construct arbitrary objects; only
        # use on trusted configuration files
        with open(config_file) as fh:
            runinfo_yaml = yaml.load(fh)
        for info in runinfo_yaml['details']:
            # Arguments shared by all samples of this entry
            common = dict(path=os.path.dirname(config_file),
                          flowcell=runinfo_yaml.get("fc_name"),
                          date=runinfo_yaml.get("fc_date"),
                          lane=info.get("lane"),
                          sample_prj=project)
            multiplex = info.get("multiplex")
            if multiplex:
                for mp in multiplex:
                    srm_l.append(SampleRunMetrics(
                        barcode_name=mp.get("name"),
                        barcode_id=mp.get('barcode_id'),
                        sequence=mp.get('sequence'),
                        **common))
            else:
                obj = SampleRunMetrics(
                    barcode_name=info.get("description"),
                    barcode_id=None,
                    sequence=None,
                    **common)
                # NOTE(review): picard metrics are only read in this
                # non-multiplexed branch - confirm that is intended
                obj.read_picard_metrics()
                srm_l.append(obj)
    output_data = _qc_info_header(project, application, output_data)
    qcdata = [_srm_to_qc(s) for s in srm_l]
    for v in qcdata:
        fields = [str(x) for x in assess_qc(v, application)]
        output_data["stdout"].write("".join(fields) + "\n")
    return output_data
def test_bcbb_command(self):
    """Test that the generated command uses the script matching the 'distributed' setting"""
    options = {
        'analysis': ANALYSIS_TYPE,
        'genome_build': 'rn4',
        'dry_run': False,
        'no_only_run': False,
        'google_report': False,
        'baits': 'rat_baits.interval_list',
        'targets': 'rat_targets.interval_list',
        'amplicon': True,
        'num_cores': 8,
    }
    # (value of 'distributed', script expected in the command line)
    expected = ((False, "automated_initial_analysis.py"),
                (True, "distributed_nextgen_pipeline.py"))
    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        command_file = config_file.replace("-bcbb-config.yaml", "-bcbb-command.txt")
        for distributed, script in expected:
            setup_sample(config_file, **dict(options, distributed=distributed))
            # Reading the command file checks that the setup wrote it
            with open(command_file) as fh:
                fh.read().split()
            cl, _ = run_bcbb_command(config_file)
            self.assertIn(script, cl)
def test_remove_files(self):
    """Test that files matching a keep pattern are not selected for removal"""
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$",
        "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
        "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
        "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
        "^[0-9][0-9]_.*.txt$",
    ]
    keep_re = re.compile("|".join(keep_files))

    def remove_filter_fn(f):
        # Removal candidate only if none of the keep patterns matches
        return keep_re.search(f) is None

    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        candidates = filtered_walk(os.path.dirname(config_file), remove_filter_fn)
        basenames = [os.path.basename(x) for x in candidates]
        self.assertNotIn("01_analysis_start.txt", basenames)
def test_bcbb_command(self):
    """Test that the generated command uses the script matching the 'distributed' setting"""
    options = {
        'analysis': ANALYSIS_TYPE,
        'genome_build': 'rn4',
        'dry_run': False,
        'no_only_run': False,
        'google_report': False,
        'baits': 'rat_baits.interval_list',
        'targets': 'rat_targets.interval_list',
        'amplicon': True,
        'num_cores': 8,
    }
    # (value of 'distributed', script expected in the command line)
    expected = ((False, "automated_initial_analysis.py"),
                (True, "distributed_nextgen_pipeline.py"))
    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        command_file = config_file.replace("-bcbb-config.yaml", "-bcbb-command.txt")
        for distributed, script in expected:
            setup_sample(config_file, **dict(options, distributed=distributed))
            # Reading the command file checks that the setup wrote it
            with open(command_file) as fh:
                fh.read().split()
            cl, _ = run_bcbb_command(config_file)
            self.assertIn(script, cl)
def test_remove_dirs(self):
    """Test that the fastqc directory is selected for removal before a rerun"""
    keep_files = [
        "-post_process.yaml$", "-post_process.yaml.bak$",
        "-bcbb-config.yaml$", "-bcbb-config.yaml.bak$",
        "-bcbb-command.txt$", "-bcbb-command.txt.bak$",
        "_[0-9]+.fastq$", "_[0-9]+.fastq.gz$",
    ]
    keep_re = re.compile("|".join(keep_files))

    def remove_filter_fn(f):
        # Removal candidate only if none of the keep patterns matches
        return keep_re.search(f) is None

    flist = find_samples(j_doe_00_05)
    for config_file in flist:
        candidates = filtered_walk(os.path.dirname(config_file),
                                   remove_filter_fn, get_dirs=True)
        basenames = [os.path.basename(x) for x in candidates]
        self.assertIn("fastqc", basenames)
def test_merge_sample_config(self):
    """Test that merging sample configurations places the fastq files of both flowcells in TOTAL"""
    sample = "P001_101_index3"
    grouped = _group_samples(find_samples(j_doe_00_05))
    out_d = os.path.join(j_doe_00_05, sample, "TOTAL")
    if not os.path.exists(out_d):
        os.makedirs(out_d)
    merge_sample_config(grouped[sample].values(), sample, out_d=out_d, dry_run=False)
    expected_files = (
        "P001_101_index3_B002BBBXX_TGACCA_L001_R1_001.fastq.gz",
        "P001_101_index3_C003CCCXX_TGACCA_L001_R1_001.fastq.gz",
    )
    for fastq in expected_files:
        self.assertTrue(os.path.exists(os.path.join(out_d, fastq)))
def vcf_summary(self):
    """Compress and index the vcf files of a project and link them to a result directory.

    Requires the ``project`` argument.  Every vcf file reported by
    ``get_vcf_files`` is bgzip-compressed (unless it already ends in
    ``.gz``), tabix-indexed, and both the compressed file and its index are
    linked into ``intermediate/results/vcf`` below the project directory.
    In the link name, ``TOTAL`` is replaced by ``TOTAL_<key>``, where
    ``<key>`` is the key of the file in the dictionary returned by
    ``get_vcf_files``.

    :returns: None
    """
    if not self._check_pargs(["project"]):
        return
    flist = find_samples(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id)),
        **vars(self.pargs))
    vcf_d = get_vcf_files(flist)
    ## Traverse files, copy to result directory, run bgzip and tabix, and merge vcfs to one file
    outdir = os.path.join(
        os.path.abspath(os.path.join(self.app.controller._meta.project_root,
                                     self.app.controller._meta.path_id,
                                     "intermediate", "results", "vcf")))
    if not os.path.exists(outdir):
        self.app.cmd.safe_makedir(outdir)
    for k, v in vcf_d.items():
        # Existing output, kept as is (same text under Python 2 and 3)
        print(v)
        if v.endswith(".gz"):
            # Work with the uncompressed name; strip the suffix only
            v = v[:-len(".gz")]
        else:
            ## bgzip
            LOG.info("Running bgzip on {}".format(v))
            self.app.cmd.command(["bgzip", v])
        # Define the link target for both branches; it used to be set only
        # for already compressed input
        tgt = os.path.join(
            outdir, os.path.basename(v).replace("TOTAL", "TOTAL_{}".format(k)))
        ## tabix
        LOG.info("Running tabix on {}.gz".format(v))
        self.app.cmd.command(["tabix", "-f", "-p", "vcf", "{}.gz".format(v)])
        self.app.cmd.link("{}.gz".format(v), "{}.gz".format(tgt))
        self.app.cmd.link("{}.gz.tbi".format(v), "{}.gz.tbi".format(tgt))
def test_find_samples(self):
    """Test the number of samples found, with and without the only_failed option"""
    n_all = len(find_samples(j_doe_00_05))
    self.assertIn(n_all, [3, 4])
    n_failed = len(find_samples(j_doe_00_05, only_failed=True))
    self.assertIn(n_failed, [0, 1])
def test_find_samples_from_file_with_yaml(self):
    """Test that validation fails for the samples listed in samples2.txt, where a bcbb-config.yaml file lies directly under the root directory"""
    sample_file = os.path.join(j_doe_00_05, "samples2.txt")
    flist = find_samples(j_doe_00_05, sample=sample_file)
    with self.assertRaises(Exception):
        validate_sample_directories(flist, j_doe_00_05)
def run(self):
    """Set up and start the bcbb pipeline for the samples of a project.

    Requires the ``project`` argument.  Sample configuration files are
    collected below ``<root_path>/<path_id>`` (optionally after converting
    sample sheets and setting up merged samples), the user is asked for
    confirmation, every sample is set up with ``setup_sample`` and, unless
    ``only_setup`` is given, one pipeline command is launched per sample.

    :returns: None; the method works through side effects only
    """
    if not self._check_pargs(["project"]):
        return
    if self.pargs.post_process:
        self.pargs.post_process = os.path.abspath(self.pargs.post_process)
    basedir = os.path.abspath(
        os.path.join(self.app.controller._meta.root_path,
                     self.app.controller._meta.path_id))
    if self.pargs.from_ssheet:
        # Plain loop: the conversion is run for its side effects only
        for fn in find_samples(basedir, pattern="SampleSheet.csv$",
                               **vars(self.pargs)):
            samplesheet_csv_to_yaml(fn)
    flist = find_samples(basedir, **vars(self.pargs))
    # Add filtering on flowcell if necessary
    self._meta.pattern = ".*"
    flist = [x for x in flist if self._filter_fn(x)]
    if self.pargs.merged:
        ## Setup merged samples and append to flist if new list longer
        flist = setup_merged_samples(flist, **vars(self.pargs))
    if not flist:
        self.log.info("No sample configuration files found")
        return
    if not query_yes_no(
            "Going to start {} jobs... Are you sure you want to continue?".format(len(flist)),
            force=self.pargs.force):
        return
    # Make absolutely sure analysis directory is a *subdirectory* of the working directory
    validate_sample_directories(flist, basedir)
    orig_dir = os.path.abspath(os.getcwd())
    for run_info in flist:
        os.chdir(os.path.abspath(os.path.dirname(run_info)))
        try:
            setup_sample(run_info, **vars(self.pargs))
        finally:
            # Restore the working directory even if the setup fails
            os.chdir(orig_dir)
    if self.pargs.only_setup:
        return
    if self.pargs.only_failed:
        # Evaluate the status once per sample and reuse it for the filter
        status = {x: self._sample_status(x) for x in flist}
        flist = [x for x in flist if status[x] == "FAIL"]
    ## Here process files again, removing if requested, and running the pipeline
    for run_info in flist:
        work_dir = os.path.dirname(run_info)
        self.app.log.info(
            "Running analysis defined by config file {}".format(run_info))
        os.chdir(os.path.abspath(work_dir))
        try:
            if self.app.cmd.monitor(work_dir=work_dir):
                self.app.log.warn("Not running job")
                continue
            if self.pargs.restart:
                self.app.log.info(
                    "Removing old analysis files in {}".format(work_dir))
                remove_files(run_info, **vars(self.pargs))
            (cl, platform_args) = run_bcbb_command(run_info, **vars(self.pargs))
            self.app.cmd.command(
                cl,
                **{'platform_args': platform_args,
                   'saveJobId': True,
                   'workingDirectory': work_dir})
        finally:
            # Also reached on ``continue``: a skipped job must not leave the
            # process inside the sample directory, otherwise relative
            # config paths of later samples would resolve wrongly
            os.chdir(orig_dir)
def test_summarize_variants(self):
    """Test that vcf files can be collected for the found samples (no assertions, must only run without error)"""
    get_vcf_files(find_samples(j_doe_00_01))
def test_setup_samples(self):
    """Test that sample setup writes genome rn4 and the requested options to the config files"""
    def read_yaml(fname):
        # Parse a yaml file and return its content
        with open(fname) as fh:
            return yaml.load(fh)

    def assert_genome_is_rn4(config_file):
        # The genome build is stored per multiplexed sample if present,
        # otherwise directly on the first details entry
        details = read_yaml(config_file)["details"][0]
        if details.get("multiplex", None):
            self.assertEqual(details["multiplex"][0]["genome_build"], "rn4")
        else:
            self.assertEqual(details["genome_build"], "rn4")

    def post_process_config(config_file):
        return read_yaml(
            config_file.replace("-bcbb-config.yaml", "-post_process.yaml"))

    common = {
        'genome_build': 'rn4',
        'dry_run': False,
        'baits': 'rat_baits.interval_list',
        'targets': 'rat_targets.interval_list',
        'num_cores': 8,
        'distributed': False,
    }
    seqcap_opts = dict(common, analysis='Align_standard_seqcap')
    amplicon_opts = dict(common, analysis=ANALYSIS_TYPE, no_only_run=True,
                         google_report=True, amplicon=True)

    flist = find_samples(j_doe_00_05)
    # First pass: set up all samples, then verify every configuration
    for f in flist:
        setup_sample(f, **seqcap_opts)
    for f in flist:
        assert_genome_is_rn4(f)
        config = post_process_config(f)
        custom = config["custom_algorithms"][ANALYSIS_TYPE]
        self.assertEqual(custom["hybrid_bait"], 'rat_baits.interval_list')
        self.assertEqual(custom["hybrid_target"], 'rat_targets.interval_list')
        self.assertEqual(config["algorithm"]["num_cores"], 8)
    # Second pass: amplicon setup, verified sample by sample
    for f in flist:
        setup_sample(f, **amplicon_opts)
        assert_genome_is_rn4(f)
        config = post_process_config(f)
        self.assertEqual(config["algorithm"]["mark_duplicates"], False)
        self.assertEqual(
            config["custom_algorithms"][ANALYSIS_TYPE]["mark_duplicates"],
            False)
def test_find_samples(self):
    """Test the number of samples found, with and without the only_failed option"""
    n_all = len(find_samples(j_doe_00_05))
    self.assertIn(n_all, [3, 4])
    n_failed = len(find_samples(j_doe_00_05, only_failed=True))
    self.assertIn(n_failed, [0, 1])
def test_setup_merged_samples(self):
    """Test that merged samples can be set up for the found samples"""
    setup_merged_samples(find_samples(j_doe_00_05), dry_run=False)