def setUp(self):
    self.dir_n = os.path.dirname(__file__)
    self.config_fn = os.path.join(self.dir_n, "data", "laims.json")
    self.laimsapp = LaimsApp(config_file=self.config_fn)
    self.laimsapp.share_dir = os.path.join(os.path.dirname(self.dir_n), "share")
def cohorts_link_cmd(name, samples):
    """
    Link samples to a cohort
    """
    sm = LaimsApp().db_connection()
    session = sm()

    # A single existing file is treated as an FOF of sample names to link.
    if len(samples) == 1 and os.path.exists(samples[0]):
        with open(samples[0], "r") as f:
            names_to_link = set()
            for l in f.readlines():
                names_to_link.add(l.rstrip())
    else:
        names_to_link = set(samples)

    stats = {"add": 0, "skip": 0}
    for sample_name in names_to_link:
        try:
            sample = session.query(ComputeWorkflowSample).filter_by(ingest_sample_name=sample_name).one()
        except NoResultFound:
            raise Exception("Could not find sample named {}".format(sample_name))
        cohort = session.query(SampleCohort).get((sample.id, name))
        if cohort is None:
            cohort = SampleCohort(name=name, sample_id=sample.id)
            session.add(cohort)
            stats["add"] += 1
        else:
            stats["skip"] += 1
    session.commit()
    sys.stderr.write("Added {} samples to cohort {}, skipped {} existing.\n".format(stats["add"], name, stats["skip"]))
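# A minimal usage sketch, not from the source: assuming cohorts_link_cmd is a
# click command (the other cmds here are invoked via CliRunner), linking one
# sample by name to a hypothetical cohort "my-cohort" might look like this.
from click.testing import CliRunner
runner = CliRunner()
result = runner.invoke(cohorts_link_cmd, ["my-cohort", "H_XY-BGM1073006"])
assert result.exit_code == 0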
def test2_laims_metrics_add(self):
    runner = CliRunner()
    result = runner.invoke(metrics_add_cmd, ["--help"])
    self.assertEqual(result.exit_code, 0)

    sm = LaimsApp().db_connection()
    session = sm()
    for sample in session.query(ComputeWorkflowSample):
        sample.analysis_cram_path = self.sample_dn
        session.add(sample)
        session.flush()
    session.commit()

    result = runner.invoke(metrics_add_cmd)
    try:
        self.assertEqual(result.exit_code, 0)
        expected_output = """STATUS:
NO_DIR: 0
NO_PICARD_WGS: 0
NO_VERIFY_BAMID: 0
OK: 10
"""
        self.assertEqual(result.output, expected_output)
    except:
        print(result.output)
        raise
def cli(ctx, config, database, job_group, queue, job_stdout):
    conf = {
        "database": database,
        "job_group": job_group,
        "queue": queue,
        "stdout": job_stdout,
    }
    ctx.obj = LaimsApp(config_file=config, config=conf)
def setUp(self):
    self.data_d = os.path.join(os.path.dirname(__file__), "data")
    self.temp_d = tempfile.TemporaryDirectory()
    self.database_fn = os.path.join(self.temp_d.name, "test.db")
    shutil.copyfile(os.path.join(self.data_d, "test.db"), self.database_fn)
    laimsapp = LaimsApp()
    laimsapp.database = self.database_fn
def test_sample_cohort(self):
    sm = LaimsApp().db_connection()
    session = sm()
    cohorts = session.query(SampleCohort).filter_by(sample_id=8).all()
    self.assertEqual(len(cohorts), 1)
    sample = cohorts[0].sample
    self.assertEqual(sample.ingest_sample_name, "H_XY-BGM1073006")
    self.assertEqual(sample.ingest_sample_name, cohorts[0].sample.ingest_sample_name)
def test_sample_file(self):
    sm = LaimsApp().db_connection()
    session = sm()
    files = session.query(SampleFile).filter_by(sample_id=8).all()
    self.assertEqual(len(files), 2)
    sample = files[0].sample
    self.assertEqual(sample.ingest_sample_name, "H_XY-BGM1073006")
    self.assertEqual(sample.ingest_sample_name, files[0].sample.ingest_sample_name)
def test2_generic_rsync_command(self):
    laimsapp = LaimsApp()
    cmd = GenericRsyncCmd()
    cmdline = ['rsync', '--verbose', '--archive', 'INPUT1', 'INPUT2', 'INPUT3', 'OUTDIR/']
    self.assertEqual(cmd(['INPUT1', 'INPUT2', 'INPUT3'], 'OUTDIR'), cmdline)
def test_sample_metric(self):
    sm = LaimsApp().db_connection()
    session = sm()
    metrics = session.query(SampleMetric).filter_by(sample_id=8).all()
    self.assertEqual(len(metrics), 2)
    sample = metrics[0].sample
    self.assertEqual(sample.ingest_sample_name, "H_XY-BGM1073006")
    self.assertEqual(sample.ingest_sample_name, metrics[1].sample.ingest_sample_name)
def __init__(self, reference):
    app = LaimsApp()
    cmd_conf = app.rewrite_gvcfs
    self.cmd = (
        'java -Xmx{max_mem} -Xms{max_stack} -jar {gatk_jar} -T CombineGVCFs'
        ' -R {ref} --breakBandsAtMultiplesOf {break_multiple}'
        ' -V {{input}} -o {{temp_output}}'
        ' && mv {{temp_output}} {{output}}'
        ' && mv {{temp_output}}.tbi {{output}}.tbi'
    ).format(
        max_mem=cmd_conf["max_mem"],
        max_stack=cmd_conf["max_stack"],
        gatk_jar=app.gatk_jar,
        ref=str(reference),
        break_multiple=cmd_conf["break_multiple"],
    )
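# A hedged sketch (assumed, not from the source) of the __call__ counterpart
# this class needs: test1_rewrite_gvcf_cmd below shows cmd(input, output)
# staging through "<output>.tmp.vcf.gz", which this reproduces.
def __call__(self, input_gvcf, output_gvcf):
    temp_output = output_gvcf + '.tmp.vcf.gz'
    return self.cmd.format(input=input_gvcf, temp_output=temp_output, output=output_gvcf)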
def test3_lims_db(self):
    laimsapp = LaimsApp(config_file=self.config_fn, config={"database": "NOTDB"})
    self.assertIsNotNone(laimsapp)
    lims_db_url = laimsapp.lims_db_url
    self.assertIsNotNone(lims_db_url)
    LaimsApp.lims_db_url = None
    self.assertIsNone(laimsapp.lims_db_url)
    with self.assertRaisesRegex(Exception, "No lims_db_url"):
        laimsapp.lims_db_connection()
def cohorts_list_cmd():
    """
    List cohorts and sample counts
    """
    sm = LaimsApp().db_connection()
    session = sm()
    sql = "select name, count(*) as sample_count from sample_cohorts group by name"
    result = session.execute(sql)
    rows = []
    for cohort in result.fetchall():
        rows += [list(map(str, cohort))]
    sys.stdout.write(
        tabulate.tabulate(rows, ["NAME", "SAMPLE_COUNT"], tablefmt="simple"))
def test1_rewrite_gvcf_cmd(self):
    laimsapp = LaimsApp(config_file=os.path.join(os.path.dirname(__file__), "data", "laims.json"))
    cmd = RewriteGvcfCmd(
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
    )
    self.assertIsNotNone(cmd)
    cmdline = cmd('input.gvcf.gz', 'output.gvcf.gz')
    self.assertEqual(
        'java -Xmx3500M -Xms3500M -jar /gatk/gatk-package-4.0.6.0-local.jar -T CombineGVCFs -R /gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa --breakBandsAtMultiplesOf 1000000 -V input.gvcf.gz -o output.gvcf.gz.tmp.vcf.gz && mv output.gvcf.gz.tmp.vcf.gz output.gvcf.gz && mv output.gvcf.gz.tmp.vcf.gz.tbi output.gvcf.gz.tbi',
        cmdline)
def setUp(self):
    self.data_dn = os.path.join(os.path.dirname(__file__), "data")
    self.sample_dn = os.path.join(self.data_dn, "samples", "H_XS-356091-0186761975")
    self.temp_d = tempfile.TemporaryDirectory()
    self.database_fn = os.path.join(self.temp_d.name, "test.db")
    shutil.copyfile(os.path.join(self.data_dn, "test.db"), self.database_fn)
    laimsapp = LaimsApp()
    laimsapp.database = self.database_fn
def test2_init(self):
    # init w/o config
    laimsapp = LaimsApp()
    self.assertIsNotNone(LaimsApp.context)

    # init w/ config
    LaimsApp.context = None  # reset
    laimsapp = LaimsApp(config_file=self.config_fn, config={"database": "NOTDB"})
    self.assertIsNotNone(laimsapp)
    self.assertIsNotNone(laimsapp.context)
    self.assertEqual(laimsapp.config_file, self.config_fn)
    self.assertEqual(laimsapp.environment, 'test')
    self.assertEqual(laimsapp.database, 'NOTDB')
    self.assertEqual(laimsapp.lims_db_url, 'sqlite:///:memory:')

    # __setattr__
    self.assertIsNone(laimsapp.foo)
    laimsapp.foo = "bar"
    self.assertEqual(laimsapp.foo, "bar")
    self.assertEqual(LaimsApp().foo, "bar")
def test2_lsf_job(self, subprocess_patch):
    laimsapp = LaimsApp()
    config = laimsapp.lsf_job_options()
    config.pop("queue", None)
    config.pop("stdout", None)
    print(config)
    job = LsfJob(config)
    self.assertTrue(isinstance(job, LsfJob))

    available_opts = LsfJob.available_options
    self.assertEqual(len(available_opts), 9, "available options count is 9")

    # base command: only the docker and email options survive the pops above
    expected_cmd = [
        'bsub',
        '-a', 'docker(registry.gsc.wustl.edu/mgi/laims:latest)',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(job.bsub_cmd(['echo', 'hello', 'world']), expected_cmd)

    # per-command options override created options; stdout is used verbatim
    job.created_options["stdout"] = "/var/log/out"
    expected_cmd = [
        'bsub',
        '-M', '10000000',
        '-R', '"select[mem>10000] rusage[mem=10000]"',
        '-a', 'docker(hello-world)',
        '-oo', '/var/log/out',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(
        job.bsub_cmd(['echo', 'hello', 'world'],
                     {"docker": "hello-world", "memory_in_gb": 10}),
        expected_cmd)

    # when stdout is a directory, stdout_bn supplies the log basename
    job.created_options["stdout"] = "/var/log"
    expected_cmd = [
        'bsub',
        '-M', '10000000',
        '-R', '"select[mem>10000] rusage[mem=10000]"',
        '-a', 'docker(hello-world)',
        '-oo', '/var/log/log1.out',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(
        job.bsub_cmd(['echo', 'hello', 'world'],
                     {"docker": "hello-world", "memory_in_gb": 10, "stdout_bn": "log1.out"}),
        expected_cmd)

    # a non-zero return code from the patched subprocess means launch fails
    subprocess_patch.return_value = 1
    self.assertFalse(
        job.launch(['echo', 'hello', 'world'], {"docker": "hello-world"}),
        expected_cmd)
def test4_job_options(self):
    laimsapp = LaimsApp(config_file=self.config_fn)
    laimsapp.queue = "ccdg"
    laimsapp.stdout = "/var/log/out"
    self.assertTrue(laimsapp)
    opts = laimsapp.lsf_job_options()
    expected_opts = {
        "email": "*****@*****.**",
        "queue": "ccdg",
        "docker": "registry.gsc.wustl.edu/mgi/laims:latest",
        "stdout": "/var/log/out",
    }
    self.assertDictEqual(opts, expected_opts, "LSF job options fetched from config")
def get_read_counts(self, read_groups):
    sql = """
    select ii.seq_id, ii.filt_clusters * 2 as total_clusters
    from index_illumina ii
    where ii.seq_id in ( {{ seq_ids | join(', ') }} )
    """
    template = Template(sql)
    rendered_sql = template.render(seq_ids=read_groups)
    db = LaimsApp().lims_db_connection()
    result = db.query(rendered_sql)
    data = [{'seq_id': row['seq_id'], 'filt_clusters': row['total_clusters']}
            for row in result]
    return data
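# A quick, runnable illustration of the Jinja rendering above with
# hypothetical seq_ids; the join filter inlines the ids into the IN clause.
from jinja2 import Template
print(Template("where seq_id in ( {{ seq_ids | join(', ') }} )").render(seq_ids=[12345, 67890]))
# prints: where seq_id in ( 12345, 67890 )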
def metrics_add_cmd():
    """
    Add Sample Metrics Into DB

    Currently adds these metrics from ALL samples:
    * picard wgs - mean coverage
    * verify bam id - freemix
    """
    db = LaimsApp().db_connection()
    session = db()
    status = {"OK": 0, "NO_DIR": 0, "NO_VERIFY_BAMID": 0, "NO_PICARD_WGS": 0}
    for sample in session.query(ComputeWorkflowSample):
        dn = sample.analysis_cram_path
        qc_dn = os.path.join(dn, "qc")
        if not os.path.exists(dn) or not os.path.exists(qc_dn):
            status["NO_DIR"] += 1
            continue

        # verifyBamID
        qc = QcMetrics(dn=qc_dn)
        try:
            verifyBamID_metrics = qc.verifyBamID_metrics()
        except Exception:
            status["NO_VERIFY_BAMID"] += 1
            continue
        _add_or_update_metrics(session=session, sample=sample, metrics=verifyBamID_metrics, names=["FREEMIX"])

        # picard wgs
        try:
            picard_wgs_metrics = qc.picard_wgs_metrics()
        except Exception:
            status["NO_PICARD_WGS"] += 1
            continue
        _add_or_update_metrics(session=session, sample=sample, metrics=picard_wgs_metrics, names=["MEAN_COVERAGE"])

        status["OK"] += 1
    sys.stderr.write("STATUS:\n" + yaml.dump(status, indent=6))
def verify_bulk_gvcfs(tsv_path, reference_path):
    os.environ['LSF_DOCKER_PRESERVE_ENVIRONMENT'] = 'false'
    job_opts = LaimsApp().lsf_job_options()
    job_opts["memory_in_gb"] = 10
    job_runner = LsfJob(job_opts)
    with open(tsv_path) as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            interval = get_interval_from_path(row[0])
            cmd = [
                "laims", "verify-gvcf",
                "--gvcf-path", row[0],
                "--reference-path", reference_path,
                "--interval", interval,
            ]
            # stdout_bn (see the LsfJob tests) names the per-gvcf log file
            job_runner.launch(cmd, cmd_options={
                "stdout_bn": ".".join([os.path.basename(row[0]), "out"]),
            })
def sample_list_cmd(filter_by):
    """
    List samples and show their attributes
    """
    sm = LaimsApp().db_connection()
    session = sm()
    if filter_by is not None:
        sample_iter = session.query(ComputeWorkflowSample).filter_by(source_work_order=filter_by)
    else:
        sample_iter = session.query(ComputeWorkflowSample)
    rows = []
    for sample in sample_iter:
        rows += [list(map(str, [sample.id, sample.ingest_sample_name, sample.source_work_order]))]
    sys.stdout.write(
        tabulate.tabulate(rows, ["ID", "NAME", "WORK_ORDER"], tablefmt="simple"))
def update_cmd(fof, key):
    """
    Update Samples Files

    Give an FOF of files to update. The sample name should be derivable
    from the filename. If the --key option is not given, the file's
    extension will be used as its key.
    """
    sm = LaimsApp().db_connection()
    session = sm()
    with open(fof, "r") as f:
        for fn in f.readlines():
            fn = fn.rstrip()
            bn = os.path.basename(fn)
            tokens = bn.split(".")
            sample_name = tokens[0]
            sample = session.query(ComputeWorkflowSample).filter_by(ingest_sample_name=sample_name).first()
            if sample is None:
                sys.stderr.write("NO_SAMPLE {} {}\n".format(sample_name, fn))
                continue
            _key = key
            if _key is None:
                _key = tokens[-1]
            if _key not in valid_keys:
                sys.stderr.write("INVALID_KEY {} {}\n".format(_key, fn))
                continue
            sample_file = session.query(SampleFile).get((sample.id, _key))
            if sample_file is not None:
                sample_file.value = fn
            else:
                sample_file = SampleFile(sample_id=sample.id, name=_key, value=fn)
                session.add(sample_file)
            sys.stderr.write("OK {} {} {}\n".format(sample_name, _key, fn))
    session.commit()
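# Example FOF content for update_cmd (hypothetical paths). The sample name is
# the first dot-delimited token of the basename; with no --key, the extension
# ("cram" here) becomes the file's key:
#
#   /data/crams/H_XY-BGM1073006.cram
#   /data/crams/H_XS-356091-0186761975.cram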
def setUp(self):
    self.laimsapp = LaimsApp(config_file=os.path.join(
        os.path.dirname(__file__), "data", "laims.json"))
def setUp(self):
    self.data_d = os.path.join(os.path.dirname(__file__), "data")
    self.tsv_path = os.path.join(self.data_d, "gvcfs.tsv")
    self.ref_fn = os.path.join(self.data_d, "ref.fa")
    LaimsApp(config={"queue": "ccdg"})
def test_sample(self):
    sm = LaimsApp().db_connection()
    session = sm()
    sample = session.query(ComputeWorkflowSample).get(8)
    self.assertIsNotNone(sample)
def downsample_and_recall(app, inputs, output_dir):
    log_dir = os.path.join(output_dir, 'logs')
    os.mkdir(log_dir)
    os.mkdir(os.path.join(output_dir, 'results'))

    cromwell_job_opts = {
        'memory_in_gb': 32,
        'queue': app.queue,
        'docker': app.docker,
        'stdout': os.path.join(log_dir, 'cromwell.log'),
    }
    if app.job_group is not None:
        cromwell_job_opts['group'] = app.job_group
    job_runner = LsfJob(cromwell_job_opts)

    chrs = [["chr{}".format(c)] for c in range(1, 23)]
    chrs.extend([
        ["chrX"],
        ["chrY"],
        ["/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.filtered-chromosome.ext.list"],
    ])

    workflow_inputs = {
        'reference': '/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        'downsample_strategy': 'ConstantMemory',
        'downsample_seed': 1,
        'emit_reference_confidence': 'GVCF',
        'max_alternate_alleles': 3,
        'variant_index_type': 'LINEAR',
        'variant_index_parameter': 128000,
        'read_filter': 'OverclippedRead',
        'intervals': chrs,
        'qc_minimum_mapping_quality': 0,
        'qc_minimum_base_quality': 0,
        'crams_to_downsample': [],  # filled in from "inputs" file below
    }
    with open(inputs) as fh:
        reader = csv.reader(fh, delimiter='\t')
        for row in reader:
            sam = row[0]
            ratio = row[1]
            freemix = row[2]
            workflow_inputs['crams_to_downsample'].append({
                'cram': {'class': 'File', 'path': sam},
                'downsample_ratio': ratio,
                'contamination': freemix,
            })

    input_yaml_path = os.path.join(output_dir, 'inputs.yaml')
    with open(input_yaml_path, 'w') as yaml_fh:
        yaml.dump(workflow_inputs, yaml_fh)

    fs_loader = FileSystemLoader(searchpath=LaimsApp().share_dir)
    env = Environment(loader=fs_loader, autoescape=True)
    template = env.get_template('cromwell.config.jinja')
    cromwell_config_path = os.path.join(output_dir, 'cromwell.config')
    template.stream(
        log_dir=log_dir,
        output_dir=output_dir,
        lsf_queue=app.queue,
        lsf_job_group=app.job_group,
    ).dump(cromwell_config_path)

    cmd = [
        '/usr/bin/java', '-Dconfig.file=' + cromwell_config_path, '-Xmx24g',
        '-jar', '/opt/cromwell.jar', 'run',
        '-t', 'cwl',
        '-i', input_yaml_path,
        'https://raw.githubusercontent.com/tmooney/cancer-genomics-workflow/downsample_and_recall/definitions/pipelines/gathered_downsample_and_recall.cwl',  # TODO get a more canonical URL once things are merged
    ]
    job_runner.launch(cmd)
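# Example row of the "inputs" TSV consumed above (hypothetical values), with
# tab-separated columns: cram path, downsample_ratio, contamination (freemix):
#
#   /data/crams/H_XY-BGM1073006.cram    0.75    0.002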
def test1_init_fails(self):
    # FIXME This test does not raise an exception in the test suite, dunno why
    with self.assertRaisesRegex(
            Exception, "Given config file /laims.json does not exist!"):
        laimsapp = LaimsApp(config_file="/laims.json")