def ingest(app, csv_file, output_dir, force=False):
    """Ingest a sample sheet CSV, preprocess each source directory, and
    record one ComputeWorkflowSample row per unique sample."""
    Session = open_db(app.database)
    default_job_options = {
        'memory_in_gb': 5,
        'queue': app.queue,
        'docker': app.docker,
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    preprocessor = B38Preprocessor(output_dir,
                                   job_runner=LsfJob(default_job_options),
                                   force=force)
    # Map sample sheet headers to model attribute names.
    columns = {
        'Compute Workflow Execution': 'compute_workflow_execution',
        'Work Order': 'work_order',
        'DNA': 'ingest_sample_name',
        'WOI': 'woi',
        'Working Directory': 'source_directory',
    }
    seen = set()
    with open(csv_file) as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            output_json = {columns[key]: row[key] for key in columns}
            seen_key = (output_json['source_directory'],
                        output_json['ingest_sample_name'],
                        output_json['work_order'])
            if seen_key in seen:
                logger.info('Duplicate row with identical source directory, '
                            'sample name and work order. Skipping...')
                continue
            seen.add(seen_key)
            # The preprocessor returns the output directory on success and
            # None when the source directory is invalid.
            outdir = preprocessor(output_json['source_directory'])
            is_valid = outdir is not None
            session = Session()
            session.add(ComputeWorkflowSample(
                source_work_order=output_json['work_order'],
                ingest_sample_name=output_json['ingest_sample_name'],
                source_directory=output_json['source_directory'],
                woi=output_json['woi'],
                valid_source_directory=is_valid,
                analysis_cram_path=outdir,
                analysis_gvcf_path=outdir,
            ))
            session.commit()
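# Usage sketch (illustrative, not part of the pipeline): a minimal sample
# sheet of the shape `ingest` consumes. Header names must match the keys of
# the `columns` mapping above; the paths and identifiers are invented
# placeholders, and the module-level `csv` import is assumed.
def _example_ingest_csv(path):
    """Write a tiny sample sheet for `ingest` (placeholder values only)."""
    header = ['Compute Workflow Execution', 'Work Order', 'DNA', 'WOI',
              'Working Directory']
    rows = [
        ['cwe-001', 'WO-123', 'SAMPLE-1', 'WOI-1', '/data/runs/sample1'],
        # Same (directory, sample, work order) triple as above, so `ingest`
        # would log a duplicate and skip this row even though the WOI differs.
        ['cwe-002', 'WO-123', 'SAMPLE-1', 'WOI-2', '/data/runs/sample1'],
    ]
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)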
def test2_lsf_job(self, subprocess_patch):
    # `subprocess_patch` is expected to be injected by a mock.patch decorator
    # on this test (not shown in this excerpt).
    laimsapp = LaimsApp()
    config = laimsapp.lsf_job_options()
    config.pop("queue", None)
    config.pop("stdout", None)
    job = LsfJob(config)
    self.assertTrue(isinstance(job, LsfJob))

    available_opts = LsfJob.available_options
    self.assertEqual(len(available_opts), 9, "available options count is 9")

    expected_cmd = [
        'bsub',
        '-a', 'docker(registry.gsc.wustl.edu/mgi/laims:latest)',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(job.bsub_cmd(['echo', 'hello', 'world']), expected_cmd)

    # Per-call options override the stored configuration.
    job.created_options["stdout"] = "/var/log/out"
    expected_cmd = [
        'bsub',
        '-M', '10000000',
        '-R', '"select[mem>10000] rusage[mem=10000]"',
        '-a', 'docker(hello-world)',
        '-oo', '/var/log/out',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(
        job.bsub_cmd(['echo', 'hello', 'world'],
                     {"docker": "hello-world", "memory_in_gb": 10}),
        expected_cmd)

    # When stdout is a directory, the per-job basename (stdout_bn) is
    # appended to form the log path.
    job.created_options["stdout"] = "/var/log"
    expected_cmd = [
        'bsub',
        '-M', '10000000',
        '-R', '"select[mem>10000] rusage[mem=10000]"',
        '-a', 'docker(hello-world)',
        '-oo', '/var/log/log1.out',
        '-N', '-u', '*****@*****.**',
        'echo', 'hello', 'world',
    ]
    self.assertEqual(
        job.bsub_cmd(['echo', 'hello', 'world'],
                     {"docker": "hello-world",
                      "memory_in_gb": 10,
                      "stdout_bn": "log1.out"}),
        expected_cmd)

    # launch() reports failure when the underlying subprocess call returns a
    # non-zero exit status.
    subprocess_patch.return_value = 1
    self.assertFalse(
        job.launch(['echo', 'hello', 'world'], {"docker": "hello-world"}))
def verify_bulk_gvcfs(tsv_path, reference_path):
    os.environ['LSF_DOCKER_PRESERVE_ENVIRONMENT'] = 'false'
    job_opts = LaimsApp().lsf_job_options()
    job_opts["memory_in_gb"] = 10
    job_runner = LsfJob(job_opts)
    with open(tsv_path) as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            gvcf_path = row[0]
            interval = get_interval_from_path(gvcf_path)
            cmd = [
                "laims", "verify-gvcf",
                "--gvcf-path", gvcf_path,
                "--reference-path", reference_path,
                "--interval", interval,
            ]
            job_runner.launch(cmd, cmd_options={
                "stdbn": ".".join([os.path.basename(gvcf_path), "out"]),
            })
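# Input sketch (illustrative): `verify_bulk_gvcfs` reads a tab-separated file
# whose first column is a gVCF path; `get_interval_from_path` (defined
# elsewhere in this package) derives the interval from that path. The paths
# below are invented placeholders.
def _example_gvcf_tsv(path):
    """Write a tiny TSV of the shape `verify_bulk_gvcfs` reads (placeholders)."""
    with open(path, 'w') as f:
        f.write('/data/gvcfs/SAMPLE-1.chr1.g.vcf.gz\n')
        f.write('/data/gvcfs/SAMPLE-1.chr2.g.vcf.gz\n')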
def oldband(app, output_dir, workorders):
    """Submit per-chromosome oldband/rewrite GVCF jobs for every verified
    sample in the given work orders."""
    os.environ['LSF_NO_INHERIT_ENVIRONMENT'] = 'true'
    default_job_options = {
        'memory_in_gb': 10,
        'queue': app.queue,
        'docker': 'registry.gsc.wustl.edu/genome/gatk-3.5-0-g36282e4:1',
    }
    if app.job_group is not None:
        default_job_options['group'] = app.job_group
    job_runner = LsfJob(default_job_options)
    logdir = os.path.join(output_dir, 'log')
    Session = open_db(app.database)
    cmd = OldbandandRewriteGvcfCmd(
        java='/usr/bin/java',
        max_mem='8G',
        max_stack='8G',
        gatk_jar='/opt/GenomeAnalysisTK.jar',
        reference='/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        break_multiple=1000000)
    for wo in workorders:
        session = Session()
        for sample in session.query(ComputeWorkflowSample).filter(
                ComputeWorkflowSample.source_work_order == wo):
            if not sample.analysis_cram_verifyed:
                continue
            cram_path = sample.analysis_cram_path
            sample_name = os.path.basename(cram_path)
            cram_file = os.path.join(cram_path, '{}.cram'.format(sample_name))
            oldband_path = os.path.join(sample.analysis_gvcf_path,
                                        'oldbanded_gvcfs')
            force_make_dirs(oldband_path)
            stdout_dir = os.path.join(logdir, sample_name)

            # One job per primary chromosome; skip outputs that already have
            # both the gzvcf and its index.
            for chrom in chromosomes:
                new_gzvcf = '{0}.{1}.g.vcf.gz'.format(sample_name, chrom)
                output_gzvcf = os.path.join(oldband_path, new_gzvcf)
                if (not os.path.exists(output_gzvcf)
                        or not os.path.exists(output_gzvcf + '.tbi')):
                    stdout = os.path.join(stdout_dir,
                                          new_gzvcf + '.oldbanded.log')
                    cmdline = cmd(cram_file, output_gzvcf, chrom)
                    job_runner.launch(cmdline, {'stdout': stdout})

            # The extended (non-primary) chromosomes run together as one job,
            # via a wrapper script that removes itself on success.
            chrom_string = ' -L '.join(ext_chromosomes)
            new_gzvcf = '{0}.extChr.g.vcf.gz'.format(sample_name)
            output_gzvcf = os.path.join(oldband_path, new_gzvcf)
            if (not os.path.exists(output_gzvcf)
                    or not os.path.exists(output_gzvcf + '.tbi')):
                script = os.path.join(oldband_path, 'oldband_extChr.sh')
                cmdline = cmd(cram_file, output_gzvcf, chrom_string)
                cmdline += ' && rm -f {0}'.format(script)
                with open(script, 'w') as f:
                    f.write('#!/bin/bash\n')
                    f.write(cmdline)
                    f.write('\n')
                stdout = os.path.join(stdout_dir,
                                      new_gzvcf + '.oldbanded.log')
                job_runner.launch('/bin/bash {0}'.format(script),
                                  {'stdout': stdout})
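# Sketch of the self-deleting wrapper pattern used for the extChr job above
# (standalone illustration; `command` stands in for the GATK command line
# built by OldbandandRewriteGvcfCmd):
def _example_self_deleting_script(script_path, command):
    """Write a bash script that runs `command` and removes itself on success,
    returning the shell invocation to submit."""
    with open(script_path, 'w') as f:
        f.write('#!/bin/bash\n')
        f.write('{0} && rm -f {1}\n'.format(command, script_path))
    return '/bin/bash {0}'.format(script_path)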
def downsample_and_recall(app, inputs, output_dir):
    log_dir = os.path.join(output_dir, 'logs')
    os.mkdir(log_dir)
    os.mkdir(os.path.join(output_dir, 'results'))
    cromwell_job_opts = {
        'memory_in_gb': 32,
        'queue': app.queue,
        'docker': app.docker,
        'stdout': os.path.join(log_dir, 'cromwell.log'),
    }
    if app.job_group is not None:
        cromwell_job_opts['group'] = app.job_group
    job_runner = LsfJob(cromwell_job_opts)

    # One interval list per autosome, plus X, Y, and the extended-chromosome
    # interval list file.
    chrs = [['chr{}'.format(c)] for c in range(1, 23)]
    chrs.extend([
        ['chrX'],
        ['chrY'],
        ['/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.filtered-chromosome.ext.list'],
    ])
    workflow_inputs = {
        'reference': '/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa',
        'downsample_strategy': 'ConstantMemory',
        'downsample_seed': 1,
        'emit_reference_confidence': 'GVCF',
        'max_alternate_alleles': 3,
        'variant_index_type': 'LINEAR',
        'variant_index_parameter': 128000,
        'read_filter': 'OverclippedRead',
        'intervals': chrs,
        'qc_minimum_mapping_quality': 0,
        'qc_minimum_base_quality': 0,
        'crams_to_downsample': [],  # filled in from the "inputs" file below
    }
    with open(inputs) as fh:
        reader = csv.reader(fh, delimiter='\t')
        for row in reader:
            cram, ratio, freemix = row[0], row[1], row[2]
            workflow_inputs['crams_to_downsample'].append({
                'cram': {'class': 'File', 'path': cram},
                'downsample_ratio': ratio,
                'contamination': freemix,
            })
    input_yaml_path = os.path.join(output_dir, 'inputs.yaml')
    with open(input_yaml_path, 'w') as yaml_fh:
        yaml.dump(workflow_inputs, yaml_fh)

    # Render the Cromwell configuration from the bundled Jinja template.
    fs_loader = FileSystemLoader(searchpath=LaimsApp().share_dir)
    env = Environment(loader=fs_loader, autoescape=True)
    template = env.get_template('cromwell.config.jinja')
    cromwell_config_path = os.path.join(output_dir, 'cromwell.config')
    template.stream(
        log_dir=log_dir,
        output_dir=output_dir,
        lsf_queue=app.queue,
        lsf_job_group=app.job_group,
    ).dump(cromwell_config_path)

    cmd = [
        '/usr/bin/java',
        '-Dconfig.file=' + cromwell_config_path,
        '-Xmx24g',
        '-jar', '/opt/cromwell.jar',
        'run', '-t', 'cwl', '-i', input_yaml_path,
        # TODO: get a more canonical URL once things are merged
        'https://raw.githubusercontent.com/tmooney/cancer-genomics-workflow/downsample_and_recall/definitions/pipelines/gathered_downsample_and_recall.cwl',
    ]
    job_runner.launch(cmd)
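# Input sketch (illustrative): `downsample_and_recall` reads a tab-separated
# file of CRAM path, downsample ratio, and freemix (contamination) estimate,
# in that column order. The paths and values below are invented placeholders.
def _example_downsample_tsv(path):
    """Write a tiny TSV of the shape `downsample_and_recall` reads (placeholders)."""
    rows = [
        ('/data/crams/SAMPLE-1.cram', '0.85', '0.01'),
        ('/data/crams/SAMPLE-2.cram', '0.50', '0.002'),
    ]
    with open(path, 'w') as f:
        for cram, ratio, freemix in rows:
            f.write('\t'.join((cram, ratio, freemix)) + '\n')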