import argparse
import os
import subprocess as sp
from functools import partial

from cosmos.api import Cosmos, default_get_submit_args

p = argparse.ArgumentParser()
p.add_argument('-drm', default='local', help='',
               choices=('local', 'drmaa:ge', 'ge', 'slurm'))
p.add_argument('-q', '--queue', help='Submit to this queue if the DRM supports it')
args = p.parse_args()

cosmos = Cosmos(
    'sqlite:///%s/sqlite.db' % os.path.dirname(os.path.abspath(__file__)),
    # example of how to change arguments if you're NOT using default_drm='local'
    get_submit_args=partial(default_get_submit_args, parallel_env='smp'),
    default_drm=args.drm,
    default_queue=args.queue)
cosmos.initdb()

sp.check_call('mkdir -p analysis_output/ex2', shell=True)
os.chdir('analysis_output/ex2')
workflow = cosmos.start('Example2', restart=True, skip_confirm=True)
recipe(workflow)  # recipe() (defined elsewhere in this example) populates the DAG

workflow.make_output_dirs()
workflow.run(max_cores=10)

# Note: if you want to look at the outputs of any Tasks to decide how to
# generate the rest of the DAG, you can do so here: proceed to add more tasks
# via workflow.add_task(), and then call workflow.run() again. Yes, it does
# require running all Tasks in the DAG to get the outputs of any Task, and we
# hope to address that limitation at some point in the future.
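# A minimal sketch of that run-inspect-extend pattern (hypothetical
# `summarize` command function; `t` is any Task handle returned earlier by
# workflow.add_task; not part of the original example):
#
#   workflow.run(max_cores=10, set_successful=False)  # run the DAG built so far
#   out_file = t.params['out_file']                   # inspect a finished Task's output
#   workflow.add_task(func=summarize,
#                     params=dict(in_file=out_file, out_file='summary.txt'),
#                     parents=[t], uid='summarize')
#   workflow.make_output_dirs()
#   workflow.run(max_cores=10)                        # resume; only the new Task runs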
import subprocess as sp
import os
import sys

from cosmos.api import Cosmos

cosmos = Cosmos('sqlite:///%s/sqlite.db' % os.path.dirname(os.path.abspath(__file__)),
                default_drm='local')
cosmos.initdb()

sp.check_call('mkdir -p analysis_output/ex1', shell=True)
os.chdir('analysis_output/ex1')
workflow = cosmos.start('Example1', restart=True, skip_confirm=True)


def say(text, out_file):
    return r"""
        echo "{text}" > {out_file}
    """.format(text=text, out_file=out_file)


t = workflow.add_task(func=say,
                      params=dict(text='Hello World', out_file='out.txt'),
                      uid='my_task', time_req=None, core_req=1, mem_req=1024)

print('task.params', t.params)
print('task.input_map', t.input_map)
print('task.output_map', t.output_map)
print('task.core_req', t.core_req)
print('task.time_req', t.time_req)
print('task.drm', t.drm)
print('task.uid', t.uid)
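# A hedged follow-on sketch (not part of the original example): chain a second
# Task that consumes out.txt by declaring the first Task as its parent.
# `cat` is a new illustrative command function, not from the source.
def cat(in_file, out_file):
    return r"""
        cat {in_file} > {out_file}
    """.format(in_file=in_file, out_file=out_file)


t2 = workflow.add_task(func=cat,
                       params=dict(in_file='out.txt', out_file='out2.txt'),
                       parents=[t],
                       uid='my_cat_task')

workflow.make_output_dirs()
workflow.run()  # executes say -> cat in dependency order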
class Pipeline(object):
    def __init__(self, config, drm, restart, skip_confirm):
        self.config = config
        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm)
        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
            skip_confirm=skip_confirm,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self, task_flush):
        # set set_successful=False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir,
                          db_task_flush=task_flush)

    def construct_pipeline(self):
        # 1. remove unused alternates
        remove_ac_0_tasks = self.create_remove_ac_0_tasks(1)

        # 2. calculate sample missingness (counting phase)
        count_sample_missingness_tasks = \
            self.create_count_sample_missingness_tasks(remove_ac_0_tasks, 2)

        # 2.1 calculate sample missingness (merge and calculation phase)
        calculate_sample_missingness_task = \
            self.create_calculate_sample_missingness_task(
                count_sample_missingness_tasks, 2.1)

        # 3. decompose, normalize, and uniq
        dnu_tasks = self.create_decompose_normalize_unique_tasks(
            remove_ac_0_tasks, 3)

        # 4. remove symbolic alleles
        rsa_tasks = self.create_remove_symbolic_deletion_tasks(dnu_tasks, 4)

        # 5. filter missingness
        filter_variant_missingness_tasks = \
            self.create_filter_variant_missingness_tasks(rsa_tasks, 5)

        # 6. annotate allele balances
        allele_balance_annotation_tasks = \
            self.create_allele_balance_annotation_tasks(
                filter_variant_missingness_tasks, 6)

        # 7. annotate with 1000G
        annotate_1000G_tasks = self.create_1000G_annotation_tasks(
            allele_balance_annotation_tasks, 7)

        # 8. annotate with ExAC
        annotate_ExAC_tasks = self.create_ExAC_annotation_tasks(
            annotate_1000G_tasks, 8)

        # 9. VEP annotation
        annotate_vep_cadd_tasks = self.create_vep_cadd_annotation_tasks(
            annotate_ExAC_tasks, 9)

        # 10. VCF concatenation
        concatenated_vcfs = self.create_concatenate_vcfs_task(
            annotate_vep_cadd_tasks, 10)

        # 11. bcftools stats
        bcftools_stats_tasks = self.create_bcftools_stats_tasks(
            annotate_ExAC_tasks, 11)

        # 11.1 merge & plot bcftools stats
        bcftools_stats_summary_task = self.create_bcftools_stats_summary_task(
            bcftools_stats_tasks, 11.1)

        # 12. GATK VariantEval
        variant_eval_tasks = self.create_variant_eval_tasks(
            annotate_ExAC_tasks, 12)

        # 12.1 merge & plot GATK VariantEval stats
        variant_eval_summary_task = self.create_variant_eval_summary_task(
            variant_eval_tasks, 12.1)

    def create_bcftools_stats_summary_task(self, parent_tasks, step_number):
        stage = self._construct_task_name('bcftools-stats-summary', step_number)
        output_dir = os.path.join(self.config.rootdir, stage)
        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)

        lsf_params = get_lsf_params(bcftools_stats_summary_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': bcftools_stats_summary,
            'params': {
                'in_dir': input_dir,
                'out_dir': output_dir,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_concatenate_vcfs_task(self, parent_tasks, step_number):
        tasks = list()
        stage = self._construct_task_name('concat-vcfs', step_number)
        output_dir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(concatenate_vcfs_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        def region_key(task):
            # sort parent tasks by genomic region
            reference_fai = os.path.join(
                '/gscmnt/ams1102/info/model_data/2869585698/build106942997',
                'all_sequences.fa.fai')
            return Region(reference_fai, task.params['in_chrom'])

        def chromosome_key(task):
            # group parent tasks by chromosome
            reference_fai = os.path.join(
                '/gscmnt/ams1102/info/model_data/2869585698/build106942997',
                'all_sequences.fa.fai')
            return Region(reference_fai, task.params['in_chrom']).chrom

        for ref_chrom, chrom_tasks in groupby(
                sorted(parent_tasks, key=region_key), key=chromosome_key):
            ptasks = list(chrom_tasks)
            input_vcfs = [x.params['out_vcf'] for x in ptasks]
            output_vcf = 'concatenated.c{}.vcf.gz'.format(ref_chrom)
            output_log = 'concatenate.{}.log'.format(ref_chrom)
            task = {
                'func': concatenate_vcfs,
                'params': {
                    'in_vcfs': input_vcfs,
                    'in_chrom': ref_chrom,
                    'out_vcf': os.path.join(output_dir, ref_chrom, output_vcf),
                    'out_log': os.path.join(output_dir, ref_chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=ref_chrom),
                'drm_params': lsf_params_json,
                'parents': ptasks,
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_variant_eval_summary_task(self, parent_tasks, step_number):
        stage = self._construct_task_name('gatk-variant-eval-summary', step_number)
        output_dir = os.path.join(self.config.rootdir, stage)
        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)

        lsf_params = get_lsf_params(variant_eval_summary_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': variant_eval_summary,
            'params': {
                'in_dir': input_dir,
                'out_dir': output_dir,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_bcftools_stats_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('bcftools-stats', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(bcftools_stats_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_stats = '{}.stats.out'.format(chrom)
            task = {
                'func': bcftools_stats,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_stats': os.path.join(basedir, chrom, output_stats),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_variant_eval_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('gatk-variant-eval', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(gatk_variant_eval_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_stats = 'chrom-{}-variant-eval.out'.format(chrom)
            output_log = 'chrom-{}-variant-eval.log'.format(chrom)
            task = {
                'func': gatk_variant_eval,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_stats': os.path.join(basedir, chrom, output_stats),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_vep_cadd_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('vep-cadd-annotation', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_vep_cadd_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'annotated.vep.cadd.c{}.vcf.gz'.format(chrom)
            output_log = 'vep.cadd.annotation.{}.log'.format(chrom)
            task = {
                'func': annotation_vep_cadd,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_ExAC_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('annotate-w-ExAC', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_ExAC_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'ExAC-annotated.c{}.vcf.gz'.format(chrom)
            output_log = 'ExAC-annotate.{}.log'.format(chrom)
            task = {
                'func': annotation_ExAC,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_1000G_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('annotate-w-1000G', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_1000G_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = '1kg-annotated.c{}.vcf.gz'.format(chrom)
            output_log = '1000G-annotate.{}.log'.format(chrom)
            task = {
                'func': annotation_1000G,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_allele_balance_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('allele-balance-annotation', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotate_allele_balances_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'allele-balance-{}.log'.format(chrom)
            task = {
                'func': annotate_allele_balances,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_filter_variant_missingness_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('filter-variant-missingness', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(filter_variant_missingness_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'filter-missingness-{}.log'.format(chrom)
            task = {
                'func': filter_variant_missingness,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_remove_symbolic_deletion_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('remove-symbolic-alleles', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(remove_symbolic_deletion_alleles_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'remove-symbolic-alleles-chrom-{}.log'.format(chrom)
            task = {
                'func': remove_symbolic_deletion_alleles,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_decompose_normalize_unique_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('decompose-normalize-uniq', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(normalize_decompose_unique_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'decompose-normalize-unique-{}.log'.format(chrom)
            task = {
                'func': normalize_decompose_unique,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_calculate_sample_missingness_task(self, parent_tasks, step_number):
        stage = self._construct_task_name('calculate-sample-missingness', step_number)
        output_dir = os.path.join(self.config.rootdir, stage)
        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)
        input_json_wildcard_path = os.path.join(input_dir, '*', '*.json')

        lsf_params = get_lsf_params(calculate_sample_missingness_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': calculate_sample_missingness,
            'params': {
                'in_json': input_json_wildcard_path,
                'out_stats': os.path.join(output_dir, 'sample-missingness-pct.dat'),
                'out_log': os.path.join(output_dir, 'sample-missingness-pct.dat.log'),
            },
            'stage_name': stage,
            'uid': '1-22',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_count_sample_missingness_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('count-sample-missingness', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(count_sample_missingness_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            # only count missing genotypes on chromosomes 1-22 (not X, Y, or MT)
            if not chrom[0].isdigit():
                continue
            output_json = '{chrom}-sample-missingness-counts.json'.format(chrom=chrom)
            output_log = '{}-sample-missingness-counts.log'.format(chrom)
            task = {
                'func': count_sample_missingness,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_json': os.path.join(basedir, chrom, output_json),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_remove_ac_0_tasks(self, step_number):
        tasks = []
        stage = self._construct_task_name('select-variants-ac-0-removal', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(gatk_select_variants_remove_ac_0_lsf_params,
                                    self.config.email,
                                    self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'select-variants-chrom-{}-gatk.log'.format(chrom)
            task = {
                'func': gatk_select_variants_remove_ac_0,
                'params': {
                    'in_chrom': chrom,
                    'in_vcf': vcf,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def _construct_task_name(self, name, number):
        task_name = '{}-{}'.format(number, name)
        return task_name
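# A hedged sketch of a driver for the Pipeline class above. `Config`, the
# argument values, and the flush interval are hypothetical stand-ins for
# whatever supplies config.db, config.rootdir, config.project_name,
# config.chroms, config.vcfs, and the LSF settings.
if __name__ == '__main__':
    config = Config()  # hypothetical: parsed from a project config file
    pipeline = Pipeline(config, drm='lsf', restart=False, skip_confirm=True)
    pipeline.run(task_flush=30)  # passed through to workflow.run as db_task_flush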
def main():
    args = parse_args()

    cosmos = Cosmos(
        "sqlite:///%s/sqlite.db" % os.path.dirname(os.path.abspath(__file__)),
        default_drm="awsbatch",
        default_drm_options=dict(
            container_image=args.container_image,
            s3_prefix_for_command_script_temp_files=args.s3_prefix_for_command_script_temp_files,
            # only retry on spot instance death
            retry_only_if_status_reason_matches="Host EC2 .+ terminated.",
        ),
        default_queue=args.default_queue,
    )
    cosmos.initdb()

    # sp.check_call("mkdir -p analysis_output/ex1", shell=True)
    # os.chdir("analysis_output/ex1")
    workflow = cosmos.start(f"Evaluate_{args.id}", restart=True, skip_confirm=True)

    parameters = np.load(f"optimize_awsbatch/parameters/{args.id}.npy")
    for i, par in enumerate(parameters):
        parameters_ = dict(
            mean_weight=par[0],
            c_w=par[1],
            tau_pos=par[2],
            tau_neg=par[3],
            A_pos=par[4],
            A_neg=par[5],
            weight_decay=par[6],
            n_filters=25,
            time_max=250,
            crop=20,
            kernel_size=16,
            stride=4,
            intensity=127.5,
            c_w_min=None,
            c_l=True,
            network_type="LC_SNN",
        )
        workflow.add_task(
            func=evaluate,
            params=dict(
                parameters=parameters_,
                out_s3_uri=f"{args.out_s3_uri}/scores/{args.id}/{i}.json",
                sleep=args.sleep,
                train=args.train,
                calibrate=args.calibrate,
                test=args.test,
            ),
            uid=str(i),
            time_req=None,
            max_attempts=args.max_attempts,
            core_req=args.core_req,
            mem_req=args.mem_req,
        )

    workflow.run()

    sys.exit(0 if workflow.successful else 1)
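# A hedged sketch (not from the source) of how the parameter file this script
# loads could be produced: an (N, 7) array saved with numpy, one row per Task,
# with columns matching par[0]..par[6] above. The id `42` and all values are
# illustrative.
#
#   import numpy as np
#
#   grid = np.array([
#       # mean_weight, c_w,  tau_pos, tau_neg, A_pos, A_neg, weight_decay
#       [0.25,         75.0, 20.0,    20.0,    1e-2,  1e-2,  0.0],
#       [0.30,         50.0, 15.0,    25.0,    5e-3,  1e-2,  1e-5],
#   ])
#   np.save("optimize_awsbatch/parameters/42.npy", grid)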
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config
        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm)
        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful=False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        partition_tasks = self.create_vcf_partition_tasks()
        plink_pipeline_tasks = self.create_plink_pipeline_tasks(partition_tasks)
        aggregate_mie_stats_tasks = self.create_aggregate_mie_stats_tasks(
            plink_pipeline_tasks)

    def create_aggregate_mie_stats_tasks(self, parent_tasks):
        tasks = []
        stage = '3-aggregate-mie-stats'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email
        input_dir = os.path.join(self.config.rootdir,
                                 parent_tasks[0].stage.name)

        parent_snp_tranche_tasks = [
            task for task in parent_tasks
            if (task.params['type'] == 'snps'
                and task.params['method'] == 'tranche')
        ]

        parent_snp_percentile_tasks = [
            task for task in parent_tasks
            if (task.params['type'] == 'snps'
                and task.params['method'] == 'percentile')
        ]

        parent_indel_tranche_tasks = [
            task for task in parent_tasks
            if (task.params['type'] == 'indels'
                and task.params['method'] == 'tranche')
        ]

        parent_indel_percentile_tasks = [
            task for task in parent_tasks
            if (task.params['type'] == 'indels'
                and task.params['method'] == 'percentile')
        ]

        task_groups = (parent_snp_tranche_tasks,
                       parent_snp_percentile_tasks,
                       parent_indel_tranche_tasks,
                       parent_indel_percentile_tasks)

        for tgroup in task_groups:
            category = tgroup[0].params['type']
            method = tgroup[0].params['method']
            out_filename = '.'.join([category, method, 'tsv'])
            output_file = os.path.join(basedir, out_filename)
            task = {
                'func': aggregate_mie_statistics,
                'params': {
                    'in_category': category,
                    'in_method': method,
                    'in_dir': input_dir,
                    'out_file': output_file,
                },
                'stage_name': stage,
                'uid': '{category}:{method}'.format(method=method,
                                                    category=category),
                'drm_params': to_json(aggregate_mie_statistics_lsf_params(email)),
                'parents': tgroup,
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_plink_pipeline_tasks(self, parent_tasks):
        tasks = []
        stage = '2-plink-pipeline'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['in_chrom']
            label = ptask.params['in_label']
            method = ptask.params['in_method']
            category = ptask.params['in_type']
            output_dir = os.path.join(basedir, category, method, label, chrom)
            # ensure_directory(output_dir)
            task = {
                'func': plink_pipeline,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_trio_fam': self.config.plink_fam_file,
                    'chrom': chrom,
                    'type': category,
                    'method': method,
                    'label': label,
                    'out_dir': output_dir,
                },
                'stage_name': stage,
                'uid': '{category}:{method}:{label}:{chrom}'.format(
                    chrom=chrom, method=method, category=category, label=label),
                'drm_params': to_json(plink_pipeline_lsf_params(email)),
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_vcf_partition_tasks(self):
        all_tasks = []
        cases = (
            ('tranche', 'snps', self.config.tranche_intervals['snps']),
            ('tranche', 'indels', self.config.tranche_intervals['indels']),
            ('percentile', 'snps', self.config.percentiles['snps']),
            ('percentile', 'indels', self.config.percentiles['indels']),
        )
        for case in cases:
            tasks = self.generate_vcf_partition_tasks(*case)
            all_tasks.extend(tasks)
        return all_tasks

    def generate_vcf_partition_tasks(self, method, category, intervals):
        # method: 'tranche' or 'percentile'
        # category: 'snps' or 'indels'
        # label:
        #     tranche    : 1, 2 or 3
        #     percentile : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
        tasks = []
        for label in sorted(intervals.keys()):
            interval = intervals[label]
            partition_tasks = self.create_vcf_partition_chromosome_tasks(
                method=method,
                label=str(label),
                category=category,
                interval=interval,
            )
            tasks.extend(partition_tasks)
        return tasks

    def create_vcf_partition_chromosome_tasks(self, method, label, category,
                                              interval):
        tasks = []
        stage = '1-partition-vcfs'
        basedir = os.path.join(self.config.rootdir, stage, category, method,
                               label)
        email = self.config.email

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'selected.c{chrom}.vcf.gz'.format(chrom=chrom)
            task = {
                'func': vcf_partition,
                'params': {
                    'in_vcf': vcf,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'in_min_vqslod': interval[0],
                    'in_max_vqslod': interval[1],
                    'in_samples': self.config.control_samples_file,
                    'in_type': category,
                    'in_method': method,
                    'in_chrom': chrom,
                    'in_label': label,
                },
                'stage_name': stage,
                'uid': '{category}:{method}:{label}:{chrom}'.format(
                    chrom=chrom, method=method, category=category, label=label),
                'drm_params': to_json(vcf_partition_lsf_params(email)),
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks
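# A hedged sketch of the interval configuration the partition steps consume.
# The shape is inferred from the lookups above (label -> (min_vqslod,
# max_vqslod) per variant type); every number here is invented.
#
#   tranche_intervals = {
#       'snps':   {1: (15.0, 30.0), 2: (5.0, 15.0), 3: (0.0, 5.0)},
#       'indels': {1: (12.0, 25.0), 2: (4.0, 12.0), 3: (0.0, 4.0)},
#   }
#   percentiles = {
#       'snps':   {10: (-2.3, 1.1), 20: (1.1, 2.8)},   # ... up through 100
#       'indels': {10: (-1.9, 0.7), 20: (0.7, 2.2)},   # ... up through 100
#   }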
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config
        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm
        )
        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful=False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        filter_biallelic_snps_tasks = self.create_filter_biallelic_snps_tasks()
        plink_binary_tasks = self.create_plink_binary_tasks(
            filter_biallelic_snps_tasks)
        plink_ld_prune_tasks = self.create_plink_ld_prune_tasks(
            plink_binary_tasks)
        plink_extract_prune_tasks = self.create_plink_extract_prune_tasks(
            plink_ld_prune_tasks)
        plink_merge_prune_files_task = self.create_plink_merge_prune_file_task(
            plink_extract_prune_tasks)
        eigenstrat_task = self.create_eigenstrat_smartpca_task(
            plink_merge_prune_files_task)
        data_frame_task = self.create_data_frame_task(eigenstrat_task)

    def create_data_frame_task(self, parent_task):
        stage = '7-make-data-frame'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        pca_evec_file = os.path.join(
            parent_task.params['out_prj_dir'],
            'merged.eigenstrat.pca.evec',
        )
        out_file = os.path.join(basedir, 'merged.eigenstrat.pca.evec.tsv')

        task = {
            'func': create_evec_data_frame,
            'params': {
                'in_file': pca_evec_file,
                'out_file': out_file,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': to_json(create_evec_data_frame_lsf_params(email)),
            'parents': [parent_task],
        }

        df_task = self.workflow.add_task(**task)
        return df_task

    def create_eigenstrat_smartpca_task(self, parent_task):
        stage = '6-eigenstrat-smartpca'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        ped_file = "{}.ped".format(parent_task.params['out_path'])
        map_file = "{}.map".format(parent_task.params['out_path'])

        task = {
            'func': eigenstrat_smartpca_analysis,
            'params': {
                'in_ped_file': ped_file,
                'in_map_file': map_file,
                'out_prj_dir': basedir,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': to_json(eigenstrat_smartpca_analysis_lsf_params(email)),
            'parents': [parent_task],
        }

        eigenstrat_task = self.workflow.add_task(**task)
        return eigenstrat_task

    def create_plink_merge_prune_file_task(self, parent_tasks):
        stage = '5-plink-merge-prune-files'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        parent_tasks_sorted = sorted(parent_tasks, key=lambda t: t.id)
        first_task = parent_tasks_sorted[0]
        remaining_tasks = parent_tasks_sorted[1:]

        merge_list_file = os.path.join(basedir, 'allfiles.txt')
        self._create_merge_list(merge_list_file, remaining_tasks)

        output_path = os.path.join(basedir, 'merged')

        task = {
            'func': plink_merge_pruned_files,
            'params': {
                'in_ref': first_task.params['out_path'],
                'in_merge_file': merge_list_file,
                'out_path': output_path,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': to_json(plink_merge_pruned_files_lsf_params(email)),
            'parents': parent_tasks_sorted,
        }

        merge_task = self.workflow.add_task(**task)
        return merge_task

    def _create_merge_list(self, merge_file, tasks):
        ensure_directory(os.path.dirname(merge_file))
        with open(merge_file, 'w') as f:
            for t in tasks:
                print(t.params['out_path'], file=f)

    def create_plink_extract_prune_tasks(self, parent_tasks):
        tasks = []
        stage = '4-plink-extract-prune'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            plink_extract_file = "{}.prune.in".format(ptask.params['out_path'])
            orig_binary_data = ptask.params['in_path']
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom,
                                       'c{}.extracted'.format(chrom))
            task = {
                'func': plink_extract_prune,
                'params': {
                    'in_path': orig_binary_data,
                    'in_extract': plink_extract_file,
                    'out_path': output_path,
                    'chrom': chrom,
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': to_json(plink_extract_prune_lsf_params(email)),
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_plink_ld_prune_tasks(self, parent_tasks):
        tasks = []
        stage = '3-plink-ld-prune'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom,
                                       'c{}-pruned'.format(chrom))
            task = {
                'func': plink_ld_prune,
                'params': {
                    'in_path': ptask.params['out_path'],
                    'out_path': output_path,
                    'chrom': chrom,
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': to_json(plink_ld_prune_lsf_params(email)),
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_plink_binary_tasks(self, parent_tasks):
        tasks = []
        stage = '2-plink-binaries'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom, 'c{}'.format(chrom))
            task = {
                'func': plink_binary,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'out_path': output_path,
                    'chrom': chrom,
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': to_json(plink_binary_lsf_params(email)),
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_filter_biallelic_snps_tasks(self):
        tasks = []
        stage = '1-filter-biallelic-snps'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'filtered.snps.c{chrom}.vcf.gz'.format(chrom=chrom)
            task = {
                'func': filter_biallelic_snps,
                'params': {
                    'chrom': chrom,
                    'in_vcf': vcf,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'in_min_vqslod': self.config.vqslod_threshold,
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': to_json(filter_biallelic_snps_lsf_params(email)),
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config
        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm
        )
        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful=False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        speedseq_tasks = self.create_speedseq_realign_tasks()

    def create_speedseq_realign_tasks(self):
        tasks = []
        stage = '1-exec-speedseq-realign'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email
        lsf_job_group = self.config.drm_job_group
        sample_data = self.config.sample_data

        for sample_id in sample_data.keys():
            bam_paths = sample_data[sample_id]['bams']
            sample_name = sample_data[sample_id]['meta']['original-name']
            output_prefix = os.path.join(basedir, sample_id,
                                         "{}.b38.realign".format(sample_id))
            tmpdir = os.path.join(basedir, sample_id, 'tmpdir')
            input_bams = ' '.join(bam_paths)
            task = {
                'func': exec_speedseq,
                'params': {
                    'output_prefix': output_prefix,
                    'tmpdir': tmpdir,
                    'input_bams': input_bams,
                },
                'stage_name': stage,
                'uid': sample_id,
                'drm_params': to_json(exec_speedseq_lsf_params(email,
                                                               lsf_job_group)),
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks
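# A hedged sketch of the sample_data mapping create_speedseq_realign_tasks
# iterates over. The structure is inferred from the lookups above
# (['bams'] and ['meta']['original-name']); the ids and paths are invented.
#
#   sample_data = {
#       'H_XY-12345': {
#           'bams': ['/data/12345/flowcell1.bam', '/data/12345/flowcell2.bam'],
#           'meta': {'original-name': 'NA12345'},
#       },
#   }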