def create_plink_binary_tasks(self, parent_tasks):
    """Add one ``plink_binary`` task to the workflow per parent task.

    Parents are processed in ascending ``id`` order.  Every new task takes
    the parent's ``out_vcf`` parameter as its input and is given the output
    prefix ``<rootdir>/2-plink-binaries/<chrom>/c<chrom>``.

    :param parent_tasks: tasks whose ``params`` hold ``chrom`` and ``out_vcf``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = '2-plink-binaries'
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    ordered_parents = list(parent_tasks)
    ordered_parents.sort(key=lambda parent: parent.id)

    created = []
    for parent in ordered_parents:
        chromosome = parent.params['chrom']
        prefix = os.path.join(stage_dir, chromosome, 'c{}'.format(chromosome))
        new_task = self.workflow.add_task(
            func=plink_binary,
            params={
                'in_vcf': parent.params['out_vcf'],
                'out_path': prefix,
                'chrom': chromosome,
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=to_json(plink_binary_lsf_params(notify)),
            parents=[parent],
        )
        created.append(new_task)
    return created
def create_plink_extract_prune_tasks(self, parent_tasks):
    """Add one ``plink_extract_prune`` task to the workflow per parent task.

    Parents are processed in ascending ``id`` order.  The extract list is
    the parent's ``out_path`` with ``.prune.in`` appended, and the binary
    data set is the parent's own ``in_path``.

    :param parent_tasks: tasks whose ``params`` hold ``out_path``,
        ``in_path`` and ``chrom``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = '4-plink-extract-prune'
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    ordered_parents = list(parent_tasks)
    ordered_parents.sort(key=lambda parent: parent.id)

    created = []
    for parent in ordered_parents:
        prune_list = "{}.prune.in".format(parent.params['out_path'])
        source_binaries = parent.params['in_path']
        chromosome = parent.params['chrom']
        extracted_prefix = os.path.join(
            stage_dir, chromosome, 'c{}.extracted'.format(chromosome))

        spec = dict(
            func=plink_extract_prune,
            params={
                'in_path': source_binaries,
                'in_extract': prune_list,
                'out_path': extracted_prefix,
                'chrom': chromosome,
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=to_json(plink_extract_prune_lsf_params(notify)),
            parents=[parent],
        )
        created.append(self.workflow.add_task(**spec))
    return created
def create_plink_merge_prune_file_task(self, parent_tasks):
    """Add the single ``plink_merge_pruned_files`` task to the workflow.

    The parents are ordered by ``id``; the first one supplies the reference
    data set (``in_ref``) and the remaining ones are handed to
    ``self._create_merge_list`` together with the ``allfiles.txt`` path.

    :param parent_tasks: non-empty collection of tasks carrying ``out_path``
    :returns: the value returned by ``self.workflow.add_task``
    :raises IndexError: if ``parent_tasks`` is empty
    """
    stage_name = '5-plink-merge-prune-files'
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    ordered = sorted(parent_tasks, key=lambda parent: parent.id)
    reference, others = ordered[0], ordered[1:]

    # The merge list is produced while the workflow is being assembled,
    # before the merge task itself is registered.
    merge_list_path = os.path.join(stage_dir, 'allfiles.txt')
    self._create_merge_list(merge_list_path, others)

    merged_prefix = os.path.join(stage_dir, 'merged')
    return self.workflow.add_task(
        func=plink_merge_pruned_files,
        params={
            'in_ref': reference.params['out_path'],
            'in_merge_file': merge_list_path,
            'out_path': merged_prefix,
        },
        stage_name=stage_name,
        uid='all-chroms',
        drm_params=to_json(plink_merge_pruned_files_lsf_params(notify)),
        parents=ordered,
    )
def create_eigenstrat_smartpca_task(self, parent_task):
    """Add the ``eigenstrat_smartpca_analysis`` task to the workflow.

    The ``.ped`` and ``.map`` inputs are derived from the parent's
    ``out_path`` parameter; the stage directory itself is passed as
    ``out_prj_dir``.

    :param parent_task: task whose ``params`` hold ``out_path``
    :returns: the value returned by ``self.workflow.add_task``
    """
    stage_name = '6-eigenstrat-smartpca'
    project_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    ped_path = "{}.ped".format(parent_task.params['out_path'])
    map_path = "{}.map".format(parent_task.params['out_path'])

    spec = dict(
        func=eigenstrat_smartpca_analysis,
        params={
            'in_ped_file': ped_path,
            'in_map_file': map_path,
            'out_prj_dir': project_dir,
        },
        stage_name=stage_name,
        uid='all-chroms',
        drm_params=to_json(eigenstrat_smartpca_analysis_lsf_params(notify)),
        parents=[parent_task],
    )
    return self.workflow.add_task(**spec)
def create_filter_biallelic_snps_tasks(self):
    """Add one ``filter_biallelic_snps`` task per configured chromosome.

    Iterates ``self.config.chroms`` and looks up each input VCF in
    ``self.config.vcfs``.  The tasks are registered without parents.

    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = '1-filter-biallelic-snps'
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    created = []
    for chromosome in self.config.chroms:
        source_vcf = self.config.vcfs[chromosome]
        vcf_name = 'filtered.snps.c{chrom}.vcf.gz'.format(chrom=chromosome)
        new_task = self.workflow.add_task(
            func=filter_biallelic_snps,
            params={
                'chrom': chromosome,
                'in_vcf': source_vcf,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'in_min_vqslod': self.config.vqslod_threshold,
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=to_json(filter_biallelic_snps_lsf_params(notify)),
        )
        created.append(new_task)
    return created
def create_bcftools_stats_tasks(self, parent_tasks, step_number):
    """Add one ``bcftools_stats`` task to the workflow per parent task.

    The stage name comes from ``self._construct_task_name`` and the DRM
    parameters are serialised once and shared by every task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name('bcftools-stats', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_json = to_json(get_lsf_params(bcftools_stats_lsf_params, self.config))

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        stats_name = '{}.stats.out'.format(chromosome)
        new_task = self.workflow.add_task(
            func=bcftools_stats,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_stats': os.path.join(stage_dir, chromosome, stats_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(new_task)
    return created
def create_remove_ac_0_tasks(self, step_number):
    """Add one ``gatk_select_variants_remove_ac_0`` task per chromosome.

    Iterates ``self.config.chroms`` and reads each input VCF from
    ``self.config.vcfs``.  The tasks are registered without parents.

    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'select-variants-ac-0-removal', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_settings = get_lsf_params(
        gatk_select_variants_remove_ac_0_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    created = []
    for chromosome in self.config.chroms:
        source_vcf = self.config.vcfs[chromosome]
        vcf_name = 'combined.c{chrom}.vcf.gz'.format(chrom=chromosome)
        log_name = 'select-variants-chrom-{}-gatk.log'.format(chromosome)
        spec = dict(
            func=gatk_select_variants_remove_ac_0,
            params={
                'in_chrom': chromosome,
                'in_vcf': source_vcf,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
        )
        created.append(self.workflow.add_task(**spec))
    return created
def create_count_sample_missingness_tasks(self, step_number):
    """Add one ``count_sample_missingness`` task per numbered chromosome.

    A chromosome is skipped when ``get_chrom_number`` returns a value that
    is not all digits, so that only chromosomes 1-22 (not X, Y or MT) are
    counted.  The tasks are registered without parents.

    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'count-sample-missingness', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_json = to_json(
        get_lsf_params(count_sample_missingness_lsf_params, self.config))

    created = []
    for chromosome in self.config.chroms:
        # Non-numeric chromosome labels are excluded from the counts.
        if not get_chrom_number(chromosome).isdigit():
            continue

        json_name = '{chrom}-sample-missingness-counts.json'.format(
            chrom=chromosome)
        log_name = '{}-sample-missingness-counts.log'.format(chromosome)
        new_task = self.workflow.add_task(
            func=count_sample_missingness,
            params={
                'in_vcf': self.config.vcfs[chromosome],
                'in_chrom': chromosome,
                'out_json': os.path.join(stage_dir, chromosome, json_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
        )
        created.append(new_task)
    return created
def create_speedseq_realign_tasks(self):
    """Add one ``exec_speedseq`` task per entry of ``config.sample_data``.

    For each sample the BAM paths under its ``'bams'`` key are joined with
    single spaces into ``input_bams``; the output prefix and the ``tmpdir``
    both live in ``<rootdir>/1-exec-speedseq-realign/<sample_id>/``.  The
    tasks are registered without parents.

    Only the ``'bams'`` entry of a sample is read; the previously fetched
    but unused ``meta/original-name`` value is no longer looked up.

    :returns: list of whatever ``self.workflow.add_task`` returned, in the
        iteration order of ``self.config.sample_data``
    """
    tasks = []
    stage = '1-exec-speedseq-realign'
    basedir = os.path.join(self.config.rootdir, stage)
    email = self.config.email
    lsf_job_group = self.config.drm_job_group

    for sample_id, sample in self.config.sample_data.items():
        sample_dir = os.path.join(basedir, sample_id)
        task = {
            'func': exec_speedseq,
            'params': {
                'output_prefix': os.path.join(
                    sample_dir, "{}.b38.realign".format(sample_id)),
                'tmpdir': os.path.join(sample_dir, 'tmpdir'),
                'input_bams': ' '.join(sample['bams']),
            },
            'stage_name': stage,
            'uid': sample_id,
            'drm_params': to_json(
                exec_speedseq_lsf_params(email, lsf_job_group)),
        }
        tasks.append(self.workflow.add_task(**task))
    return tasks
def create_calculate_sample_missingness_task(self, parent_tasks, step_number):
    """Add the single ``calculate_sample_missingness`` task to the workflow.

    The input is a wildcard path ``<rootdir>/<parent stage>/*/*.json``,
    where the parent stage name is read from the first parent task.

    :param parent_tasks: non-empty sequence of tasks; all become parents
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: the value returned by ``self.workflow.add_task``
    :raises IndexError: if ``parent_tasks`` is empty
    """
    stage_name = self._construct_task_name(
        'calculate-sample-missingness', step_number)
    out_dir = os.path.join(self.config.rootdir, stage_name)

    upstream_stage = parent_tasks[0].stage.name
    upstream_dir = os.path.join(self.config.rootdir, upstream_stage)
    json_glob = os.path.join(upstream_dir, '*', '*.json')

    drm_settings = get_lsf_params(
        calculate_sample_missingness_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    return self.workflow.add_task(
        func=calculate_sample_missingness,
        params={
            'in_json': json_glob,
            'out_stats': os.path.join(out_dir, 'sample-missingness-pct.dat'),
            'out_log': os.path.join(out_dir, 'sample-missingness-pct.dat.log'),
        },
        stage_name=stage_name,
        uid='1-22',
        drm_params=drm_json,
        parents=parent_tasks,
    )
def create_remove_symbolic_deletion_tasks(self, parent_tasks, step_number):
    """Add one ``remove_symbolic_deletion_alleles`` task per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'remove-symbolic-alleles', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    # NOTE(review): unlike sibling builders, no drm_queue is passed to
    # get_lsf_params here -- confirm that this is intentional.
    drm_settings = get_lsf_params(
        remove_symbolic_deletion_alleles_lsf_params,
        self.config.email,
        self.config.docker)
    drm_json = to_json(drm_settings)

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        vcf_name = 'combined.c{chrom}.vcf.gz'.format(chrom=chromosome)
        log_name = 'remove-symbolic-alleles-chrom-{}.log'.format(chromosome)
        new_task = self.workflow.add_task(
            func=remove_symbolic_deletion_alleles,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(new_task)
    return created
def create_decompose_normalize_unique_tasks(self, parent_tasks, step_number):
    """Add one ``normalize_decompose_unique`` task per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'decompose-normalize-uniq', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_settings = get_lsf_params(
        normalize_decompose_unique_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        vcf_name = 'combined.c{chrom}.vcf.gz'.format(chrom=chromosome)
        log_name = 'decompose-normalize-unique-{}.log'.format(chromosome)
        spec = dict(
            func=normalize_decompose_unique,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(self.workflow.add_task(**spec))
    return created
def create_variant_eval_tasks(self, parent_tasks, step_number):
    """Add one ``gatk_variant_eval`` task to the workflow per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name('gatk-variant-eval', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_settings = get_lsf_params(
        gatk_variant_eval_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        stats_name = 'chrom-{}-variant-eval.out'.format(chromosome)
        log_name = 'chrom-{}-variant-eval.log'.format(chromosome)
        new_task = self.workflow.add_task(
            func=gatk_variant_eval,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_stats': os.path.join(stage_dir, chromosome, stats_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(new_task)
    return created
def create_1000G_annotation_tasks(self, parent_tasks, step_number):
    """Add one ``annotation_1000G`` task to the workflow per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name('annotate-w-1000G', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_settings = get_lsf_params(
        annotation_1000G_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        vcf_name = '1kg-annotated.c{}.vcf.gz'.format(chromosome)
        log_name = '1000G-annotate.{}.log'.format(chromosome)
        spec = dict(
            func=annotation_1000G,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(self.workflow.add_task(**spec))
    return created
def create_filter_variant_missingness_tasks(self, parent_tasks, step_number):
    """Add one ``filter_variant_missingness`` task per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'filter-variant-missingness', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_json = to_json(
        get_lsf_params(filter_variant_missingness_lsf_params, self.config))

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        vcf_name = 'combined.c{chrom}.vcf.gz'.format(chrom=chromosome)
        log_name = 'filter-missingness-{}.log'.format(chromosome)
        new_task = self.workflow.add_task(
            func=filter_variant_missingness,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(new_task)
    return created
def create_variant_eval_summary_task(self, parent_tasks, step_number):
    """Add the single ``variant_eval_summary`` task to the workflow.

    ``in_dir`` is ``<rootdir>/<parent stage>``, with the parent stage name
    read from the first parent task; ``out_dir`` is this stage's directory.

    :param parent_tasks: non-empty sequence of tasks; all become parents
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: the value returned by ``self.workflow.add_task``
    :raises IndexError: if ``parent_tasks`` is empty
    """
    stage_name = self._construct_task_name(
        'gatk-variant-eval-summary', step_number)
    out_dir = os.path.join(self.config.rootdir, stage_name)

    upstream_stage = parent_tasks[0].stage.name
    upstream_dir = os.path.join(self.config.rootdir, upstream_stage)

    drm_settings = get_lsf_params(
        variant_eval_summary_lsf_params,
        self.config.email,
        self.config.docker,
        self.config.drm_queue)
    drm_json = to_json(drm_settings)

    return self.workflow.add_task(
        func=variant_eval_summary,
        params={
            'in_dir': upstream_dir,
            'out_dir': out_dir,
        },
        stage_name=stage_name,
        uid='all-chroms',
        drm_params=drm_json,
        parents=parent_tasks,
    )
def create_LCR_annotation_tasks(self, parent_tasks, step_number):
    """Add one ``annotation_LCR`` task to the workflow per parent task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``; visited in the order given
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = self._construct_task_name(
        'Low-Confidence-Region-annotation', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_json = to_json(get_lsf_params(annotation_LCR_lsf_params, self.config))

    created = []
    for parent in parent_tasks:
        chromosome = parent.params['in_chrom']
        vcf_name = 'b38.LCR.annotated.c{}.vcf.gz'.format(chromosome)
        log_name = 'LCR.annotation.{}.log'.format(chromosome)
        spec = dict(
            func=annotation_LCR,
            params={
                'in_vcf': parent.params['out_vcf'],
                'in_chrom': chromosome,
                'out_vcf': os.path.join(stage_dir, chromosome, vcf_name),
                'out_log': os.path.join(stage_dir, chromosome, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=chromosome),
            drm_params=drm_json,
            parents=[parent],
        )
        created.append(self.workflow.add_task(**spec))
    return created
def create_data_frame_task(self, parent_task):
    """Add the ``create_evec_data_frame`` task to the workflow.

    The input is ``merged.eigenstrat.pca.evec`` inside the parent's
    ``out_prj_dir``; the output is ``merged.eigenstrat.pca.evec.tsv`` in
    this stage's directory.

    :param parent_task: task whose ``params`` hold ``out_prj_dir``
    :returns: the value returned by ``self.workflow.add_task``
    """
    stage_name = '7-make-data-frame'
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    notify = self.config.email

    evec_path = os.path.join(
        parent_task.params['out_prj_dir'], 'merged.eigenstrat.pca.evec')
    tsv_path = os.path.join(stage_dir, 'merged.eigenstrat.pca.evec.tsv')

    return self.workflow.add_task(
        func=create_evec_data_frame,
        params={
            'in_file': evec_path,
            'out_file': tsv_path,
        },
        stage_name=stage_name,
        uid='all-chroms',
        drm_params=to_json(create_evec_data_frame_lsf_params(notify)),
        parents=[parent_task],
    )
def create_bcftools_stats_summary_task(self, parent_tasks, step_number):
    """Add the single ``bcftools_stats_summary`` task to the workflow.

    ``in_dir`` is ``<rootdir>/<parent stage>``, with the parent stage name
    read from the first parent task; ``out_dir`` is this stage's directory.

    :param parent_tasks: non-empty sequence of tasks; all become parents
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: the value returned by ``self.workflow.add_task``
    :raises IndexError: if ``parent_tasks`` is empty
    """
    stage_name = self._construct_task_name(
        'bcftools-stats-summary', step_number)
    out_dir = os.path.join(self.config.rootdir, stage_name)

    upstream_stage = parent_tasks[0].stage.name
    upstream_dir = os.path.join(self.config.rootdir, upstream_stage)

    drm_json = to_json(
        get_lsf_params(bcftools_stats_summary_lsf_params, self.config))

    spec = dict(
        func=bcftools_stats_summary,
        params={
            'in_dir': upstream_dir,
            'out_dir': out_dir,
        },
        stage_name=stage_name,
        uid='all-chroms',
        drm_params=drm_json,
        parents=parent_tasks,
    )
    return self.workflow.add_task(**spec)
def create_aggregate_mie_stats_tasks(self, parent_tasks):
    """Add one ``aggregate_mie_statistics`` task per type/method group.

    Parent tasks are grouped by their ``type`` and ``method`` parameters
    into (snps, tranche), (snps, percentile), (indels, tranche) and
    (indels, percentile), in that order.  Each non-empty group gets one
    aggregation task whose parents are the members of the group and whose
    output file is ``<rootdir>/3-aggregate-mie-stats/<type>.<method>.tsv``.

    The created tasks are returned (earlier the list was built but never
    returned), and a group without members is skipped instead of raising
    ``IndexError``.

    :param parent_tasks: sequence of tasks whose ``params`` hold ``type``
        and ``method``; the first one supplies the input stage name
    :returns: list of whatever ``self.workflow.add_task`` returned; empty
        when ``parent_tasks`` is empty
    """
    # Nothing to aggregate, and no first parent to read the stage name from.
    if not parent_tasks:
        return []

    tasks = []
    stage = '3-aggregate-mie-stats'
    basedir = os.path.join(self.config.rootdir, stage)
    email = self.config.email
    input_dir = os.path.join(self.config.rootdir, parent_tasks[0].stage.name)

    groupings = (
        ('snps', 'tranche'),
        ('snps', 'percentile'),
        ('indels', 'tranche'),
        ('indels', 'percentile'),
    )
    for category, method in groupings:
        tgroup = [
            task for task in parent_tasks
            if task.params['type'] == category
            and task.params['method'] == method
        ]
        if not tgroup:
            continue

        out_filename = '.'.join([category, method, 'tsv'])
        task = {
            'func': aggregate_mie_statistics,
            'params': {
                'in_category': category,
                'in_method': method,
                'in_dir': input_dir,
                'out_file': os.path.join(basedir, out_filename),
            },
            'stage_name': stage,
            'uid': '{category}:{method}'.format(
                method=method, category=category),
            'drm_params': to_json(aggregate_mie_statistics_lsf_params(email)),
            'parents': tgroup,
        }
        tasks.append(self.workflow.add_task(**task))
    return tasks
def create_concatenate_vcfs_task(self, parent_tasks, step_number):
    """Add one ``concatenate_vcfs`` task per reference chromosome.

    Parent tasks are sorted by the ``Region`` built from their ``in_chrom``
    parameter and then grouped by that region's ``chrom`` attribute; each
    group's ``out_vcf`` files become the ``in_vcfs`` of one task.

    The sort key now passes the ``.fai`` path to ``Region``; it previously
    referenced the undefined expression ``all_sequences.fa.fai``.

    NOTE(review): another method with this name appears later in the file;
    if both live in the same class the later definition wins.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, one per
        chromosome group
    """
    tasks = []
    stage = self._construct_task_name('concat-vcfs', step_number)
    output_dir = os.path.join(self.config.rootdir, stage)
    lsf_params = get_lsf_params(concatenate_vcfs_lsf_params,
                                self.config.email,
                                self.config.docker,
                                self.config.drm_queue)
    lsf_params_json = to_json(lsf_params)

    # Hard-coded reference index, shared by both key functions.
    reference_fai = os.path.join(
        '/gscmnt/ams1102/info/model_data/2869585698/build106942997',
        'all_sequences.fa.fai')

    def region_key(task):
        return Region(reference_fai, task.params['in_chrom'])

    def chromosome_key(task):
        return region_key(task).chrom

    # groupby only merges adjacent items, hence the sort beforehand.
    ordered_parents = sorted(parent_tasks, key=region_key)
    for ref_chrom, chrom_tasks in groupby(ordered_parents, key=chromosome_key):
        ptasks = list(chrom_tasks)
        input_vcfs = [x.params['out_vcf'] for x in ptasks]
        output_vcf = 'concatenated.c{}.vcf.gz'.format(ref_chrom)
        output_log = 'concatenate.{}.log'.format(ref_chrom)
        task = {
            'func': concatenate_vcfs,
            'params': {
                'in_vcfs': input_vcfs,
                'in_chrom': ref_chrom,
                'out_vcf': os.path.join(output_dir, ref_chrom, output_vcf),
                'out_log': os.path.join(output_dir, ref_chrom, output_log),
            },
            'stage_name': stage,
            'uid': '{chrom}'.format(chrom=ref_chrom),
            'drm_params': lsf_params_json,
            'parents': ptasks,
        }
        tasks.append(self.workflow.add_task(**task))
    return tasks
def create_plink_pipeline_tasks(self, parent_tasks):
    """Add one ``plink_pipeline`` task to the workflow per parent task.

    Parents are processed in ascending ``id`` order.  Each task writes to
    ``<rootdir>/2-plink-pipeline/<type>/<method>/<label>/<chrom>``; this
    method does not create that directory.

    The ``params`` dict lists ``'chrom'`` once; the earlier literal repeated
    the key with the same value, so the resulting dict is unchanged.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom``,
        ``in_label``, ``in_method``, ``in_type`` and ``out_vcf``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    tasks = []
    stage = '2-plink-pipeline'
    basedir = os.path.join(self.config.rootdir, stage)
    email = self.config.email

    for ptask in sorted(parent_tasks, key=lambda t: t.id):
        chrom = ptask.params['in_chrom']
        label = ptask.params['in_label']
        method = ptask.params['in_method']
        category = ptask.params['in_type']
        output_dir = os.path.join(basedir, category, method, label, chrom)

        task = {
            'func': plink_pipeline,
            'params': {
                'in_vcf': ptask.params['out_vcf'],
                'in_trio_fam': self.config.plink_fam_file,
                'chrom': chrom,
                'type': category,
                'method': method,
                'label': label,
                'out_dir': output_dir,
            },
            'stage_name': stage,
            'uid': '{category}:{method}:{label}:{chrom}'.format(
                chrom=chrom, method=method, category=category, label=label),
            'drm_params': to_json(plink_pipeline_lsf_params(email)),
            'parents': [ptask],
        }
        tasks.append(self.workflow.add_task(**task))
    return tasks
def create_concatenate_vcfs_task(self, parent_tasks, step_number):
    """Add one ``concatenate_vcfs`` task per reference chromosome.

    Parent tasks are sorted by the ``Region`` built from their ``in_chrom``
    parameter and then grouped by that region's ``chrom`` attribute; each
    group's ``out_vcf`` files become the ``in_vcfs`` of one task.

    :param parent_tasks: tasks whose ``params`` hold ``in_chrom`` and
        ``out_vcf``
    :param step_number: forwarded to ``self._construct_task_name``
    :returns: list of whatever ``self.workflow.add_task`` returned, one per
        chromosome group
    """
    stage_name = self._construct_task_name('concat-vcfs', step_number)
    stage_dir = os.path.join(self.config.rootdir, stage_name)
    drm_json = to_json(get_lsf_params(concatenate_vcfs_lsf_params, self.config))

    # Hard-coded reference index used to interpret every in_chrom value.
    fai_path = '/gscmnt/gc2802/halllab/ccdg_resources/genomes/human/GRCh38DH/all_sequences.fa.fai'

    def region_of(member):
        return Region(fai_path, member.params['in_chrom'])

    def chrom_of(member):
        return region_of(member).chrom

    # groupby only merges adjacent items, hence the sort beforehand.
    ordered_parents = sorted(parent_tasks, key=region_of)

    created = []
    for ref_chrom, members in groupby(ordered_parents, key=chrom_of):
        group = list(members)
        source_vcfs = [member.params['out_vcf'] for member in group]
        vcf_name = 'concatenated.c{}.vcf.gz'.format(ref_chrom)
        log_name = 'concatenate.{}.log'.format(ref_chrom)
        new_task = self.workflow.add_task(
            func=concatenate_vcfs,
            params={
                'in_vcfs': source_vcfs,
                'in_chrom': ref_chrom,
                'out_vcf': os.path.join(stage_dir, ref_chrom, vcf_name),
                'out_log': os.path.join(stage_dir, ref_chrom, log_name),
            },
            stage_name=stage_name,
            uid='{chrom}'.format(chrom=ref_chrom),
            drm_params=drm_json,
            parents=group,
        )
        created.append(new_task)
    return created
def create_vcf_partition_chromosome_tasks(self, method, label, category, interval):
    """Add one ``vcf_partition`` task per configured chromosome.

    Outputs go under ``<rootdir>/1-partition-vcfs/<category>/<method>/
    <label>/<chrom>/``.  The tasks are registered without parents.

    :param method: stored as ``in_method`` and used in the path and uid
    :param label: stored as ``in_label`` and used in the path and uid
    :param category: stored as ``in_type`` and used in the path and uid
    :param interval: indexable; items 0 and 1 become ``in_min_vqslod`` and
        ``in_max_vqslod``
    :returns: list of whatever ``self.workflow.add_task`` returned, in order
    """
    stage_name = '1-partition-vcfs'
    partition_dir = os.path.join(
        self.config.rootdir, stage_name, category, method, label)
    notify = self.config.email

    created = []
    for chromosome in self.config.chroms:
        source_vcf = self.config.vcfs[chromosome]
        vcf_name = 'selected.c{chrom}.vcf.gz'.format(chrom=chromosome)
        new_task = self.workflow.add_task(
            func=vcf_partition,
            params={
                'in_vcf': source_vcf,
                'out_vcf': os.path.join(partition_dir, chromosome, vcf_name),
                'in_min_vqslod': interval[0],
                'in_max_vqslod': interval[1],
                'in_samples': self.config.control_samples_file,
                'in_type': category,
                'in_method': method,
                'in_chrom': chromosome,
                'in_label': label,
            },
            stage_name=stage_name,
            uid='{category}:{method}:{label}:{chrom}'.format(
                chrom=chromosome, method=method, category=category,
                label=label),
            drm_params=to_json(vcf_partition_lsf_params(notify)),
        )
        created.append(new_task)
    return created