Example #1
import argparse
import os
import subprocess as sp
from functools import partial

from cosmos.api import Cosmos, default_get_submit_args


if __name__ == '__main__':
    p = argparse.ArgumentParser()
    # flag name assumed from the args.drm reference below
    p.add_argument('-drm',
                   default='local',
                   help='',
                   choices=('local', 'drmaa:ge', 'ge', 'slurm'))
    p.add_argument('-q',
                   '--queue',
                   help='Submit to this queue if the DRM supports it')

    args = p.parse_args()

    cosmos = Cosmos(
        'sqlite:///%s/sqlite.db' % os.path.dirname(os.path.abspath(__file__)),
        # example of how to change arguments if you're NOT using default_drm='local'
        get_submit_args=partial(default_get_submit_args, parallel_env='smp'),
        default_drm=args.drm,
        default_queue=args.queue)
    cosmos.initdb()

    sp.check_call('mkdir -p analysis_output/ex2', shell=True)
    os.chdir('analysis_output/ex2')

    workflow = cosmos.start('Example2', restart=True, skip_confirm=True)

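    # recipe() is defined elsewhere in the original file (not shown here); it adds this example's tasks to the workflow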
    recipe(workflow)

    workflow.make_output_dirs()
    workflow.run(max_cores=10)

    # Note: if you want to inspect the outputs of any Tasks to decide how to generate the rest of the DAG,
    # you can do so here, add more tasks via workflow.add_task(), and then call workflow.run() again
    # (see the sketch below). Getting the outputs of any Task currently requires running all Tasks
    # already in the DAG, a limitation we hope to address in the future.
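
    # A minimal sketch of that pattern (hypothetical; not part of the original example).
    # Note that the later examples pass set_successful=False to workflow.run() when more
    # tasks will be added to a workflow that has already run.
    def summarize(in_txt, out_txt):
        # like say() in Example #2, a task function returns the shell command to execute
        return 'wc -l {in_txt} > {out_txt}'.format(in_txt=in_txt, out_txt=out_txt)

    summary_task = workflow.add_task(
        func=summarize,
        params=dict(in_txt='some_task_output.txt', out_txt='line_count.txt'),  # illustrative paths
        uid='summarize')
    workflow.make_output_dirs()
    workflow.run(max_cores=10)  # only the newly added tasks still need to run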
Example #2
import subprocess as sp
import os
import sys
from cosmos.api import Cosmos

cosmos = Cosmos('sqlite:///%s/sqlite.db' % os.path.dirname(os.path.abspath(__file__)),
                default_drm='local')
cosmos.initdb()

sp.check_call('mkdir -p analysis_output/ex1', shell=True)
os.chdir('analysis_output/ex1')
workflow = cosmos.start('Example1', restart=True, skip_confirm=True)


def say(text, out_file):
    return r"""
        echo "{text}" > {out_file}
    """.format(text=text, out_file=out_file)


t = workflow.add_task(func=say,
                      params=dict(text='Hello World', out_file='out.txt',),
                      uid='my_task', time_req=None, core_req=1, mem_req=1024)

print('task.params', t.params)
print('task.input_map', t.input_map)
print('task.output_map', t.output_map)
print('task.core_req', t.core_req)
print('task.time_req', t.time_req)
print('task.drm', t.drm)
print('task.uid', t.uid)
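
# Sketch (not part of the original snippet): to actually execute the task added above,
# run the DAG as the other examples do.
workflow.run(max_cores=1)
sys.exit(0 if workflow.successful else 1)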
Example #3
class Pipeline(object):
    def __init__(self, config, drm, restart, skip_confirm):
        self.config = config

        self.cosmos = Cosmos(database_url='sqlite:///{}'.format(
            self.config.db),
                             get_submit_args=default_get_submit_args,
                             default_drm=drm)

        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
            skip_confirm=skip_confirm,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self, task_flush):
        # set set_successful to False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir,
                          db_task_flush=task_flush)

    def construct_pipeline(self):
        # 1. remove unused alternates
        remove_ac_0_tasks = self.create_remove_ac_0_tasks(1)
        # 2. calculate sample missingness (counting phase)
        count_sample_missingness_tasks = self.create_count_sample_missingness_tasks(
            remove_ac_0_tasks, 2)
        # 2.1 calculate sample missingness (merge and calculation phase)
        calculate_sample_missingness_task = self.create_calculate_sample_missingness_task(
            count_sample_missingness_tasks, 2.1)
        # 3. denormalize, decompose, and uniq
        dnu_tasks = self.create_decompose_normalize_unique_tasks(
            remove_ac_0_tasks, 3)
        # 4. remove symbolic alleles
        rsa_tasks = self.create_remove_symbolic_deletion_tasks(dnu_tasks, 4)
        # 5. filter missingness
        filter_variant_missingness_tasks = self.create_filter_variant_missingness_tasks(
            rsa_tasks, 5)
        # 6. annotate allele balances
        allele_balance_annotation_tasks = self.create_allele_balance_annotation_tasks(
            filter_variant_missingness_tasks, 6)
        # 7. annotate with 1000G
        annotate_1000G_tasks = self.create_1000G_annotation_tasks(
            allele_balance_annotation_tasks, 7)
        # 8. annotate with ExAC
        annotate_ExAC_tasks = self.create_ExAC_annotation_tasks(
            annotate_1000G_tasks, 8)
        # 9. VEP annotation
        annotate_vep_cadd_tasks = self.create_vep_cadd_annotation_tasks(
            annotate_ExAC_tasks, 9)
        # 10. VCF concatenation
        concatenated_vcfs = self.create_concatenate_vcfs_task(
            annotate_vep_cadd_tasks, 10)
        # 11. bcftools stats
        bcftools_stats_tasks = self.create_bcftools_stats_tasks(
            annotate_ExAC_tasks, 11)
        # 11.1 Merge & Plot bcftools stats
        bcftools_stats_summary_task = self.create_bcftools_stats_summary_task(
            bcftools_stats_tasks, 11.1)
        # 12. GATK VariantEval
        variant_eval_tasks = self.create_variant_eval_tasks(
            annotate_ExAC_tasks, 12)
        # 12.1. Merge & Plot GATK VariantEval Stats
        variant_eval_summary_task = self.create_variant_eval_summary_task(
            variant_eval_tasks, 12.1)

    def create_bcftools_stats_summary_task(self, parent_tasks, step_number):
        stage = self._construct_task_name('bcftools-stats-summary',
                                          step_number)
        output_dir = os.path.join(self.config.rootdir, stage)

        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)

        lsf_params = get_lsf_params(bcftools_stats_summary_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': bcftools_stats_summary,
            'params': {
                'in_dir': input_dir,
                'out_dir': output_dir,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_concatenate_vcfs_task(self, parent_tasks, step_number):
        tasks = list()
        stage = self._construct_task_name('concat-vcfs', step_number)
        output_dir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(concatenate_vcfs_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        def region_key(task):
            reference_fai = os.path.join(
                '/gscmnt/ams1102/info/model_data/2869585698/build106942997',
                'all_sequences.fa.fai')
            return Region(reference_fai, task.params['in_chrom'])

        def chromosome_key(task):
            reference_fai = os.path.join(
                '/gscmnt/ams1102/info/model_data/2869585698/build106942997',
                'all_sequences.fa.fai')
            return Region(reference_fai, task.params['in_chrom']).chrom

        for ref_chrom, chrom_tasks in groupby(sorted(parent_tasks,
                                                     key=region_key),
                                              key=chromosome_key):
            ptasks = list(chrom_tasks)
            input_vcfs = [x.params['out_vcf'] for x in ptasks]
            output_vcf = 'concatenated.c{}.vcf.gz'.format(ref_chrom)
            output_log = 'concatenate.{}.log'.format(ref_chrom)
            task = {
                'func': concatenate_vcfs,
                'params': {
                    'in_vcfs': input_vcfs,
                    'in_chrom': ref_chrom,
                    'out_vcf': os.path.join(output_dir, ref_chrom, output_vcf),
                    'out_log': os.path.join(output_dir, ref_chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=ref_chrom),
                'drm_params': lsf_params_json,
                'parents': ptasks,
            }
            tasks.append(self.workflow.add_task(**task))
        return tasks

    def create_variant_eval_summary_task(self, parent_tasks, step_number):
        stage = self._construct_task_name('gatk-variant-eval-summary',
                                          step_number)
        output_dir = os.path.join(self.config.rootdir, stage)

        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)

        lsf_params = get_lsf_params(variant_eval_summary_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': variant_eval_summary,
            'params': {
                'in_dir': input_dir,
                'out_dir': output_dir,
            },
            'stage_name': stage,
            'uid': 'all-chroms',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_bcftools_stats_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('bcftools-stats', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(bcftools_stats_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_stats = '{}.stats.out'.format(chrom)
            task = {
                'func': bcftools_stats,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_stats': os.path.join(basedir, chrom, output_stats),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_variant_eval_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('gatk-variant-eval', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(gatk_variant_eval_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_stats = 'chrom-{}-variant-eval.out'.format(chrom)
            output_log = 'chrom-{}-variant-eval.log'.format(chrom)
            task = {
                'func': gatk_variant_eval,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_stats': os.path.join(basedir, chrom, output_stats),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_vep_cadd_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('vep-cadd-annotation', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_vep_cadd_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'annotated.vep.cadd.c{}.vcf.gz'.format(chrom)
            output_log = 'vep.cadd.annotation.{}.log'.format(chrom)
            task = {
                'func': annotation_vep_cadd,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_ExAC_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('annotate-w-ExAC', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_ExAC_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'ExAC-annotated.c{}.vcf.gz'.format(chrom)
            output_log = 'ExAC-annotate.{}.log'.format(chrom)
            task = {
                'func': annotation_ExAC,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_1000G_annotation_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('annotate-w-1000G', step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotation_1000G_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = '1kg-annotated.c{}.vcf.gz'.format(chrom)
            output_log = '1000G-annotate.{}.log'.format(chrom)
            task = {
                'func': annotation_1000G,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_allele_balance_annotation_tasks(self, parent_tasks,
                                               step_number):
        tasks = []
        stage = self._construct_task_name('allele-balance-annotation',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(annotate_allele_balances_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'allele-balance-{}.log'.format(chrom)
            task = {
                'func': annotate_allele_balances,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_filter_variant_missingness_tasks(self, parent_tasks,
                                                step_number):
        tasks = []
        stage = self._construct_task_name('filter-variant-missingness',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(filter_variant_missingness_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'filter-missingness-{}.log'.format(chrom)
            task = {
                'func': filter_variant_missingness,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_remove_symbolic_deletion_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('remove-symbolic-alleles',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(
            remove_symbolic_deletion_alleles_lsf_params, self.config.email,
            self.config.docker)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'remove-symbolic-alleles-chrom-{}.log'.format(chrom)
            task = {
                'func': remove_symbolic_deletion_alleles,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_decompose_normalize_unique_tasks(self, parent_tasks,
                                                step_number):
        tasks = []
        stage = self._construct_task_name('decompose-normalize-uniq',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(normalize_decompose_unique_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'decompose-normalize-unique-{}.log'.format(chrom)
            task = {
                'func': normalize_decompose_unique,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_calculate_sample_missingness_task(self, parent_tasks,
                                                 step_number):
        stage = self._construct_task_name('calculate-sample-missingness',
                                          step_number)
        output_dir = os.path.join(self.config.rootdir, stage)

        prior_stage_name = parent_tasks[0].stage.name
        input_dir = os.path.join(self.config.rootdir, prior_stage_name)
        input_json_wildcard_path = os.path.join(input_dir, '*', '*.json')

        lsf_params = get_lsf_params(calculate_sample_missingness_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        task = {
            'func': calculate_sample_missingness,
            'params': {
                'in_json':
                input_json_wildcard_path,
                'out_stats':
                os.path.join(output_dir, 'sample-missingness-pct.dat'),
                'out_log':
                os.path.join(output_dir, 'sample-missingness-pct.dat.log'),
            },
            'stage_name': stage,
            'uid': '1-22',
            'drm_params': lsf_params_json,
            'parents': parent_tasks,
        }

        summary_task = self.workflow.add_task(**task)
        return summary_task

    def create_count_sample_missingness_tasks(self, parent_tasks, step_number):
        tasks = []
        stage = self._construct_task_name('count-sample-missingness',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(count_sample_missingness_lsf_params,
                                    self.config.email, self.config.docker,
                                    self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for ptask in parent_tasks:
            chrom = ptask.params['in_chrom']

            # only count missing genotypes on chromosomes 1-22 (not X, Y, or MT)
            if not chrom[0].isdigit(): continue

            output_json = '{chrom}-sample-missingness-counts.json'.format(
                chrom=chrom)
            output_log = '{}-sample-missingness-counts.log'.format(chrom)
            task = {
                'func': count_sample_missingness,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_chrom': chrom,
                    'out_json': os.path.join(basedir, chrom, output_json),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_remove_ac_0_tasks(self, step_number):
        tasks = []
        stage = self._construct_task_name('select-variants-ac-0-removal',
                                          step_number)
        basedir = os.path.join(self.config.rootdir, stage)

        lsf_params = get_lsf_params(
            gatk_select_variants_remove_ac_0_lsf_params, self.config.email,
            self.config.docker, self.config.drm_queue)
        lsf_params_json = to_json(lsf_params)

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'combined.c{chrom}.vcf.gz'.format(chrom=chrom)
            output_log = 'select-variants-chrom-{}-gatk.log'.format(chrom)
            task = {
                'func': gatk_select_variants_remove_ac_0,
                'params': {
                    'in_chrom': chrom,
                    'in_vcf': vcf,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'out_log': os.path.join(basedir, chrom, output_log),
                },
                'stage_name': stage,
                'uid': '{chrom}'.format(chrom=chrom),
                'drm_params': lsf_params_json,
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def _construct_task_name(self, name, number):
        task_name = '{}-{}'.format(number, name)
        return task_name
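
# Hypothetical driver for the Pipeline class above (not part of the original example).
# `Config` and the command-line flags are assumptions: Config stands in for whatever
# project object exposes .db, .rootdir, .project_name, .chroms, .vcfs, .email, .docker
# and .drm_queue, as used by the methods above.
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', required=True, help='path to the project config file')
    parser.add_argument('--drm', default='lsf')
    parser.add_argument('--restart', action='store_true')
    parser.add_argument('--skip-confirm', action='store_true')
    parser.add_argument('--task-flush', type=int, default=30)
    args = parser.parse_args()

    config = Config(args.config)  # assumed constructor
    pipeline = Pipeline(config, args.drm, args.restart, args.skip_confirm)
    pipeline.run(args.task_flush)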
Example #4
def main():
    args = parse_args()

    cosmos = Cosmos(
        "sqlite:///%s/sqlite.db" % os.path.dirname(os.path.abspath(__file__)),
        default_drm="awsbatch",
        default_drm_options=dict(
            container_image=args.container_image,
            s3_prefix_for_command_script_temp_files=args.s3_prefix_for_command_script_temp_files,
            # only retry on spot instance death
            retry_only_if_status_reason_matches="Host EC2 .+ terminated.",
        ),
        default_queue=args.default_queue,
    )

    cosmos.initdb()

    # sp.check_call("mkdir -p analysis_output/ex1", shell=True)
    # os.chdir("analysis_output/ex1")
    workflow = cosmos.start(f"Evaluate_{args.id}", restart=True, skip_confirm=True)

    parameters = np.load(f"optimize_awsbatch/parameters/{args.id}.npy")

    for i, par in enumerate(parameters):
        parameters_ = dict(
            mean_weight=par[0],
            c_w=par[1],
            tau_pos=par[2],
            tau_neg=par[3],
            A_pos=par[4],
            A_neg=par[5],
            weight_decay=par[6],
            n_filters=25,
            time_max=250,
            crop=20,
            kernel_size=16,
            stride=4,
            intensity=127.5,
            c_w_min=None,
            c_l=True,
            network_type="LC_SNN",
        )
        workflow.add_task(
            func=evaluate,
            params=dict(
                parameters=parameters_,
                out_s3_uri=f"{args.out_s3_uri}/scores/{args.id}/{i}.json",
                sleep=args.sleep,
                train=args.train,
                calibrate=args.calibrate,
                test=args.test
            ),
            uid=str(i),
            time_req=None,
            max_attempts=args.max_attempts,
            core_req=args.core_req,
            mem_req=args.mem_req,
        )
    workflow.run()

    sys.exit(0 if workflow.successful else 1)
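
# Hypothetical shape of the `evaluate` task function used above; the real implementation
# is defined elsewhere in the project. As say() in Example #2 shows, a Cosmos task function
# receives its params as keyword arguments and returns the shell command to run. The
# script name and flags below are placeholders.
def evaluate(parameters, out_s3_uri, sleep, train, calibrate, test):
    # `parameters` would be serialized and forwarded to the training script; omitted here
    flags = ' '.join(flag for flag, enabled in
                     [('--train', train), ('--calibrate', calibrate), ('--test', test)] if enabled)
    return 'sleep {sleep} && python evaluate_model.py {flags} --out-s3-uri {out}'.format(
        sleep=sleep, flags=flags, out=out_s3_uri)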
Example #5
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config

        self.cosmos = Cosmos(database_url='sqlite:///{}'.format(
            self.config.db),
                             get_submit_args=default_get_submit_args,
                             default_drm=drm)

        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful to False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs',
                                                   task.stage.name, task.uid)
        self.workflow.run(set_successful=False,
                          log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        partition_tasks = self.create_vcf_partition_tasks()
        plink_pipeline_tasks = self.create_plink_pipeline_tasks(
            partition_tasks)
        aggregate_mie_stats_tasks = self.create_aggregate_mie_stats_tasks(
            plink_pipeline_tasks)

    def create_aggregate_mie_stats_tasks(self, parent_tasks):
        tasks = []
        stage = '3-aggregate-mie-stats'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        input_dir = os.path.join(self.config.rootdir,
                                 parent_tasks[0].stage.name)

        parent_snp_tranche_tasks = [
            task for task in parent_tasks
            if task.params['type'] == 'snps' and task.params['method'] == 'tranche'
        ]

        parent_snp_percentile_tasks = [
            task for task in parent_tasks
            if task.params['type'] == 'snps' and task.params['method'] == 'percentile'
        ]

        parent_indel_tranche_tasks = [
            task for task in parent_tasks
            if task.params['type'] == 'indels' and task.params['method'] == 'tranche'
        ]

        parent_indel_percentile_tasks = [
            task for task in parent_tasks
            if task.params['type'] == 'indels' and task.params['method'] == 'percentile'
        ]

        task_groups = (parent_snp_tranche_tasks, parent_snp_percentile_tasks,
                       parent_indel_tranche_tasks,
                       parent_indel_percentile_tasks)

        for tgroup in task_groups:
            category = tgroup[0].params['type']
            method = tgroup[0].params['method']
            out_filename = '.'.join([category, method, 'tsv'])
            output_file = os.path.join(basedir, out_filename)
            task = {
                'func':
                aggregate_mie_statistics,
                'params': {
                    'in_category': category,
                    'in_method': method,
                    'in_dir': input_dir,
                    'out_file': output_file,
                },
                'stage_name':
                stage,
                'uid':
                '{category}:{method}'.format(method=method, category=category),
                'drm_params':
                to_json(aggregate_mie_statistics_lsf_params(email)),
                'parents':
                tgroup,
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_plink_pipeline_tasks(self, parent_tasks):
        tasks = []
        stage = '2-plink-pipeline'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['in_chrom']
            label = ptask.params['in_label']
            method = ptask.params['in_method']
            category = ptask.params['in_type']

            output_dir = os.path.join(basedir, category, method, label, chrom)
            #            ensure_directory(output_dir)

            task = {
                'func':
                plink_pipeline,
                'params': {
                    'in_vcf': ptask.params['out_vcf'],
                    'in_trio_fam': self.config.plink_fam_file,
                    'chrom': chrom,
                    'type': category,
                    'method': method,
                    'label': label,
                    'out_dir': output_dir,
                },
                'stage_name':
                stage,
                'uid':
                '{category}:{method}:{label}:{chrom}'.format(chrom=chrom,
                                                             method=method,
                                                             category=category,
                                                             label=label),
                'drm_params':
                to_json(plink_pipeline_lsf_params(email)),
                'parents': [ptask],
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks

    def create_vcf_partition_tasks(self):
        all_tasks = []
        cases = (
            ('tranche', 'snps', self.config.tranche_intervals['snps']),
            ('tranche', 'indels', self.config.tranche_intervals['indels']),
            ('percentile', 'snps', self.config.percentiles['snps']),
            ('percentile', 'indels', self.config.percentiles['indels']),
        )

        for case in cases:
            tasks = self.generate_vcf_partition_tasks(*case)
            all_tasks.extend(tasks)

        return all_tasks

    def generate_vcf_partition_tasks(self, method, category, intervals):
        # method: 'tranche' or 'percentile'
        # category: 'snps' or 'indels'
        # label:
        #     tranche : 1, 2 or 3
        #     percentile : 10, 20, 30, 40, 50, 60, 70, 80, 90, 100
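        #
        # For illustration only (values assumed, not from any real config), `intervals`
        # maps each label to a (min_vqslod, max_vqslod) pair, e.g.
        #     {1: (99.9, 100.0), 2: (99.0, 99.9), 3: (90.0, 99.0)}
        # which create_vcf_partition_chromosome_tasks passes through as
        # in_min_vqslod / in_max_vqslod.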
        tasks = []
        for label in sorted(intervals.keys()):
            interval = intervals[label]
            partition_tasks = self.create_vcf_partition_chromosome_tasks(
                method=method,
                label=str(label),
                category=category,
                interval=interval,
            )
            tasks.extend(partition_tasks)

        return tasks

    def create_vcf_partition_chromosome_tasks(self, method, label, category,
                                              interval):
        tasks = []
        stage = '1-partition-vcfs'
        basedir = os.path.join(self.config.rootdir, stage, category, method,
                               label)
        email = self.config.email

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'selected.c{chrom}.vcf.gz'.format(chrom=chrom)
            task = {
                'func':
                vcf_partition,
                'params': {
                    'in_vcf': vcf,
                    'out_vcf': os.path.join(basedir, chrom, output_vcf),
                    'in_min_vqslod': interval[0],
                    'in_max_vqslod': interval[1],
                    'in_samples': self.config.control_samples_file,
                    'in_type': category,
                    'in_method': method,
                    'in_chrom': chrom,
                    'in_label': label,
                },
                'stage_name':
                stage,
                'uid':
                '{category}:{method}:{label}:{chrom}'.format(chrom=chrom,
                                                             method=method,
                                                             category=category,
                                                             label=label),
                'drm_params':
                to_json(vcf_partition_lsf_params(email)),
            }
            tasks.append(self.workflow.add_task(**task))

        return tasks
Example #6
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config

        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm
        )

        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful to False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs', task.stage.name, task.uid)
        self.workflow.run(set_successful=False, log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        filter_biallelic_snps_tasks = self.create_filter_biallelic_snps_tasks()
        plink_binary_tasks = self.create_plink_binary_tasks(filter_biallelic_snps_tasks)
        plink_ld_prune_tasks = self.create_plink_ld_prune_tasks(plink_binary_tasks)
        plink_extract_prune_tasks = self.create_plink_extract_prune_tasks(plink_ld_prune_tasks)
        plink_merge_prune_files_task = self.create_plink_merge_prune_file_task(plink_extract_prune_tasks)
        eigenstrat_task = self.create_eigenstrat_smartpca_task(plink_merge_prune_files_task)
        data_frame_task = self.create_data_frame_task(eigenstrat_task)

    def create_data_frame_task(self, parent_task):
        stage = '7-make-data-frame'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        pca_evec_file = os.path.join(
            parent_task.params['out_prj_dir'],
            'merged.eigenstrat.pca.evec',
        )

        out_file = os.path.join(basedir, 'merged.eigenstrat.pca.evec.tsv')

        task = {
            'func' : create_evec_data_frame,
            'params' : {
                'in_file' : pca_evec_file,
                'out_file' : out_file,
            },
            'stage_name' : stage,
            'uid' : 'all-chroms',
            'drm_params' :
                to_json(create_evec_data_frame_lsf_params(email)),
            'parents' : [ parent_task ],
        }

        df_task = self.workflow.add_task(**task)

        return df_task

    def create_eigenstrat_smartpca_task(self, parent_task):
        stage = '6-eigenstrat-smartpca'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        ped_file = "{}.ped".format(parent_task.params['out_path'])
        map_file = "{}.map".format(parent_task.params['out_path'])

        task = {
            'func' : eigenstrat_smartpca_analysis,
            'params' : {
                'in_ped_file' : ped_file,
                'in_map_file' : map_file,
                'out_prj_dir' : basedir,
            },
            'stage_name' : stage,
            'uid' : 'all-chroms',
            'drm_params' :
                to_json(eigenstrat_smartpca_analysis_lsf_params(email)),
            'parents' : [ parent_task ],
        }

        eigenstrat_task = self.workflow.add_task(**task)

        return eigenstrat_task

    def create_plink_merge_prune_file_task(self, parent_tasks):
        stage = '5-plink-merge-prune-files'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        parent_tasks_sorted = sorted(parent_tasks, key=lambda t: t.id)

        first_task = parent_tasks_sorted[0]
        remaining_tasks = parent_tasks_sorted[1:]

        merge_list_file = os.path.join(basedir, 'allfiles.txt')
        self._create_merge_list(merge_list_file, remaining_tasks)

        output_path = os.path.join(basedir, 'merged')

        task = {
            'func' : plink_merge_pruned_files,
            'params' : {
                'in_ref' : first_task.params['out_path'],
                'in_merge_file' : merge_list_file,
                'out_path' : output_path,
            },
            'stage_name' : stage,
            'uid' : 'all-chroms',
            'drm_params' :
                to_json(plink_merge_pruned_files_lsf_params(email)),
            'parents' : parent_tasks_sorted,
        }

        merge_task = self.workflow.add_task(**task)

        return merge_task


    def _create_merge_list(self, merge_file, tasks):
        ensure_directory(os.path.dirname(merge_file))
        with open(merge_file, 'w') as f:
            for t in tasks:
                print(t.params['out_path'], file=f)

    def create_plink_extract_prune_tasks(self, parent_tasks):
        tasks = []
        stage = '4-plink-extract-prune'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            plink_extract_file = "{}.prune.in".format(ptask.params['out_path'])
            orig_binary_data = ptask.params['in_path']
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom, 'c{}.extracted'.format(chrom))

            task = {
                'func' : plink_extract_prune,
                'params' : {
                    'in_path' : orig_binary_data,
                    'in_extract' : plink_extract_file,
                    'out_path' : output_path,
                    'chrom' : chrom,
                },
                'stage_name' : stage,
                'uid' : '{chrom}'.format(chrom=chrom),
                'drm_params' :
                    to_json(plink_extract_prune_lsf_params(email)),
                'parents' : [ptask],
            }
            tasks.append( self.workflow.add_task(**task) )

        return tasks

    def create_plink_ld_prune_tasks(self, parent_tasks):
        tasks = []
        stage = '3-plink-ld-prune'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom, 'c{}-pruned'.format(chrom))

            task = {
                'func' : plink_ld_prune,
                'params' : {
                    'in_path' : ptask.params['out_path'],
                    'out_path' : output_path,
                    'chrom' : chrom,
                },
                'stage_name' : stage,
                'uid' : '{chrom}'.format(chrom=chrom),
                'drm_params' :
                    to_json(plink_ld_prune_lsf_params(email)),
                'parents' : [ptask],
            }
            tasks.append( self.workflow.add_task(**task) )

        return tasks

    def create_plink_binary_tasks(self, parent_tasks):
        tasks = []
        stage = '2-plink-binaries'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for ptask in sorted(parent_tasks, key=lambda t: t.id):
            chrom = ptask.params['chrom']
            output_path = os.path.join(basedir, chrom, 'c{}'.format(chrom))

            task = {
                'func' : plink_binary,
                'params' : {
                    'in_vcf' : ptask.params['out_vcf'],
                    'out_path' : output_path,
                    'chrom' : chrom,
                },
                'stage_name' : stage,
                'uid' : '{chrom}'.format(chrom=chrom),
                'drm_params' :
                    to_json(plink_binary_lsf_params(email)),
                'parents' : [ptask],
            }
            tasks.append( self.workflow.add_task(**task) )

        return tasks

    def create_filter_biallelic_snps_tasks(self):
        tasks = []
        stage = '1-filter-biallelic-snps'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email

        for chrom in self.config.chroms:
            vcf = self.config.vcfs[chrom]
            output_vcf = 'filtered.snps.c{chrom}.vcf.gz'.format(chrom=chrom)
            task = {
                'func'   : filter_biallelic_snps,
                'params' : {
                    'chrom' : chrom,
                    'in_vcf' : vcf,
                    'out_vcf' : os.path.join(basedir, chrom, output_vcf),
                    'in_min_vqslod' : self.config.vqslod_threshold,
                },
                'stage_name' : stage,
                'uid' : '{chrom}'.format(chrom=chrom),
                'drm_params' :
                    to_json(filter_biallelic_snps_lsf_params(email)),
            }
            tasks.append( self.workflow.add_task(**task) )

        return tasks
Example #7
class Pipeline(object):
    def __init__(self, config, drm, restart):
        self.config = config

        self.cosmos = Cosmos(
            database_url='sqlite:///{}'.format(self.config.db),
            get_submit_args=default_get_submit_args,
            default_drm=drm
        )

        self.cosmos.initdb()

        primary_logfile = os.path.join(
            self.config.rootdir,
            '{}.log'.format(self.config.project_name),
        )

        self.workflow = self.cosmos.start(
            self.config.project_name,
            primary_log_path=primary_logfile,
            restart=restart,
        )

        self.setup_pipeline()

    def setup_pipeline(self):
        self.construct_pipeline()
        self.workflow.make_output_dirs()

    def run(self):
        # set set_successful to False if you intend to add more tasks to the
        # pipeline later
        custom_log_dir = lambda task: os.path.join(self.config.rootdir, 'logs', task.stage.name, task.uid)
        self.workflow.run(set_successful=False, log_out_dir_func=custom_log_dir)

    def construct_pipeline(self):
        speedseq_tasks = self.create_speedseq_realign_tasks()

    def create_speedseq_realign_tasks(self):
        tasks = []
        stage = '1-exec-speedseq-realign'
        basedir = os.path.join(self.config.rootdir, stage)
        email = self.config.email
        lsf_job_group = self.config.drm_job_group
        sample_data = self.config.sample_data

        for sample_id in sample_data.keys():
            bam_paths = sample_data[sample_id]['bams']
            sample_name = sample_data[sample_id]['meta']['original-name']
            output_prefix = os.path.join(basedir, sample_id, "{}.b38.realign".format(sample_id))
            tmpdir = os.path.join(basedir, sample_id, 'tmpdir')
            input_bams = ' '.join(bam_paths)

            task = {
                'func'   : exec_speedseq,
                'params' : {
                    'output_prefix' : output_prefix,
                    'tmpdir' : tmpdir,
                    'input_bams' : input_bams,
                },
                'stage_name' : stage,
                'uid' : sample_id,
                'drm_params' :
                    to_json(exec_speedseq_lsf_params(email, lsf_job_group)),
            }
            tasks.append( self.workflow.add_task(**task) )

        return tasks