コード例 #1
0
ファイル: __init__.py プロジェクト: diljotgrewal/wgs
def create_lumpy_workflow(lumpy_vcf,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=False):
    """Build a pypeliner workflow that runs lumpyexpress and writes lumpy_vcf.

    Either or both of tumour_bam / normal_bam may be given. For each bam that
    is present, a preprocessing subworkflow produces sorted discordant and
    splitter bams, and the final job name records which inputs were supplied
    (e.g. 'run_lumpy_normal_tumour' when both are passed).
    """
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'

    # Discordant/splitter inputs stay None for any sample that was not given.
    normal_disc = normal_split = None
    tumour_disc = tumour_split = None

    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(normal_bam,
                  mgd.TempOutputFile('normal.discordants.sorted.bam'),
                  mgd.TempOutputFile('normal.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(tumour_bam,
                  mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                  mgd.TempOutputFile('tumour.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    # Run lumpyexpress over whichever inputs exist; absent samples pass None.
    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(mgd.OutputFile(lumpy_vcf),
              config.default_params('breakpoint_calling')['lumpy_paths']),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
コード例 #2
0
def create_consensus_workflow(
        destruct_breakpoints,
        lumpy_vcf,
        output,
        chromosomes
):
    """Parse lumpy and destruct breakpoint calls and emit consensus calls.

    Each caller's raw output is parsed into a temporary csv (optionally
    restricted to the given chromosomes); the two csvs are then merged by the
    consensus_calls task into the final output file.
    """
    workflow = pypeliner.workflow.Workflow()

    params = config.default_params('breakpoint_calling')

    # Parse the raw lumpy vcf into a csv.
    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(mgd.InputFile(lumpy_vcf),
              mgd.TempOutputFile('lumpy.csv'),
              params["parse_lumpy"]),
        kwargs={'chromosomes': chromosomes})

    # Parse the destruct breakpoint table the same way.
    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(mgd.InputFile(destruct_breakpoints),
              mgd.TempOutputFile('destruct.csv'),
              params["parse_destruct"]),
        kwargs={'chromosomes': chromosomes})

    # Combine both parsed call sets into the consensus output.
    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(mgd.TempInputFile('destruct.csv'),
              mgd.TempInputFile('lumpy.csv'),
              mgd.OutputFile(output, extensions=['.yaml']),
              params['consensus']))

    return workflow
コード例 #3
0
ファイル: __init__.py プロジェクト: DouglasAbrams/wgs
def create_remixt_workflow(
    tumour_path,
    normal_path,
    breakpoints,
    sample_id,
    remixt_results_filename,
    remixt_brk_cn_csv,
    remixt_cn_csv,
    remixt_minor_modes_csv,
    remixt_mix_csv,
    remixt_read_depth_csv,
    remixt_stats_csv,
    remixt_refdata,
    reference,
    single_node=False,
):
    """Build a pypeliner workflow that runs remixt and parses its results.

    Breakpoints are filtered (or an empty breakpoint file is written when
    ``breakpoints`` is None), remixt is run either as a single local job or
    via the upstream remixt bam workflow, and the results file is finally
    split into the six per-table csv outputs.
    """
    # Default docker context applied to every job unless overridden per-job.
    ctx = {'docker_image': config.containers('wgs')}

    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    # remixt needs the fasta and its .fai index; the index path is assumed
    # to be the reference path plus '.fai'.
    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        # No breakpoints supplied: materialise an empty list as a temp obj
        # and write an empty breakpoints csv from it so downstream jobs have
        # a file to consume.
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    else:
        # Filter destruct breakpoints by the configured minimum read support.
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(mgd.InputFile(breakpoints),
                  mgd.TempOutputFile('filtered_breakpoints.csv'),
                  params['min_num_reads']))

    if single_node:
        # Run the whole remixt pipeline as one local job.
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        # Delegate to the upstream remixt bam workflow; the normal sample is
        # registered under the id sample_id + 'N'.
        workflow.subworkflow(name='remixt',
                             func="remixt.workflow.create_remixt_bam_workflow",
                             ctx={
                                 'docker_image': config.containers('remixt'),
                                 'walltime': '48:00'
                             },
                             args=(
                                 mgd.TempInputFile('filtered_breakpoints.csv'),
                                 {
                                     sample_id:
                                     mgd.InputFile(tumour_path,
                                                   extensions=['.bai']),
                                     sample_id + 'N':
                                     mgd.InputFile(normal_path,
                                                   extensions=['.bai'])
                                 },
                                 {
                                     sample_id:
                                     mgd.OutputFile(remixt_results_filename)
                                 },
                                 mgd.TempSpace('remixt_raw_dir'),
                                 remixt_config,
                                 remixt_refdata,
                             ),
                             kwargs={
                                 'normal_id': sample_id + 'N',
                             })

    # Split the remixt results file into one csv per table; the output list
    # and the table-key list are positionally matched.
    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(mgd.InputFile(remixt_results_filename), [
            mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
        ], ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth',
            '/stats'], mgd.TempSpace('tempdir_parse')))

    return workflow
コード例 #4
0
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf,
        sample_id, reference, chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    """Build a pypeliner workflow that runs TITAN copy number calling.

    The workflow calls heterozygous positions with museq, computes read
    counts for both bams, runs titan across a grid of (num_clusters, ploidy)
    combinations, annotates/parses each solution, selects the optimal one
    into the final outputs, and tars all per-solution data.
    """
    cn_params = config.default_params('copynumber_calling')

    # One (num_clusters, ploidy) chunk per configured titan interval; titan
    # jobs below fan out over these two axes.
    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    # targets is optional; None is passed through to calc_correctreads_wig.
    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    # Split the genome into intervals for parallel museq calling.
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00', ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        # Run museq over all intervals in a single multi-core job.
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        # Run museq per interval, then merge the per-interval vcfs.
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00', ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    # Convert museq calls to allele counts at the known het positions.
    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    # Read-depth wigs for tumour and normal.
    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    # GC/mappability-correct the read counts.
    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    # Run titan once per (numclusters, ploidy) combination.
    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00', ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    # Convert titan output to IGV-compatible segments plus a segs csv.
    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    # Annotate segments with gene names from the pygenes gtf.
    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    # Archive every per-solution file into a single tar output.
    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
コード例 #5
0
def create_mutect_workflow(normal_bam,
                           tumour_bam,
                           snv_vcf,
                           snv_maf,
                           reference,
                           reference_vep,
                           chromosomes,
                           normal_id,
                           tumour_id,
                           single_node=None):
    """Build a pypeliner workflow that calls SNVs with mutect.

    Intervals are generated from the reference, mutect runs either as one
    multi-core job or per interval followed by a merge, the merged vcf is
    normalized and finalised into snv_vcf, and a vcf2maf subworkflow
    converts it to snv_maf.
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    # Split the genome into intervals for parallel calling.
    workflow.transform(name='generate_intervals',
                       func='wgs.workflows.mutect.tasks.generate_intervals',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='1:00',
                       ),
                       ret=mgd.OutputChunks('interval'),
                       args=(reference, chromosomes),
                       kwargs={'size': params['split_size']})

    if single_node:
        # All intervals in one multi-core job, producing the merged vcf directly.
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(mgd.TempSpace("run_mutect_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam)),
        )
    else:
        # One mutect job per interval, then merge the per-interval vcfs.
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(mgd.TempOutputFile('mutect.vcf', 'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam),
                  mgd.TempSpace('mutect_temp', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    # Convert the mutect SNV vcf to MAF. NOTE: this job was previously
    # misnamed 'strelka_indel_maf' (copy-paste from the strelka workflow);
    # renamed to reflect what it actually converts.
    workflow.subworkflow(name="mutect_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
コード例 #6
0
def create_museq_workflow(snv_vcf,
                          museqportrait_pdf,
                          reference,
                          chromosomes,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
    """Build a pypeliner workflow that calls variants with mutationseq.

    Either or both bams may be supplied; the job name records which were
    used ('run_museq_tumour_normal' when both). Calling runs as one
    multi-core job or per interval followed by a merge; the finalised vcf
    is then summarised by museqportrait.
    """
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    # single-sample mode unless both tumour and normal were provided
    # (was: `False if name == 'run_museq_tumour_normal' else True`)
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    # Split the genome into intervals for parallel calling.
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        # All intervals in one multi-core job, producing the merged vcf directly.
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(memory=15,
                                                       walltime='48:00',
                                                       ncpus='8',
                                                       disk=600),
                           func='wgs.utils.museq_utils.run_museq_one_job',
                           args=(
                               mgd.TempSpace("run_museq_temp"),
                               mgd.TempOutputFile('merged.vcf'),
                               reference,
                               mgd.InputChunks('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam':
                               tumour_bam,
                               'normal_bam':
                               normal_bam,
                               'museq_docker_image':
                               config.containers('mutationseq'),
                               'vcftools_docker_image':
                               config.containers('vcftools')
                           })
    else:
        # One museq job per interval, then merge the per-interval vcfs.
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(
                               memory=15,
                               walltime='24:00',
                           ),
                           axes=('interval', ),
                           func='wgs.utils.museq_utils.run_museq',
                           args=(
                               mgd.TempOutputFile('museq.vcf', 'interval'),
                               mgd.TempOutputFile('museq.log', 'interval'),
                               reference,
                               mgd.InputInstance('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'docker_image':
                               config.containers('mutationseq'),
                           })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(snv_vcf, extensions=['.tbi',
                                                               '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    # Summary plots/metrics for the final vcf; optional reference resources
    # (thousand genomes, dbsnp, germline refdata) are passed through as-is.
    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
コード例 #7
0
def create_consensus_workflow(museq_germline, museq_snv, strelka_snv,
                              strelka_indel, somatic_calls, somatic_snpeff,
                              somatic_ma, somatic_ids, indel_calls,
                              indel_snpeff, indel_ma, indel_ids,
                              germline_calls, germline_snpeff, germline_ma,
                              germline_ids, refdir):
    """Build a pypeliner workflow producing consensus variant call tables.

    Germline (museq) and indel (strelka) vcfs are parsed straight into
    their final outputs; the two somatic SNV vcfs (museq + strelka) are
    parsed into temp csvs and then merged pairwise into the somatic
    calls/snpeff/ma/ids outputs.
    """
    params = config.default_params('variant_calling')
    # Chromosome list comes from the reference directory metadata.
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    # Germline museq calls go directly to the germline output tables.
    workflow.transform(
        name='parse_museq_germlines',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_germline, extensions=['.csi', '.tbi']),
              mgd.OutputFile(germline_calls, extensions=['.yaml']),
              mgd.OutputFile(germline_snpeff, extensions=['.yaml']),
              mgd.OutputFile(germline_ma, extensions=['.yaml']),
              mgd.OutputFile(germline_ids,
                             extensions=['.yaml']), params["parse_museq"],
              chromosomes, mgd.TempSpace("tempdir_parse_germlines")),
    )

    # Strelka indel calls go directly to the indel output tables.
    workflow.transform(
        name='parse_strelka_indel',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_indel, extensions=['.csi', '.tbi']),
              mgd.OutputFile(indel_calls, extensions=['.yaml']),
              mgd.OutputFile(indel_snpeff, extensions=['.yaml']),
              mgd.OutputFile(indel_ma, extensions=['.yaml']),
              mgd.OutputFile(indel_ids,
                             extensions=['.yaml']), params["parse_strelka"],
              chromosomes, mgd.TempSpace("tempdir_strelka_indel")),
    )

    # Somatic SNVs from both callers are parsed into temp csvs first, so
    # they can be merged below.
    workflow.transform(
        name='parse_museq_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('museq_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_snpeff.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ids.csv',
                                 extensions=['.yaml']), params["parse_museq"],
              chromosomes, mgd.TempSpace("tempdir_parse_museq_snv")),
    )

    workflow.transform(
        name='parse_strelka_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('strelka_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_snpeff.csv',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ids.csv', extensions=['.yaml']),
              params["parse_strelka"], chromosomes,
              mgd.TempSpace("tempdir_parse_strelka_snv")),
    )

    # Merge each pair of per-caller tables into a somatic consensus table.
    workflow.transform(
        name='merge_snvs',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snv.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_calls, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_snpeff',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_snpeff.csv',
                                  extensions=['.yaml']),
                mgd.TempInputFile('museq_snpeff.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_snpeff, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ma',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ma.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ma.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ma, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ids',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ids.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ids.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ids, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    return workflow
コード例 #8
0
ファイル: __init__.py プロジェクト: diljotgrewal/wgs
def lumpy_preprocess_workflow(bamfile,
                              discordants_sorted_bam,
                              splitters_sorted_bam,
                              single_node=False):
    """Build the lumpy preprocessing workflow for one BAM.

    Produces the sorted discordant-read and split-read BAMs that
    lumpyexpress consumes downstream.

    :param bamfile: path to the input BAM
    :param discordants_sorted_bam: output path, sorted discordant reads
    :param splitters_sorted_bam: output path, sorted split reads
    :param single_node: if True, run the whole chain as one job
    """
    workflow = pypeliner.workflow.Workflow()

    if single_node:
        # One long job performs extraction and sorting in a single task.
        workflow.transform(
            name='run_lumpy_preprocess',
            ctx=helpers.get_default_ctx(memory=10, walltime='96:00', disk=300),
            func='wgs.workflows.lumpy.tasks.run_lumpy_preprocess',
            args=(
                mgd.InputFile(bamfile),
                mgd.OutputFile(discordants_sorted_bam),
                mgd.OutputFile(splitters_sorted_bam),
                mgd.TempSpace("lumpy_preprocess_temp"),
                config.default_params('breakpoint_calling')['lumpy_paths'],
            ),
            kwargs={
                'lumpy_docker_image': config.containers('lumpy'),
                'samtools_docker_image': config.containers('samtools'),
            },
        )
        return workflow

    # Otherwise split the work into four smaller jobs: extract discordant
    # reads, extract split reads, then sort each of the two BAMs.
    workflow.transform(
        name='run_samtools_view_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        func='wgs.workflows.lumpy.tasks.run_samtools_view',
        args=(
            mgd.InputFile(bamfile),
            mgd.TempOutputFile('normal.discordants.unsorted.bam'),
        ),
        kwargs={'docker_image': config.containers('samtools')},
    )

    workflow.transform(
        name='run_lumpy_extract_split_reads_bwamem_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpy_extract_split_reads_bwamem',
        args=(
            mgd.InputFile(bamfile),
            mgd.TempOutputFile('normal.splitters.unsorted.bam'),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={'docker_image': config.containers('lumpy')},
    )

    workflow.transform(
        name='run_samtools_sort_discordants_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        func='wgs.workflows.lumpy.tasks.run_samtools_sort',
        args=(
            mgd.TempInputFile('normal.discordants.unsorted.bam'),
            mgd.OutputFile(discordants_sorted_bam),
        ),
        kwargs={'docker_image': config.containers('samtools')},
    )

    workflow.transform(
        name='run_samtools_sort_splitters_normal',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        func='wgs.workflows.lumpy.tasks.run_samtools_sort',
        args=(
            mgd.TempInputFile('normal.splitters.unsorted.bam'),
            mgd.OutputFile(splitters_sorted_bam),
        ),
        kwargs={'docker_image': config.containers('samtools')},
    )

    return workflow
コード例 #9
0
def create_hmmcopy_workflow(
    bam_file,
    sample_id,
    bias_pdf,
    correction_pdf,
    hmmcopy_pdf,
    hmmcopy_table,
    pygenes_table,
    chromosomes,
    map_wig,
    gc_wig,
    pygenes_gtf,
):
    """Build the HMMcopy copy-number workflow for one sample.

    Chains bin read-counting -> GC/mappability correction -> HMMcopy
    segmentation -> plotting -> gene annotation of the segments.
    """
    cn_params = config.default_params()['copynumber_calling']

    workflow = pypeliner.workflow.Workflow()

    # Count reads per genomic bin into a wig file.
    workflow.transform(
        name='hmmcopy_readcounter',
        ctx=helpers.get_default_ctx(memory=5, walltime='2:00'),
        func='wgs.workflows.hmmcopy.tasks.hmmcopy_readcounter',
        args=(
            mgd.InputFile(bam_file, extensions=['.bai']),
            mgd.TempOutputFile('infile.wig'),
            chromosomes,
            cn_params['readcounter'],
        ),
    )

    # Correct raw counts for GC content and mappability.
    workflow.transform(
        name='calc_corr',
        func='wgs.workflows.hmmcopy.tasks.calc_corr',
        args=(
            mgd.TempInputFile('infile.wig'),
            mgd.TempOutputFile('infile_copy.txt'),
            mgd.TempOutputFile('infile_copy.obj'),
            gc_wig,
            map_wig,
            cn_params['map_cutoff'],
        ),
    )

    # Segment the corrected copy signal with HMMcopy.
    workflow.transform(
        name='run_hmmcopy',
        func='wgs.workflows.hmmcopy.tasks.run_hmmcopy',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('infile_copy.txt'),
            mgd.TempOutputFile('hmmcopy_res.obj'),
            mgd.TempOutputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(hmmcopy_table),
            sample_id,
            cn_params['hmmcopy_params'],
        ),
    )

    # Render bias, correction and segmentation plots to pdf.
    workflow.transform(
        name='plot_hmm',
        func='wgs.workflows.hmmcopy.tasks.plot_hmm',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('hmmcopy_res.obj'),
            mgd.TempSpace('correction_plots_dir'),
            mgd.TempSpace('hmmcopy_plots_dir'),
            mgd.OutputFile(bias_pdf),
            mgd.OutputFile(correction_pdf),
            mgd.OutputFile(hmmcopy_pdf),
        ),
    )

    # Annotate segments with overlapping genes from the gtf.
    workflow.transform(
        name='annot_hmm',
        func='wgs.workflows.hmmcopy.tasks.annot_hmm',
        args=(
            mgd.TempInputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(pygenes_table),
            pygenes_gtf,
        ),
    )

    return workflow
コード例 #10
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    """Build the strelka somatic SNV/indel calling workflow.

    Calls somatic variants with strelka (either as one multi-region job
    or split per genomic region), bcftools-normalizes and filters the
    resulting VCFs, and annotates both call sets to MAF via vcf2maf.

    :param normal_bam_file: normal BAM path (a .bai index is expected)
    :param tumour_bam_file: tumour BAM path (a .bai index is expected)
    :param snv_vcf_file: output path for the filtered SNV VCF
    :param snv_maf_file: output path for the annotated SNV MAF
    :param indel_vcf_file: output path for the filtered indel VCF
    :param indel_maf_file: output path for the annotated indel MAF
    :param reference: reference fasta
    :param reference_vep: VEP reference data used by vcf2maf
    :param chromosomes: chromosomes to generate calling intervals for
    :param normal_id: normal sample id written into the MAFs
    :param tumour_id: tumour sample id written into the MAFs
    :param single_node: run strelka as a single job instead of per region
    :param is_exome: forward strelka's exome-mode flag
    """
    params = config.default_params('variant_calling')

    workflow = Workflow(ctx=helpers.get_default_ctx(memory=5,
                                                    walltime='4:00'), )

    # Split the genome into calling regions of params['split_size'].
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(name='strelka_one_node',
                           func="wgs.workflows.strelka.tasks.strelka_one_node",
                           args=(
                               pypeliner.managed.InputFile(normal_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               pypeliner.managed.InputFile(tumour_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               reference,
                               mgd.TempOutputFile('indels.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempOutputFile('snvs.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempSpace('call_genome_segment_tmp'),
                               mgd.InputChunks('regions'),
                               mgd.TempInputObj('known_sizes'),
                           ),
                           kwargs={
                               'is_exome': is_exome,
                           })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # BUGFIX: this was hard-coded to False, silently ignoring
                # the is_exome argument on the per-region path while the
                # single-node path honoured it.
                'is_exome': is_exome,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    # Left-align/normalize, finalise (bgzip+index), then filter each of
    # the two call sets into its final output VCF.
    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    # Annotate each final VCF to MAF with vcf2maf.
    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
コード例 #11
0
def collect_bam_metrics(bam,
                        markdups_metrics,
                        sample_id,
                        refdir,
                        metrics,
                        picard_insert_metrics,
                        picard_insert_pdf,
                        flagstat_metrics,
                        picard_gc_metrics,
                        picard_gc_summary,
                        picard_gc_pdf,
                        picard_wgs_metrics,
                        bam_tdf,
                        picard_mem=8):
    '''
    Build a workflow that calculates bam metrics for a single bam:

    1. picard insert-size metrics (this task also receives the
       flagstat output file)
    2. picard GC-bias metrics
    3. picard wgs metrics
    4. igvtools coverage track (tdf)

    and finally merges them, together with the supplied markdups
    metrics, into a single per-sample metrics csv.

    :param bam: input bam path
    :param markdups_metrics: pre-computed markdups metrics file (input)
    :param sample_id: sample label passed into the merged metrics task
    :param refdir: reference directory; supplies the reference fasta
        and reference_type used here
    :param metrics: output csv (with .yaml sidecar) of merged metrics
    :param picard_insert_metrics: output path, insert-size metrics
    :param picard_insert_pdf: output path, insert-size histogram pdf
    :param flagstat_metrics: output path, samtools flagstat
    :param picard_gc_metrics: output path, GC-bias metrics
    :param picard_gc_summary: output path, GC-bias summary
    :param picard_gc_pdf: output path, GC-bias chart pdf
    :param picard_wgs_metrics: output path, wgs metrics
    :param bam_tdf: output path, igvtools tdf coverage track
    :param picard_mem: memory (GB) passed to the picard tasks' 'mem'
        kwarg as '<N>G'
    '''

    ref_genome = config.refdir_data(refdir)['paths']['reference']

    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']

    # reference_type is forwarded to igvtools for the tdf track.
    reftype = config.refdir_data(refdir)['params']['reference_type']

    workflow = pypeliner.workflow.Workflow()

    # Insert-size metrics; the task is also handed the flagstat output.
    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_gc_metrics),
              mgd.OutputFile(picard_gc_summary), mgd.OutputFile(picard_gc_pdf),
              mgd.TempSpace('picard_gc')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_wgs_metrics), picard_wgs_params,
              mgd.TempSpace('picard_wgs')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(pypeliner.managed.InputFile(bam),
              pypeliner.managed.OutputFile(bam_tdf), reftype),
    )

    # Merge all metrics (plus the externally produced markdups metrics)
    # into the final csv; dtypes come from the module-level dtypes().
    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(mgd.InputFile(flagstat_metrics),
              mgd.InputFile(picard_insert_metrics),
              mgd.InputFile(picard_wgs_metrics),
              mgd.InputFile(markdups_metrics),
              mgd.OutputFile(metrics, extensions=['.yaml']), sample_id),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics']
        })

    return workflow
コード例 #12
0
def align_sample_split(fastq_1,
                       fastq_2,
                       out_file,
                       samtools_flagstat,
                       sample_id,
                       lane_id,
                       sample_info,
                       refdir,
                       picard_mem=2):
    """Align one lane of paired fastqs by splitting into chunks.

    Splits each fastq along a 'split' axis, aligns every chunk with
    bwa mem, sorts the per-chunk bams, merges them into ``out_file``
    and produces a .bai index plus samtools flagstat output.
    """
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    split_size = config.default_params('alignment')['split_size']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    # Chunk the first fastq; this defines the 'split' axis.
    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    # Chunk the second fastq; axes_origin=[] so it follows the existing
    # 'split' axis rather than defining its own.
    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_2),
            pypeliner.managed.TempOutputFile('read_2', 'split',
                                             axes_origin=[]),
            split_size,
        ),
    )

    # Align each chunk pair with bwa mem using 8 threads.
    workflow.transform(
        name='align_bwa_mem',
        axes=('split', ),
        ctx=helpers.get_default_ctx(memory=8, walltime='16:00', ncpus=8),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.TempInputFile('read_1', 'split'),
            pypeliner.managed.TempInputFile('read_2', 'split'),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        },
    )

    # Coordinate-sort every chunk before merging.
    workflow.transform(
        name='sort',
        axes=('split', ),
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
            pypeliner.managed.TempSpace('bam_sort_by_split', 'split'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)},
    )

    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(memory=8, walltime='72:00'),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
            pypeliner.managed.TempSpace('bam_merge_by_split'),
        ),
        kwargs={'mem': picard_mem},
    )

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=('samtools', 'index', pypeliner.managed.InputFile(out_file),
              pypeliner.managed.OutputFile(out_bai)),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=('samtools', 'flagstat', pypeliner.managed.InputFile(out_file),
              '>', pypeliner.managed.OutputFile(samtools_flagstat)),
    )

    return workflow
コード例 #13
0
def create_freebayes_germline_workflow(germline_vcf,
                                       germline_maf,
                                       bam_file,
                                       reference,
                                       reference_vep,
                                       chromosomes,
                                       normal_id,
                                       single_node=None):
    """Build the freebayes germline variant-calling workflow.

    Calls germline variants with freebayes (either as one multi-core
    job or split per interval), merges/normalizes/finalises the VCF,
    then annotates it to MAF with vcf2maf.

    :param germline_vcf: output path for the finalised germline VCF
    :param germline_maf: output path for the annotated germline MAF
    :param bam_file: input bam path
    :param reference: reference fasta
    :param reference_vep: VEP reference data used by vcf2maf
    :param chromosomes: chromosomes to generate calling intervals for
    :param normal_id: sample id recorded in the MAF
    :param single_node: run freebayes as a single job over all intervals
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    # Split the genome into calling intervals of params['split_size'].
    workflow.transform(name='generate_intervals',
                       func='wgs.workflows.freebayes.tasks.generate_intervals',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='1:00',
                       ),
                       ret=mgd.OutputChunks('interval'),
                       args=(reference, chromosomes),
                       kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='freebayes_one_node',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.freebayes.tasks.run_freebayes_one_job',
            args=(mgd.TempSpace("run_freebayes_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(bam_file)))
    else:
        workflow.transform(
            name='freebayes',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.freebayes.tasks.run_freebayes_germline',
            args=(mgd.TempOutputFile('freebayes_germline.vcf',
                                     'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(bam_file),
                  mgd.TempSpace('tempdir_freebayes', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('freebayes_germline.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="freebayes_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(germline_vcf,
                                           extensions=['.tbi', '.csi']),
                             # BUGFIX: the MAF output previously declared
                             # .tbi/.csi index extensions, but a MAF is not
                             # tabix-indexed and the vcf2maf subworkflow does
                             # not produce those sentinels (the strelka maf
                             # subworkflows pass the maf without extensions).
                             mgd.OutputFile(germline_maf),
                             reference_vep,
                         ),
                         kwargs={'normal_id': normal_id})

    return workflow
コード例 #14
0
ファイル: __init__.py プロジェクト: diljotgrewal/wgs
def create_samtools_germline_workflow(germline_vcf,
                                      germline_roh,
                                      bam_file,
                                      reference,
                                      chromosomes,
                                      single_node=None):
    """Build the samtools germline variant-calling workflow.

    Calls variants per interval (or as one job), merges and finalises
    the VCF, then runs ROH calling on the finalised result.
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    # Split the genome into calling intervals.
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.samtools_germline.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']},
    )

    if single_node:
        # One large job covers every interval.
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15, walltime='48:00',
                                        ncpus=8, disk=600),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline_one_job',
            args=(
                mgd.TempSpace("run_samtools_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(bam_file),
            ),
            kwargs={
                'samtools_docker_image': config.containers('samtools'),
                'vcftools_docker_image': config.containers('vcftools'),
            },
        )
    else:
        # One calling job per interval, then merge the per-interval vcfs.
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval', ),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline',
            args=(
                mgd.TempOutputFile('germline.vcf.gz', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(bam_file),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('germline.vcf.gz', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')},
        )

    # Compress/index the merged vcf into the final output.
    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')},
    )

    # Runs-of-homozygosity calling on the finalised vcf.
    workflow.transform(
        name='roh_calling',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.workflows.samtools_germline.tasks.roh_calling',
        args=(
            mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(germline_roh),
        ),
        kwargs={'docker_image': config.containers('vcftools')},
    )

    return workflow