Example No. 1
def realign_bam_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    with open(args['input_yaml']) as input_yaml_fh:
        yamldata = yaml.safe_load(input_yaml_fh)

    samples = list(yamldata.keys())

    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           extensions=['.bai'], axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    outputted_filenames = helpers.expand_list([output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'}
        }
    )

    pyp.run(workflow)
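The workflow above derives its sample list and per-sample input bams from the YAML file passed as args['input_yaml']. As a rough sketch (sample ids and paths are hypothetical), the parsed structure it expects has one top-level key per sample, each providing an 'input' bam path:

# Hypothetical example of the parsed input yaml (yamldata) consumed by realign_bam_workflow
yamldata = {
    'SAMPLE_A': {'input': '/data/SAMPLE_A/SAMPLE_A.bam'},
    'SAMPLE_B': {'input': '/data/SAMPLE_B/SAMPLE_B.bam'},
}
samples = list(yamldata.keys())                          # ['SAMPLE_A', 'SAMPLE_B']
input_bams = {s: yamldata[s]['input'] for s in samples}  # sample id -> bam path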
Example No. 2
def create_lumpy_workflow(lumpy_vcf,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(normal_bam,
                  mgd.TempOutputFile('normal.discordants.sorted.bam'),
                  mgd.TempOutputFile('normal.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(tumour_bam,
                  mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                  mgd.TempOutputFile('tumour.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(mgd.OutputFile(lumpy_vcf),
              config.default_params('breakpoint_calling')['lumpy_paths']),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
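create_lumpy_workflow only builds and returns a pypeliner workflow object; it does not execute anything by itself. A minimal, hedged sketch of running it standalone, mirroring the Pypeline usage from Example No. 1 (the paths and scheduler settings here are placeholder assumptions):

import pypeliner.app

# assumed minimal pypeliner configuration; real runs typically pass more options
args = {'tmpdir': './pypeliner_tmp', 'maxjobs': 4, 'submit': 'local'}
pyp = pypeliner.app.Pypeline(config=args)

workflow = create_lumpy_workflow(
    'results/lumpy.vcf',        # hypothetical output vcf path
    tumour_bam='tumour.bam',    # hypothetical inputs; either bam may be left as None
    normal_bam='normal.bam',
)
pyp.run(workflow)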
Example No. 3
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='48:00',
            disk=400
        ),
        func='wgs.workflows.alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.OutputFile(r1_html),
            mgd.OutputFile(r1_plot),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers("fastqc"),
        }
    )

    workflow.transform(
        name="fastqc_r2",
        func='wgs.workflows.alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='48:00',
            disk=400
        ),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.OutputFile(r2_html),
            mgd.OutputFile(r2_plot),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        }
    )

    return workflow
Example No. 4
def create_destruct_wgs_workflow(tumour_bam,
                                 normal_bam,
                                 raw_breakpoints,
                                 raw_library,
                                 breakpoints,
                                 library,
                                 reads,
                                 sample_id,
                                 reference,
                                 destruct_refdata,
                                 gtf,
                                 mappability,
                                 single_node=False):

    destruct_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
        'gtf_filename': gtf
    }

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ctx=helpers.get_default_ctx(
                           docker_image=config.containers('destruct'),
                           walltime="48:00",
                       ),
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(destruct_refdata, destruct_config))

    if single_node:
        workflow.transform(
            name='destruct_local',
            ctx=helpers.get_default_ctx(walltime='120:00', disk=800),
            func='wgs.workflows.destruct_wgs.tasks.run_destruct_local',
            args=(
                mgd.TempSpace("destruct_local_temp"),
                mgd.InputFile(tumour_bam),
                mgd.InputFile(normal_bam),
                sample_id,
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ),
            kwargs={
                'ncpus': 16,
                'docker_image': config.containers('destruct')
            })
    else:
        workflow.subworkflow(
            name='destruct_parallel',
            ctx=helpers.get_default_ctx(
                docker_image=config.containers('destruct'),
                walltime="48:00",
            ),
            # refers to the separate destruct package
            func='destruct.workflow.create_destruct_workflow',
            args=(
                {
                    sample_id: mgd.InputFile(tumour_bam),
                    sample_id + 'N': mgd.InputFile(normal_bam),
                },
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ))

    workflow.commandline(
        name='filter_annotate_breakpoints',
        ctx=helpers.get_default_ctx(docker_image=config.containers('destruct'),
                                    memory=8,
                                    walltime='8:00'),
        args=(
            'filter_annotate_breakpoints.py',
            '--breakpoints',
            mgd.TempInputFile("raw_breakpoints"),
            '--library',
            mgd.TempInputFile("raw_library"),
            '--control_ids',
            sample_id + 'N',
            '--out_breakpoints',
            mgd.TempOutputFile("filter_annotate_breakpoints_output"),
            '--out_library',
            mgd.TempOutputFile("library"),
        ))

    workflow.transform(
        name='mappability_annotate_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func='wgs.workflows.destruct_wgs.flag_mappability.main',
        args=(
            mgd.TempInputFile("filter_annotate_breakpoints_output"),
            mgd.TempOutputFile("breakpoints"),
            mappability,
        ))

    workflow.transform(
        name='finalize_raw_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("raw_breakpoints"),
            mgd.OutputFile(raw_breakpoints, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_raw_library',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("raw_library"),
            mgd.OutputFile(raw_library, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func="wgs.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("breakpoints"),
            mgd.OutputFile(breakpoints, extensions=['.yaml']),
        ),
    )

    workflow.transform(name='finalize_library',
                       ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
                       func="wgs.utils.csvutils.finalize_csv",
                       args=(
                           mgd.TempInputFile("library"),
                           mgd.OutputFile(library, extensions=['.yaml']),
                       ))

    return workflow
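One detail worth noting in this example is the get_destruct_config transform: instead of writing a file, its task returns a Python object, which pypeliner stores because the transform declares ret=mgd.TempOutputObj("destruct_config"); downstream jobs then receive that object again through mgd.TempInputObj("destruct_config"). A minimal sketch of the pattern in isolation (the module path, task names, and paths are hypothetical):

import pypeliner.workflow
import pypeliner.managed as mgd

workflow = pypeliner.workflow.Workflow()

# the task behind this transform returns a plain dict; because the transform declares
# ret=mgd.TempOutputObj(...), pypeliner stores that return value between jobs
workflow.transform(
    name='build_config',
    func='mypackage.tasks.build_config',   # hypothetical task that returns a dict
    ret=mgd.TempOutputObj('tool_config'),
    args=('/refdata/reference.fa',),       # hypothetical argument
)

# downstream jobs receive the stored value as an ordinary Python object
workflow.transform(
    name='use_config',
    func='mypackage.tasks.run_tool',       # hypothetical consumer task
    args=(mgd.TempInputObj('tool_config'), mgd.OutputFile('/results/out.txt')),
)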
Example No. 5
def create_hmmcopy_workflow(
    bam_file,
    sample_id,
    bias_pdf,
    correction_pdf,
    hmmcopy_pdf,
    hmmcopy_table,
    pygenes_table,
    chromosomes,
    map_wig,
    gc_wig,
    pygenes_gtf,
):

    cn_params = config.default_params()['copynumber_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='hmmcopy_readcounter',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='2:00',
                       ),
                       func='wgs.workflows.hmmcopy.tasks.hmmcopy_readcounter',
                       args=(
                           mgd.InputFile(bam_file, extensions=['.bai']),
                           mgd.TempOutputFile('infile.wig'),
                           chromosomes,
                           cn_params['readcounter'],
                       ))

    workflow.transform(name='calc_corr',
                       func='wgs.workflows.hmmcopy.tasks.calc_corr',
                       args=(
                           mgd.TempInputFile('infile.wig'),
                           mgd.TempOutputFile('infile_copy.txt'),
                           mgd.TempOutputFile('infile_copy.obj'),
                           gc_wig,
                           map_wig,
                           cn_params['map_cutoff'],
                       ),
                       kwargs={'docker_image': config.containers('hmmcopy')})

    workflow.transform(name='run_hmmcopy',
                       func='wgs.workflows.hmmcopy.tasks.run_hmmcopy',
                       args=(
                           mgd.TempInputFile('infile_copy.obj'),
                           mgd.TempInputFile('infile_copy.txt'),
                           mgd.TempOutputFile('hmmcopy_res.obj'),
                           mgd.TempOutputFile('hmmcopy_segments.txt'),
                           mgd.OutputFile(hmmcopy_table),
                           sample_id,
                           cn_params['hmmcopy_params'],
                       ),
                       kwargs={'docker_image': config.containers('hmmcopy')})

    workflow.transform(name='plot_hmm',
                       func='wgs.workflows.hmmcopy.tasks.plot_hmm',
                       args=(
                           mgd.TempInputFile('infile_copy.obj'),
                           mgd.TempInputFile('hmmcopy_res.obj'),
                           mgd.TempSpace('correction_plots_dir'),
                           mgd.TempSpace('hmmcopy_plots_dir'),
                           mgd.OutputFile(bias_pdf),
                           mgd.OutputFile(correction_pdf),
                           mgd.OutputFile(hmmcopy_pdf),
                       ),
                       kwargs={'docker_image': config.containers('hmmcopy')})

    workflow.transform(name='annot_hmm',
                       func='wgs.workflows.hmmcopy.tasks.annot_hmm',
                       args=(
                           mgd.TempInputFile('hmmcopy_segments.txt'),
                           mgd.OutputFile(pygenes_table),
                           pygenes_gtf,
                       ))

    return workflow
Example No. 6
def create_samtools_germline_workflow(germline_vcf,
                                      germline_roh,
                                      bam_file,
                                      reference,
                                      chromosomes,
                                      single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.samtools_germline.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline_one_job',
            args=(mgd.TempSpace("run_samtools_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(bam_file)),
            kwargs={
                'samtools_docker_image': config.containers('samtools'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline',
            args=(mgd.TempOutputFile('germline.vcf.gz', 'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(bam_file)),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('germline.vcf.gz', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(germline_vcf,
                                          extensions=['.tbi', '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='roh_calling',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.workflows.samtools_germline.tasks.roh_calling',
        args=(mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
              mgd.OutputFile(germline_roh)),
        kwargs={'docker_image': config.containers('vcftools')})

    return workflow
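The scatter/gather structure here hinges on the generate_intervals transform: its task returns a list, and because the transform declares ret=mgd.OutputChunks('interval'), each list element becomes one chunk of the 'interval' axis; the per-interval caller then runs once per chunk via axes=('interval',), and merge_vcfs gathers the results. A hypothetical, simplified stand-in for the interval generator (the real task presumably derives chromosome lengths from the reference index rather than the fixed length used here):

# Hypothetical, simplified stand-in for
# wgs.workflows.samtools_germline.tasks.generate_intervals. Whatever it returns must be
# a list: each element becomes one chunk of the 'interval' axis because the transform
# declares ret=mgd.OutputChunks('interval').
def generate_intervals(reference, chromosomes, size=1000000):
    # assumed "<chrom>_<start>_<end>" naming; chrom_length is a placeholder for the sketch
    chrom_length = 10 * size
    intervals = []
    for chrom in chromosomes:
        for start in range(1, chrom_length, size):
            intervals.append('{}_{}_{}'.format(chrom, start, start + size - 1))
    return intervals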
Example No. 7
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf,
        sample_id, reference, chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    cn_params = config.default_params('copynumber_calling')

    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00', ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00', ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00', ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
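Two pypeliner idioms carry most of the weight in this example. First, workflow.setobj with mgd.OutputChunks('numclusters', 'ploidy') turns the chunks list into a two-dimensional axis, so every transform declaring axes=('numclusters', 'ploidy') runs once per (num_clusters, ploidy) pair. Second, passing axes_origin=[] on the TempInputFile arguments of select_optimal_solution and tar_all_data keeps those jobs from being split over the axes, so each runs once and receives the full set of per-chunk files to pick from or archive. A small sketch of what chunks expands to, with a hypothetical titan_intervals:

# hypothetical titan_intervals; the real values come from
# config.default_params('copynumber_calling')['titan_intervals']
titan_intervals = [
    {'num_clusters': 1, 'ploidy': 2},
    {'num_clusters': 2, 'ploidy': 2},
    {'num_clusters': 1, 'ploidy': 4},
]
chunks = [(v['num_clusters'], v['ploidy']) for v in titan_intervals]
assert chunks == [(1, 2), (2, 2), (1, 4)]  # one (numclusters, ploidy) chunk per entry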
Example No. 8
def call_germlines_only(samples,
                        normals,
                        museq_ss_vcf,
                        samtools_germline_vcf,
                        roh_calls,
                        museq_single_pdf,
                        refdir,
                        single_node=False):
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}
    samtools_germline_vcf = {sampid: samtools_germline_vcf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf',
                           'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("roh_calls.csv", 'sample_id', fnames=roh_calls),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]), paths_refdir['reference'],
              chromosomes),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_germlines_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_ss_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("samtools_germlines_anno.vcf.gz",
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=samtools_germline_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    return workflow
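The per-sample outputs here use the fnames= form of mgd.InputFile/mgd.OutputFile: the first argument is only a logical name, the 'sample_id' axis identifies the chunk, and fnames maps each sample id to the real path on disk, which is why the function begins by rebuilding every argument as a sample-keyed dict. A hedged sketch of what a caller would pass in (sample ids and paths are hypothetical):

samples = ['SAMPLE_A', 'SAMPLE_B']

# hypothetical per-sample path dicts matching the fnames= arguments above
normals = {s: '/data/{}/normal.bam'.format(s) for s in samples}
museq_ss_vcf = {s: '/results/{}/museq_germlines_ann.vcf.gz'.format(s) for s in samples}
samtools_germline_vcf = {s: '/results/{}/samtools_germlines_anno.vcf.gz'.format(s) for s in samples}
roh_calls = {s: '/results/{}/roh_calls.csv'.format(s) for s in samples}
museq_single_pdf = {s: '/results/{}/museq_single.pdf'.format(s) for s in samples}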
Example No. 9
def call_variants(samples,
                  somatic_calls,
                  somatic_snpeff,
                  somatic_ma,
                  somatic_ids,
                  indel_calls,
                  indel_snpeff,
                  indel_ma,
                  indel_ids,
                  germline_calls,
                  germline_snpeff,
                  germline_ma,
                  germline_ids,
                  tumours,
                  normals,
                  museq_vcf,
                  museq_ss_vcf,
                  samtools_germlines_vcf,
                  roh_calls,
                  strelka_snv_vcf,
                  strelka_indel_vcf,
                  museq_paired_pdf,
                  museq_single_pdf,
                  refdir,
                  single_node=False,
                  is_exome=False):
    strelka_snv_vcf = {sampid: strelka_snv_vcf[sampid] for sampid in samples}
    strelka_indel_vcf = {sampid: strelka_indel_vcf[sampid] for sampid in samples}
    museq_vcf = {sampid: museq_vcf[sampid] for sampid in samples}
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    samtools_germlines_vcf = {sampid: samtools_germlines_vcf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    museq_paired_pdf = {sampid: museq_paired_pdf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}

    somatic_calls = {sampid: somatic_calls[sampid] for sampid in samples}
    somatic_snpeff = {sampid: somatic_snpeff[sampid] for sampid in samples}
    somatic_ma = {sampid: somatic_ma[sampid] for sampid in samples}
    somatic_ids = {sampid: somatic_ids[sampid] for sampid in samples}

    indel_calls = {sampid: indel_calls[sampid] for sampid in samples}
    indel_snpeff = {sampid: indel_snpeff[sampid] for sampid in samples}
    indel_ma = {sampid: indel_ma[sampid] for sampid in samples}
    indel_ids = {sampid: indel_ids[sampid] for sampid in samples}

    germline_calls = {sampid: germline_calls[sampid] for sampid in samples}
    germline_snpeff = {sampid: germline_snpeff[sampid] for sampid in samples}
    germline_ma = {sampid: germline_ma[sampid] for sampid in samples}
    germline_ids = {sampid: germline_ids[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("museq_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_paired_pdf',
                             'sample_id',
                             fnames=museq_paired_pdf),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'tumour_bam': mgd.InputFile(
                "tumour.bam", 'sample_id', fnames=tumours,
                extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_single_pdf',
                             'sample_id',
                             fnames=museq_single_pdf),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("roh_calls.csv.gz", 'sample_id',
                             fnames=roh_calls),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]), paths_refdir['reference'],
              chromosomes),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id', ),
        args=(mgd.InputFile('normal_bam',
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai']),
              mgd.InputFile('tumour_bam',
                            'sample_id',
                            fnames=tumours,
                            extensions=['.bai']),
              mgd.TempOutputFile('strelka_indel.vcf.gz', 'sample_id'),
              mgd.TempOutputFile('strelka_snv.vcf.gz', 'sample_id'),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    workflow.subworkflow(
        name="annotate_paired_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_snv_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_vcf), paths_refdir['snpeff_config'],
              paths_refdir['mutation_assessor'], paths_refdir['dbsnp'],
              paths_refdir['thousand_genomes'], paths_refdir['cosmic'],
              paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_germlines_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_ss_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("samtools_germlines_ann.vcf.gz",
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=samtools_germlines_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("strelka_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('strelka_snv_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=strelka_snv_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka_indel",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("strelka_indel.vcf.gz", 'sample_id'),
              mgd.OutputFile('strelka_indel_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=strelka_indel_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="consensus_calling",
        func='wgs.workflows.variant_calling_consensus.create_consensus_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile("museq_germlines_ann.vcf.gz",
                          'sample_id',
                          fnames=museq_ss_vcf),
            mgd.InputFile("museq_snv_ann.vcf.gz",
                          'sample_id',
                          fnames=museq_vcf),
            mgd.InputFile("strelka_snv_ann.vcf.gz",
                          'sample_id',
                          fnames=strelka_snv_vcf),
            mgd.InputFile("strelka_indel_ann.vcf.gz",
                          'sample_id',
                          fnames=strelka_indel_vcf),
            mgd.OutputFile('somatic_csv', 'sample_id', fnames=somatic_calls),
            mgd.OutputFile('somatic_snpeff',
                           'sample_id',
                           fnames=somatic_snpeff),
            mgd.OutputFile('somatic_ma', 'sample_id', fnames=somatic_ma),
            mgd.OutputFile('somatic_ids', 'sample_id', fnames=somatic_ids),
            mgd.OutputFile('indel_csv', 'sample_id', fnames=indel_calls),
            mgd.OutputFile('indel_snpeff', 'sample_id', fnames=indel_snpeff),
            mgd.OutputFile('indel_ma', 'sample_id', fnames=indel_ma),
            mgd.OutputFile('indel_ids', 'sample_id', fnames=indel_ids),
            mgd.OutputFile('germline_csv', 'sample_id', fnames=germline_calls),
            mgd.OutputFile('germline_snpeff',
                           'sample_id',
                           fnames=germline_snpeff),
            mgd.OutputFile('germline_ma', 'sample_id', fnames=germline_ma),
            mgd.OutputFile('germline_ids', 'sample_id', fnames=germline_ids),
            refdir,
        ),
    )

    return workflow
Example No. 10
def collect_bam_metrics(
        bam, markdups_metrics, sample_id, refdir,
        metrics, picard_insert_metrics, picard_insert_pdf,
        flagstat_metrics, picard_gc_metrics, picard_gc_summary,
        picard_gc_pdf, picard_wgs_metrics, bam_tdf
):
    '''
    Collects bam metrics for a single sample:
    1. picard insert metrics
    2. picard GC metrics
    3. picard wgs metrics
    4. samtools flagstat and igvtools tdf coverage track

    :param bam: input bam to compute metrics on
    :param markdups_metrics: mark-duplicates metrics file produced during alignment
    :param sample_id: sample identifier
    :param refdir: reference data directory
    :param metrics: output csv of collated metrics
    :param bam_tdf: output tdf coverage track for viewing in IGV
    '''

    ref_genome = config.refdir_data(refdir)['paths']['reference']

    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']

    reftype = config.refdir_data(refdir)['params']['reference_type']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='72:00',
            disk=400
        ),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={
            'picard_docker': config.containers('picard'),
            'samtools_docker': config.containers('samtools'),
            'mem': '8G'
        }
    )

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='72:00',
            disk=400
        ),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_gc_metrics),
            mgd.OutputFile(picard_gc_summary),
            mgd.OutputFile(picard_gc_pdf),
            mgd.TempSpace('picard_gc')
        ),
        kwargs={'docker_image': config.containers('picard'), 'mem': '8G'}
    )

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='72:00',
            disk=400
        ),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_wgs_metrics),
            picard_wgs_params,
            mgd.TempSpace('picard_wgs')
        ),
        kwargs={'docker_image': config.containers('picard'), 'mem': '8G'}
    )

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
            # docker_image=config.containers('igvtools')
        ),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(
            pypeliner.managed.InputFile(bam),
            pypeliner.managed.OutputFile(bam_tdf),
            reftype
        ),
        kwargs={'docker_image': config.containers('igvtools')}

    )

    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00',
            disk=400
        ),
        args=(
            mgd.InputFile(flagstat_metrics),
            mgd.InputFile(picard_insert_metrics),
            mgd.InputFile(picard_wgs_metrics),
            mgd.InputFile(markdups_metrics),
            mgd.OutputFile(metrics, extensions=['.yaml']),
            sample_id
        ),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics']
        }
    )

    return workflow
Example No. 11
def align_sample_split(fastq_1, fastq_2, out_file, samtools_flagstat, sample_id, lane_id, sample_info, refdir):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    split_size = config.default_params('alignment')['split_size']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_2),
            pypeliner.managed.TempOutputFile('read_2', 'split', axes_origin=[]),
            split_size,
        ),
    )

    workflow.transform(
        name='align_bwa_mem',
        axes=('split',),
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='16:00',
            ncpus=8,
        ),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.TempInputFile('read_1', 'split'),
            pypeliner.managed.TempInputFile('read_2', 'split'),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
            'docker_image': config.containers('bwa')
        }
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
        kwargs={
            'docker_image': config.containers('samtools'),
        }
    )

    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='72:00',
        ),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'picard_docker_image': config.containers('picard'),
            'samtools_docker_image': config.containers('samtools')
        }
    )

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
            docker_image=config.containers('samtools')
        ),
        args=(
            'samtools',
            'index',
            pypeliner.managed.InputFile(out_file),
            pypeliner.managed.OutputFile(out_bai)
        ),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
            docker_image=config.containers('samtools')
        ),
        args=(
            'samtools',
            'flagstat',
            pypeliner.managed.InputFile(out_file),
            '>',
            pypeliner.managed.OutputFile(samtools_flagstat)
        ),
    )

    return workflow
Example No. 12
def align_sample_no_split(fastq_1, fastq_2, out_file, samtools_flagstat, sample_id, lane_id, sample_info, refdir):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='align_bwa_mem',
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='48:00',
            ncpus='8',
            disk=300
        ),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.InputFile(fastq_2),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
            'docker_image': config.containers('bwa')
        }
    )

    workflow.transform(
        name='sort',
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='48:00',
            ncpus='8',
            disk=300
        ),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            pypeliner.managed.TempInputFile('aligned.bam'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'docker_image': config.containers('picard'),
            'threads': '8',
        }
    )

    workflow.transform(
        name='index_and_flagstat',
        func='wgs.workflows.alignment.tasks.index_and_flagstat',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
            disk=200
        ),
        args=(
            pypeliner.managed.InputFile(out_file),
            pypeliner.managed.OutputFile(out_bai),
            pypeliner.managed.OutputFile(samtools_flagstat)
        ),
        kwargs={'docker_image': config.containers('samtools')}
    )

    return workflow
Example No. 13
def align_samples(
        fastqs_r1,
        fastqs_r2,
        bam_outputs,
        metrics_outputs,
        metrics_tar,
        bam_tdf,
        sample_info,
        refdir,
        single_node=False
):
    if single_node:
        align_func = align_sample_no_split
    else:
        align_func = align_sample_split

    if not isinstance(bam_outputs, dict):
        samples = sorted(set([v[0] for v in fastqs_r1.keys()]))
        bam_outputs = {sample: bam_outputs[sample] for sample in samples}
        metrics_outputs = {sample: metrics_outputs[sample] for sample in samples}
        metrics_tar = {sample: metrics_tar[sample] for sample in samples}
        bam_tdf = {sample: bam_tdf[sample] for sample in samples}


    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'sample_id', axes_origin=[]),
        value=sample_info
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(
        name='fastqc_workflow',
        func=fastqc_workflow,
        axes=('sample_id', 'lane_id'),
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2),
            mgd.TempOutputFile('fastqc_R1.html', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R1.pdf', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R2.html', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('fastqc_R2.pdf', 'sample_id', 'lane_id'),
        )
    )

    workflow.subworkflow(
        name='align_samples',
        func=align_func,
        axes=('sample_id', 'lane_id'),
        args=(
            mgd.InputFile('input.r1.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r1),
            mgd.InputFile('input.r2.fastq.gz', 'sample_id', 'lane_id', fnames=fastqs_r2),
            mgd.TempOutputFile('aligned_lanes.bam', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('samtools_flagstat.txt', 'sample_id', 'lane_id'),
            mgd.InputInstance("sample_id"),
            mgd.InputInstance("lane_id"),
            mgd.TempInputObj('sampleinfo', 'sample_id'),
            refdir
        )
    )

    workflow.transform(
        name='merge_tumour_lanes',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='24:00',
            disk=400
        ),
        func="wgs.workflows.alignment.tasks.merge_bams",
        axes=('sample_id',),
        args=(
            mgd.TempInputFile('aligned_lanes.bam', 'sample_id', 'lane_id'),
            mgd.TempOutputFile('merged_lanes.bam', 'sample_id', extensions=['.bai']),
        ),
        kwargs={
            'picard_docker_image': config.containers('picard'),
            'samtools_docker_image': config.containers('samtools')
        }
    )

    workflow.transform(
        name='markdups_reheader',
        ctx=helpers.get_default_ctx(
            memory=12,
            walltime='24:00',
            ncpus=1,
            disk=300
        ),
        func='wgs.workflows.alignment.tasks.markdups',
        axes=('sample_id',),
        args=(
            mgd.TempInputFile('merged_lanes.bam', 'sample_id', extensions=['.bai']),
            mgd.OutputFile('markdups.bam', 'sample_id', fnames=bam_outputs, extensions=['.bai']),
            mgd.TempOutputFile('markdups_metrics', 'sample_id'),
            pypeliner.managed.TempSpace("temp_markdups", "sample_id"),
        ),
        kwargs={
            'picard_docker': config.containers('picard'),
            'samtools_docker': config.containers('samtools'),
            'mem': '8G',
            'reheader': True,
        }
    )

    workflow.subworkflow(
        name='metrics',
        func=collect_bam_metrics,
        axes=('sample_id',),
        args=(
            mgd.InputFile('markdups.bam', 'sample_id', fnames=bam_outputs, extensions=['.bai']),
            mgd.TempInputFile('markdups_metrics', 'sample_id'),
            mgd.InputInstance('sample_id'),
            refdir,
            mgd.OutputFile('metrics_output', 'sample_id', fnames=metrics_outputs, extensions=['.yaml']),
            mgd.TempOutputFile('picard_insert_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_insert_metrics.pdf', 'sample_id'),
            mgd.TempOutputFile('flagstat_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc_metrics.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc_summary.txt', 'sample_id'),
            mgd.TempOutputFile('picard_gc.pdf', 'sample_id'),
            mgd.TempOutputFile('picard_wgs_metrics.txt', 'sample_id'),
            mgd.OutputFile('out.bam.tdf', 'sample_id', fnames=bam_tdf),
        )
    )

    workflow.transform(
        name='tar',
        func='wgs.utils.helpers.make_tar_from_files',
        axes=('sample_id',),
        args=(
            mgd.OutputFile('metrics_tar', 'sample_id', fnames=metrics_tar),
            [
                mgd.TempInputFile('picard_insert_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_insert_metrics.pdf', 'sample_id'),
                mgd.TempInputFile('flagstat_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc_metrics.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc_summary.txt', 'sample_id'),
                mgd.TempInputFile('picard_gc.pdf', 'sample_id'),
                mgd.TempInputFile('picard_wgs_metrics.txt', 'sample_id'),
                mgd.TempInputFile('markdups_metrics', 'sample_id'),
                mgd.TempInputFile('fastqc_R1.html', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R1.pdf', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R2.html', 'sample_id', 'lane_id'),
                mgd.TempInputFile('fastqc_R2.pdf', 'sample_id', 'lane_id'),
            ],
            mgd.TempSpace('wgs_metrics')
        )
    )
    return workflow
Ejemplo n.º 14
0
def breakpoint_calling_workflow(args):
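    """Entry point for breakpoint (structural variant) calling.

    For each tumour/normal pair in the input yaml, runs destruct and lumpy,
    generates a filtered consensus call set, and writes pipeline metadata to
    the output directory.
    """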
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir, '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir, '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, '{sample_id}_filtered_consensus_calls.csv.gz')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs'))
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads),
            mgd.InputInstance('sample_id'),
            refdir_paths['reference'],
            refdir_paths['refdata_destruct'],
            refdir_paths['gtf'],
            refdir_paths['blacklist_destruct']
        ),
        kwargs={'single_node': single_node}
    )

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id',),
        args=(
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
        ),
        kwargs={
            'tumour_bam': mgd.InputFile(
                "tumour.bam", 'sample_id', fnames=tumours,
                extensions=['.bai'], axes_origin=[]),
            'normal_bam': mgd.InputFile(
                "normal.bam", 'sample_id', fnames=normals,
                extensions=['.bai'], axes_origin=[]),
            'single_node': single_node
        },
    )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints),
            mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
            mgd.OutputFile('consensus_calls', 'sample_id', template=parsed_csv),
            chromosomes
        ),
    )

    filenames = [
        destruct_breakpoints,
        destruct_library,
        destruct_raw_breakpoints,
        destruct_reads,
        lumpy_vcf,
        parsed_csv
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func=helpers.generate_and_upload_metadata,
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'breakpoint_calling'}
        }
    )

    pyp.run(workflow)
Ejemplo n.º 15
0
def postprocessing_workflow(args):
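    """Entry point for per-sample postprocessing plots.

    Loads per-sample bams and upstream results (titan, remixt, breakpoint
    consensus, ROH, germline and somatic calls) from the input yaml, then runs
    the postprocessing subworkflow to generate circos and genome-wide plots
    for each sample.
    """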

    yamldata = yaml.safe_load(open(args['input_yaml']))

    samples = list(yamldata.keys())

    normals = {sample: yamldata[sample]['normal_bam'] for sample in samples}
    tumours = {sample: yamldata[sample]['tumour_bam'] for sample in samples}

    titan = {sample: yamldata[sample]['titan'] for sample in samples}
    remixt = {sample: yamldata[sample]['remixt'] for sample in samples}
    breakpoints_consensus = {
        sample: yamldata[sample]['breakpoints_consensus']
        for sample in samples
    }
    roh = {sample: yamldata[sample]['roh'] for sample in samples}
    germline_calls = {
        sample: yamldata[sample]['germline_calls']
        for sample in samples
    }
    somatic_calls = {
        sample: yamldata[sample]['somatic_calls']
        for sample in samples
    }

    out_dir = args['out_dir']

    meta_yaml = os.path.join(out_dir, 'pipeline_metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    circos_plot_remixt = os.path.join(out_dir, '{sample_id}',
                                      '{sample_id}_circos_remixt.pdf')
    circos_plot_titan = os.path.join(out_dir, '{sample_id}',
                                     '{sample_id}_circos_titan.pdf')

    genome_wide_plot = os.path.join(out_dir, '{sample_id}',
                                    '{sample_id}_genome_wide.pdf')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(name="postprocessing",
                         func=postprocessing.create_postprocessing_workflow,
                         ctx=helpers.get_default_ctx(),
                         axes=('sample_id', ),
                         args=(
                             mgd.InputFile('normal.bam',
                                           'sample_id',
                                           fnames=normals),
                             mgd.InputFile('tumour.bam',
                                           'sample_id',
                                           fnames=tumours),
                             titan,
                             remixt,
                             breakpoints_consensus,
                             roh,
                             germline_calls,
                             somatic_calls,
                             mgd.OutputFile('circos_plot_remixt.pdf',
                                            'sample_id',
                                            template=circos_plot_remixt),
                             mgd.OutputFile('circos_plot_titan.pdf',
                                            'sample_id',
                                            template=circos_plot_titan),
                             mgd.OutputFile('genome_wide_plot.pdf',
                                            'sample_id',
                                            template=genome_wide_plot),
                             args['refdir'],
                             mgd.InputInstance('sample_id'),
                         ),
                         kwargs={'single_node': args['single_node']})

    outputted_filenames = helpers.expand_list(
        [circos_plot_remixt, circos_plot_titan, genome_wide_plot], samples,
        "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'postprocessing'
                           }
                       })

    pyp.run(workflow)
Ejemplo n.º 16
0
def lumpy_preprocess_workflow(bamfile,
                              discordants_sorted_bam,
                              splitters_sorted_bam,
                              single_node=False):
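    """Produce the sorted discordant-read and split-read bams needed by lumpy.

    With single_node the whole preprocessing runs as one task; otherwise the
    discordant-read extraction, split-read extraction and sorting run as
    separate samtools/lumpy tasks.
    """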
    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='run_lumpy_preprocess',
            ctx=helpers.get_default_ctx(memory=10, walltime='96:00', disk=300),
            func='wgs.workflows.lumpy.tasks.run_lumpy_preprocess',
            args=(mgd.InputFile(bamfile),
                  mgd.OutputFile(discordants_sorted_bam),
                  mgd.OutputFile(splitters_sorted_bam),
                  mgd.TempSpace("lumpy_preprocess_temp"),
                  config.default_params('breakpoint_calling')['lumpy_paths']),
            kwargs={
                'lumpy_docker_image': config.containers('lumpy'),
                'samtools_docker_image': config.containers('samtools')
            })
    else:
        workflow.transform(
            name='run_samtools_view_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_view',
            args=(
                mgd.InputFile(bamfile),
                mgd.TempOutputFile('normal.discordants.unsorted.bam'),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='run_lumpy_extract_split_reads_bwamem_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_lumpy_extract_split_reads_bwamem',
            args=(mgd.InputFile(bamfile),
                  mgd.TempOutputFile('normal.splitters.unsorted.bam'),
                  config.default_params('breakpoint_calling')['lumpy_paths']),
            kwargs={'docker_image': config.containers('lumpy')})

        workflow.transform(
            name='run_samtools_sort_discordants_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.discordants.unsorted.bam'),
                mgd.OutputFile(discordants_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='run_samtools_sort_splitters_normal',
            ctx=helpers.get_default_ctx(
                memory=10,
                walltime='24:00',
            ),
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            args=(
                mgd.TempInputFile('normal.splitters.unsorted.bam'),
                mgd.OutputFile(splitters_sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')})

    return workflow
Ejemplo n.º 17
0
def copynumber_calling_workflow(args):
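    """Entry point for copy number calling with remixt, titan and hmmcopy.

    If none of the three callers is requested in args, all of them are run.
    Per-sample outputs are written under the copynumber and remixt
    subdirectories of the output directory, and run metadata is generated for
    the titan and hmmcopy results.
    """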
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    if not run_hmmcopy and not run_titan and not run_remixt:
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')

    titan_outfile = os.path.join(titan_raw_dir,
                                 '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir,
                                  '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir,
                                     '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(
        hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(
        hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')

    remixt_brk_cn_csv = os.path.join(remixt_outdir,
                                     '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir,
                                  '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir,
                                    '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam",
                              'sample_id',
                              fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam",
                              'sample_id',
                              fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5',
                               'sample_id',
                               template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv',
                               'sample_id',
                               template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv',
                               'sample_id',
                               template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv',
                               'sample_id',
                               template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv',
                               'sample_id',
                               template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv',
                               'sample_id',
                               template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv',
                               'sample_id',
                               template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_titan:
        workflow.subworkflow(name='titan',
                             func=titan.create_titan_workflow,
                             axes=('sample_id', ),
                             args=(
                                 mgd.InputFile("tumour.bam",
                                               'sample_id',
                                               fnames=tumours,
                                               extensions=['.bai']),
                                 mgd.InputFile("normal.bam",
                                               'sample_id',
                                               fnames=normals,
                                               extensions=['.bai']),
                                 mgd.InputFile("target_list",
                                               'sample_id',
                                               fnames=targets),
                                 mgd.OutputFile('outfile',
                                                'sample_id',
                                                template=titan_outfile),
                                 mgd.OutputFile('params',
                                                'sample_id',
                                                template=titan_params),
                                 mgd.OutputFile('segs',
                                                'sample_id',
                                                template=titan_segs),
                                 mgd.OutputFile('igv_segs',
                                                'sample_id',
                                                template=titan_igv_segs),
                                 mgd.OutputFile('parsed',
                                                'sample_id',
                                                template=titan_parsed),
                                 mgd.OutputFile('plots',
                                                'sample_id',
                                                template=titan_plots),
                                 mgd.OutputFile('tar_outputs',
                                                'sample_id',
                                                template=titan_tar_outputs),
                                 mgd.OutputFile('museq.vcf',
                                                'sample_id',
                                                template=museq_vcf),
                                 mgd.InputInstance('sample_id'),
                                 refdir_paths['reference'],
                                 chromosomes,
                                 refdir_paths['het_positions_titan'],
                                 refdir_paths['map_wig'],
                                 refdir_paths['gc_wig'],
                                 refdir_paths['gtf'],
                             ),
                             kwargs={'single_node': args['single_node']})

    if run_hmmcopy:
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("normal.bam",
                                'sample_id',
                                fnames=normals,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('normal_bias',
                                 'sample_id',
                                 template=normal_bias_pdf),
                  mgd.OutputFile('normal_correction',
                                 'sample_id',
                                 template=normal_correction_pdf),
                  mgd.OutputFile('normal_hmmcopy',
                                 'sample_id',
                                 template=normal_hmmcopy_pdf),
                  mgd.OutputFile('normal_correction_table',
                                 'sample_id',
                                 template=normal_correction_table),
                  mgd.OutputFile('normal_pygenes',
                                 'sample_id',
                                 template=normal_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("tumour.bam",
                                'sample_id',
                                fnames=tumours,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('tumour_bias',
                                 'sample_id',
                                 template=tumour_bias_pdf),
                  mgd.OutputFile('tumour_correction',
                                 'sample_id',
                                 template=tumour_correction_pdf),
                  mgd.OutputFile('tumour_hmmcopy',
                                 'sample_id',
                                 template=tumour_hmmcopy_pdf),
                  mgd.OutputFile('tumour_correction_table',
                                 'sample_id',
                                 template=tumour_correction_table),
                  mgd.OutputFile('tumour_pygenes',
                                 'sample_id',
                                 template=tumour_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf, normal_correction_pdf, normal_hmmcopy_pdf,
            normal_correction_table, normal_pygenes, tumour_bias_pdf,
            tumour_correction_pdf, tumour_hmmcopy_pdf, tumour_correction_table,
            tumour_pygenes
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'copynumber_calling'
                           }
                       })

    pyp.run(workflow)
Ejemplo n.º 18
0
def get_coverage_data(input_bam, output, refdir, single_node=False):
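    """Compute per-interval read coverage for a bam over the reference chromosomes.

    Builds coverage bed intervals from the reference chromosome sizes and runs
    samtools coverage over them; in the parallel (per-chromosome) mode the
    results are concatenated into the output file.
    """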
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    chrom_sizes = config.refdir_data(refdir)['paths']['chrom_sizes']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )

    else:

        workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                # mgd.InputInstance('chromosome'),
                # refdir_paths['reference'],
            ),
            kwargs={'docker_image': config.containers('samtools')})
        workflow.transform(name='merge_data',
                           func='wgs.utils.csvutils.concatenate_csv',
                           ctx=helpers.get_default_ctx(memory=5),
                           args=(
                               mgd.TempInputFile('per_interval.txt',
                                                 'chromosome',
                                                 axes_origin=[]),
                               mgd.OutputFile(output),
                           ))

    return workflow
Ejemplo n.º 19
0
def variant_calling_workflow(args):
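    """Entry point for somatic and germline variant calling.

    If any sample is missing a tumour bam, only germline calling is run
    (museq single sample, samtools germline and ROH); otherwise the full
    somatic plus germline calling workflow runs and metadata is generated for
    the outputs.
    """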
    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    var_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_museq_single_annotated.vcf.gz')

    samtools_germline_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_samtools_germline.vcf.gz')
    samtools_roh = os.path.join(var_dir, '{sample_id}', '{sample_id}_roh.csv')

    strelka_snv_vcf = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(var_dir, '{sample_id}',
                                    '{sample_id}_single_museqportrait.pdf')

    somatic_csv = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_consensus_somatic.csv.gz')
    somatic_snpeff = os.path.join(
        var_dir, '{sample_id}', '{sample_id}_consensus_somatic_snpeff.csv.gz')
    somatic_ma = os.path.join(var_dir, '{sample_id}',
                              '{sample_id}_consensus_somatic_ma.csv.gz')
    somatic_ids = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_consensus_somatic_ids.csv.gz')

    indel_csv = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_indel.csv.gz')
    indel_snpeff = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_indel_snpeff.csv.gz')
    indel_ma = os.path.join(var_dir, '{sample_id}',
                            '{sample_id}_indel_ma.csv.gz')
    indel_ids = os.path.join(var_dir, '{sample_id}',
                             '{sample_id}_indel_ids.csv.gz')

    germline_csv = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_germline.csv.gz')
    germline_snpeff = os.path.join(var_dir, '{sample_id}',
                                   '{sample_id}_germline_snpeff.csv.gz')
    germline_ma = os.path.join(var_dir, '{sample_id}',
                               '{sample_id}_germline_ma.csv.gz')
    germline_ids = os.path.join(var_dir, '{sample_id}',
                                '{sample_id}_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if not all(tumours.values()):
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=(samples,
                  mgd.InputFile("normal.bam",
                                'sample_id',
                                fnames=normals,
                                extensions=['.bai'],
                                axes_origin=[]),
                  mgd.OutputFile('museq_ss',
                                 'sample_id',
                                 template=museq_ss_vcf,
                                 axes_origin=[]),
                  mgd.OutputFile('samtools_germline',
                                 'sample_id',
                                 template=samtools_germline_vcf,
                                 axes_origin=[]),
                  mgd.OutputFile('samtools_roh',
                                 'sample_id',
                                 template=samtools_roh,
                                 axes_origin=[]),
                  mgd.OutputFile('museq_single_pdf',
                                 'sample_id',
                                 template=museq_single_pdf,
                                 axes_origin=[]), args['refdir']),
            kwargs={'single_node': args['single_node']})
    else:
        workflow.subworkflow(name='variant_calling',
                             func=call_variants,
                             args=(
                                 samples,
                                 mgd.OutputFile('somatic_csv',
                                                'sample_id',
                                                template=somatic_csv,
                                                axes_origin=[]),
                                 mgd.OutputFile('somatic_snpeff',
                                                'sample_id',
                                                template=somatic_snpeff,
                                                axes_origin=[]),
                                 mgd.OutputFile('somatic_ma',
                                                'sample_id',
                                                template=somatic_ma,
                                                axes_origin=[]),
                                 mgd.OutputFile('somatic_ids',
                                                'sample_id',
                                                template=somatic_ids,
                                                axes_origin=[]),
                                 mgd.OutputFile('indel_csv',
                                                'sample_id',
                                                template=indel_csv,
                                                axes_origin=[]),
                                 mgd.OutputFile('indel_snpeff',
                                                'sample_id',
                                                template=indel_snpeff,
                                                axes_origin=[]),
                                 mgd.OutputFile('indel_ma',
                                                'sample_id',
                                                template=indel_ma,
                                                axes_origin=[]),
                                 mgd.OutputFile('indel_ids',
                                                'sample_id',
                                                template=indel_ids,
                                                axes_origin=[]),
                                 mgd.OutputFile('germline_csv',
                                                'sample_id',
                                                template=germline_csv,
                                                axes_origin=[]),
                                 mgd.OutputFile('germline_snpeff',
                                                'sample_id',
                                                template=germline_snpeff,
                                                axes_origin=[]),
                                 mgd.OutputFile('germline_ma',
                                                'sample_id',
                                                template=germline_ma,
                                                axes_origin=[]),
                                 mgd.OutputFile('germline_ids',
                                                'sample_id',
                                                template=germline_ids,
                                                axes_origin=[]),
                                 mgd.InputFile("tumour.bam",
                                               'sample_id',
                                               fnames=tumours,
                                               extensions=['.bai'],
                                               axes_origin=[]),
                                 mgd.InputFile("normal.bam",
                                               'sample_id',
                                               fnames=normals,
                                               extensions=['.bai'],
                                               axes_origin=[]),
                                 mgd.OutputFile('museq',
                                                'sample_id',
                                                template=museq_vcf,
                                                axes_origin=[]),
                                 mgd.OutputFile('museq_ss',
                                                'sample_id',
                                                template=museq_ss_vcf,
                                                axes_origin=[]),
                                 mgd.OutputFile('samtools_germline',
                                                'sample_id',
                                                template=samtools_germline_vcf,
                                                axes_origin=[]),
                                 mgd.OutputFile('roh_calls',
                                                'sample_id',
                                                template=samtools_roh,
                                                axes_origin=[]),
                                 mgd.OutputFile('strelka_snv',
                                                'sample_id',
                                                template=strelka_snv_vcf,
                                                axes_origin=[]),
                                 mgd.OutputFile('strelka_indel',
                                                'sample_id',
                                                template=strelka_indel_vcf,
                                                axes_origin=[]),
                                 mgd.OutputFile('museq_paired_pdf',
                                                'sample_id',
                                                template=museq_paired_pdf,
                                                axes_origin=[]),
                                 mgd.OutputFile('museq_single_pdf',
                                                'sample_id',
                                                template=museq_single_pdf,
                                                axes_origin=[]),
                                 args['refdir'],
                             ),
                             kwargs={
                                 'single_node': args['single_node'],
                                 'is_exome': args['is_exome'],
                             })

        filenames = [
            somatic_csv, somatic_snpeff, somatic_ma, somatic_ids, indel_csv,
            indel_snpeff, indel_ma, indel_ids, germline_csv, germline_snpeff,
            germline_ma, germline_ids, museq_vcf, museq_ss_vcf,
            strelka_snv_vcf, strelka_indel_vcf, museq_paired_pdf,
            museq_single_pdf
        ]

        outputted_filenames = helpers.expand_list(filenames, samples,
                                                  "sample_id")

        workflow.transform(
            name='generate_meta_files_results',
            func='wgs.utils.helpers.generate_and_upload_metadata',
            args=(sys.argv[0:], args['out_dir'], outputted_filenames,
                  mgd.OutputFile(meta_yaml)),
            kwargs={
                'input_yaml_data': helpers.load_yaml(args['input_yaml']),
                'input_yaml': mgd.OutputFile(input_yaml_blob),
                'metadata': {
                    'type': 'variant_calling'
                }
            })

    pyp.run(workflow)
Ejemplo n.º 20
0
def create_postprocessing_workflow(normal_bam,
                                   tumour_bam,
                                   titan,
                                   remixt,
                                   breakpoints_consensus,
                                   roh,
                                   germline_calls,
                                   somatic_calls,
                                   circos_plot_remixt,
                                   circos_plot_titan,
                                   genome_wide_plot,
                                   refdir,
                                   sample_id,
                                   single_node=False):
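    """Build the per-sample postprocessing workflow.

    Computes normal and tumour coverage, parses ROH calls and generates the
    circos and genome-wide plots; remixt results are included in the plots
    when remixt calls are available for the sample.
    """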

    refdir_paths = config.refdir_data(refdir)['paths']
    refdir_params = config.refdir_data(refdir)['params']

    ideogram = refdir_paths["ideogram"]

    titan_calls = titan[sample_id]
    remixt_calls = remixt[sample_id]
    sv_calls = breakpoints_consensus[sample_id]
    roh_calls = roh[sample_id]
    germline_vcf = germline_calls[sample_id]
    somatic_calls = somatic_calls[sample_id]
    chromosomes = refdir_params['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(name='coverage_normal_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(normal_bam),
                             mgd.TempOutputFile('normal_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.subworkflow(name='coverage_tumour_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(tumour_bam),
                             mgd.TempOutputFile('tumour_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.transform(
        name='parse_roh',
        ctx=helpers.get_default_ctx(memory=5),
        func="wgs.workflows.postprocessing.tasks.parse_roh",
        args=(
            mgd.InputFile(roh_calls),
            mgd.TempOutputFile("ROH_parsed"),
        ),
    )

    if remixt_calls:

        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
            kwargs={
                "remixt": mgd.InputFile(remixt_calls),
                "remixt_label": sample_id
            })
        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={
                'docker_image': config.containers('circos'),
                'remixt_calls': mgd.InputFile(remixt_calls)
            },
        )
    else:

        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
        )

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.OutputFile(circos_plot_remixt),
                mgd.OutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={'docker_image': config.containers('circos')})

    return workflow
Ejemplo n.º 21
0
def create_museq_workflow(snv_vcf,
                          museqportrait_pdf,
                          reference,
                          chromosomes,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
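    """Build a mutationseq (museq) SNV calling workflow.

    Runs museq over genome intervals (as one multi-core job when single_node
    is set), merges and finalises the vcf and generates a museqportrait pdf.
    Either or both of the tumour and normal bams may be provided; single
    sample mode is used unless both are given.
    """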
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = False if name == 'run_museq_tumour_normal' else True

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(memory=15,
                                                       walltime='48:00',
                                                       ncpus='8',
                                                       disk=600),
                           func='wgs.utils.museq_utils.run_museq_one_job',
                           args=(
                               mgd.TempSpace("run_museq_temp"),
                               mgd.TempOutputFile('merged.vcf'),
                               reference,
                               mgd.InputChunks('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam':
                               tumour_bam,
                               'normal_bam':
                               normal_bam,
                               'museq_docker_image':
                               config.containers('mutationseq'),
                               'vcftools_docker_image':
                               config.containers('vcftools')
                           })
    else:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(
                               memory=15,
                               walltime='24:00',
                           ),
                           axes=('interval', ),
                           func='wgs.utils.museq_utils.run_museq',
                           args=(
                               mgd.TempOutputFile('museq.vcf', 'interval'),
                               mgd.TempOutputFile('museq.log', 'interval'),
                               reference,
                               mgd.InputInstance('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'docker_image':
                               config.containers('mutationseq'),
                           })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(snv_vcf, extensions=['.tbi',
                                                               '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
Ejemplo n.º 22
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            reference,
                            chromosomes,
                            single_node=False,
                            is_exome=False):
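    """Build a strelka somatic SNV and indel calling workflow.

    Splits the genome into regions and runs strelka per region (or as a single
    job when single_node is set), merges the per-region vcfs and writes the
    filtered indel and snv vcfs.
    """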
    params = config.default_params('variant_calling')

    workflow = Workflow(ctx=helpers.get_default_ctx(memory=5,
                                                    walltime='4:00'), )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
        kwargs={'docker_image': config.containers('strelka')})

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(name='strelka_one_node',
                           func="wgs.workflows.strelka.tasks.strelka_one_node",
                           args=(
                               pypeliner.managed.InputFile(normal_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               pypeliner.managed.InputFile(tumour_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               reference,
                               mgd.TempOutputFile('indels.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempOutputFile('snvs.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempSpace('call_genome_segment_tmp'),
                               mgd.InputChunks('regions'),
                               mgd.TempInputObj('known_sizes'),
                           ),
                           kwargs={
                               'is_exome':
                               is_exome,
                               'strelka_docker_image':
                               config.containers('strelka'),
                               'vcftools_docker_image':
                               config.containers('vcftools')
                           })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
            kwargs={'docker_image': config.containers('strelka')},
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # honour the caller's is_exome flag instead of hardcoding False
                'is_exome': is_exome,
                'docker_image': config.containers('strelka')
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
            kwargs={'docker_image': config.containers('vcftools')})

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('indels.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('snvs.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')})

    return workflow
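
The region scatter/gather used above is generic pypeliner machinery: generate_intervals returns a list of region strings that becomes the 'regions' axis, each per-region transform declares axes=('regions',), and a final transform gathers the per-region temp files. Below is a minimal sketch of the same pattern in isolation; the module name mytasks and its three functions are assumptions, not part of the wgs codebase.

import pypeliner
import pypeliner.workflow
import pypeliner.managed as mgd


def create_scatter_gather_workflow(input_file, output_file):
    workflow = pypeliner.workflow.Workflow()

    # Scatter: the task returns a list of region strings, which pypeliner
    # turns into the chunks of the 'regions' axis.
    workflow.transform(
        name='split_regions',
        func='mytasks.split_regions',
        ret=mgd.OutputChunks('regions'),
        args=(mgd.InputFile(input_file),))

    # Per-region work: one job per chunk of the 'regions' axis.
    workflow.transform(
        name='process_region',
        axes=('regions',),
        func='mytasks.process_region',
        args=(
            mgd.InputInstance('regions'),
            mgd.InputFile(input_file),
            mgd.TempOutputFile('partial.txt', 'regions'),
        ))

    # Gather: merge every per-region temp file into the final output.
    workflow.transform(
        name='merge_regions',
        func='mytasks.merge_results',
        args=(
            mgd.TempInputFile('partial.txt', 'regions'),
            mgd.OutputFile(output_file),
        ))

    return workflow
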
Example No. 23
def create_remixt_workflow(
    tumour_path,
    normal_path,
    breakpoints,
    sample_id,
    remixt_results_filename,
    remixt_brk_cn_csv,
    remixt_cn_csv,
    remixt_minor_modes_csv,
    remixt_mix_csv,
    remixt_read_depth_csv,
    remixt_stats_csv,
    remixt_refdata,
    reference,
    single_node=False,
):
    ctx = {'docker_image': config.containers('wgs')}

    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    else:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(mgd.InputFile(breakpoints),
                  mgd.TempOutputFile('filtered_breakpoints.csv'),
                  params['min_num_reads']))

    if single_node:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        workflow.subworkflow(
            name='remixt',
            func="remixt.workflow.create_remixt_bam_workflow",
            ctx={'docker_image': config.containers('remixt'), 'walltime': '48:00'},
            args=(
                mgd.TempInputFile('filtered_breakpoints.csv'),
                {
                    sample_id: mgd.InputFile(tumour_path, extensions=['.bai']),
                    sample_id + 'N': mgd.InputFile(normal_path, extensions=['.bai']),
                },
                {sample_id: mgd.OutputFile(remixt_results_filename)},
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
            kwargs={'normal_id': sample_id + 'N'})

    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(
            mgd.InputFile(remixt_results_filename),
            [
                mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
                mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
            ],
            ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth', '/stats'],
            mgd.TempSpace('tempdir_parse'),
        ))

    return workflow
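
create_remixt_workflow only builds a workflow object, so it can be run directly with pypeliner once the inputs exist on disk. Below is a minimal sketch under that assumption, with this module's imports in scope; every path is a placeholder, and the scheduler options passed to Pypeline are assumptions that would normally come from the pipeline's command-line parser.

import pypeliner
import pypeliner.app

# Placeholder inputs/outputs for a single tumour/normal pair.
workflow = create_remixt_workflow(
    'SAMPLE_01.tumour.bam',
    'SAMPLE_01.normal.bam',
    'SAMPLE_01.destruct_breakpoints.csv.gz',  # or None to start from empty breakpoints
    'SAMPLE_01',
    'remixt/SAMPLE_01_results.h5',
    'remixt/SAMPLE_01_brk_cn.csv.gz',
    'remixt/SAMPLE_01_cn.csv.gz',
    'remixt/SAMPLE_01_minor_modes.csv.gz',
    'remixt/SAMPLE_01_mix.csv.gz',
    'remixt/SAMPLE_01_read_depth.csv.gz',
    'remixt/SAMPLE_01_stats.csv.gz',
    '/refdata/remixt',          # ReMixT reference data directory (placeholder)
    '/refdata/GRCh37-lite.fa',  # genome fasta with a matching .fai
    single_node=True)

# Scheduler options are assumptions; adjust to the local setup.
pyp = pypeliner.app.Pypeline(config={'tmpdir': 'remixt_tmp', 'maxjobs': 4, 'submit': 'local'})
pyp.run(workflow)
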
Example No. 24
def alignment_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    outputs_tdf = os.path.join(outdir, '{sample_id}', '{sample_id}.bam.tdf')
    metrics_output = os.path.join(outdir, '{sample_id}',
                                  '{sample_id}_metrics.csv')
    metrics_tar = os.path.join(outdir, '{sample_id}',
                               '{sample_id}_metrics.tar.gz')

    samples = list(inputs.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None)

    sample_info = helpers.get_sample_info(inputs)

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(name="align_samples",
                         func=alignment.align_samples,
                         args=(mgd.InputFile('input.r1.fastq.gz',
                                             'sample_id',
                                             'lane_id',
                                             fnames=fastqs_r1),
                               mgd.InputFile('input.r2.fastq.gz',
                                             'sample_id',
                                             'lane_id',
                                             fnames=fastqs_r2),
                               mgd.Template('output.bam',
                                            'sample_id',
                                            template=outputs),
                               mgd.Template('metrics.txt',
                                            'sample_id',
                                            template=metrics_output),
                               mgd.Template('metrics.tar',
                                            'sample_id',
                                            template=metrics_tar),
                               mgd.Template('output.bam.tdf',
                                            'sample_id',
                                            template=outputs_tdf), sample_info,
                               args['refdir']),
                         kwargs={'single_node': args['single_node']})

    outputted_filenames = helpers.expand_list(
        [outputs, metrics_output, metrics_tar], samples, "sample_id")
    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            outdir,
            outputted_filenames,
            mgd.OutputFile(meta_yaml),
        ),
        kwargs={
            'input_yaml_data': inputs,
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'alignment'},
        })

    pyp.run(workflow)
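
alignment_workflow builds and immediately runs its pipeline, so it only needs the parsed argument dictionary. Below is a minimal sketch of calling it directly; the keys shown are the ones the function reads above, while any extra pypeliner scheduler options that the real command-line parser would add are only hinted at and remain assumptions.

args = {
    'input_yaml': 'input.yaml',        # per-sample fastq definitions read by helpers.load_yaml
    'out_dir': 'alignment_output',     # destination for bams, metrics and metadata.yaml
    'refdir': '/refdata',              # reference bundle passed through to align_samples
    'single_node': False,              # forwarded to align_samples as single_node
    # plus whatever scheduler options pypeliner.app.Pypeline expects from this dict
    # (tmpdir, maxjobs, submit, ...), which are not shown in the example above
}

alignment_workflow(args)
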