Example #1
0
def fastqc_workflow(fastq_r1, fastq_r2, r1_html, r1_plot, r2_html, r2_plot):
    """Build a workflow that runs the ``run_fastqc`` task on both read files.

    Two independent jobs are registered, one for ``fastq_r1`` and one for
    ``fastq_r2``.  Each job receives its input fastq, an HTML output path,
    a plot output path and its own temp space.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    # One row per job:
    # (job name, temp space name, input fastq, html output, plot output)
    per_read_jobs = (
        ("fastqc_r1", 'fastqc_R1', fastq_r1, r1_html, r1_plot),
        ("fastqc_r2", 'fastqc_R2', fastq_r2, r2_html, r2_plot),
    )

    for job_name, scratch_name, fastq, html, plot in per_read_jobs:
        wf.transform(
            name=job_name,
            func='wgs.workflows.alignment.tasks.run_fastqc',
            ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
            args=(
                mgd.InputFile(fastq),
                mgd.OutputFile(html),
                mgd.OutputFile(plot),
                mgd.TempSpace(scratch_name),
            ),
        )

    return wf
Example #2
0
def realign_bam_workflow(args):
    """Realign the BAM of every sample in the input YAML and run the pipeline.

    ``args`` is a mapping of run options.  Keys read here are ``out_dir``,
    ``input_yaml``, ``refdir`` and ``single_node``; the whole mapping is also
    passed to ``pypeliner.app.Pypeline`` as its config.

    The input YAML is expected to map each sample id to a mapping that has an
    ``input`` entry (the BAM to realign).  For every sample the
    ``realign_bams`` subworkflow is given output targets under
    ``<out_dir>/<sample_id>/``: ``<sample_id>.bam``, ``<sample_id>.txt`` and
    ``<sample_id>.tar``.  A final job is asked to write ``metadata.yaml`` and
    ``input.yaml`` into ``out_dir``.

    Nothing is returned; the workflow is executed through ``pyp.run``.
    """
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')))

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    # Read the YAML inside a context manager so the file handle is closed
    # deterministically instead of being left for the garbage collector.
    with open(args['input_yaml']) as input_yaml_file:
        yamldata = yaml.safe_load(input_yaml_file)

    samples = list(yamldata)

    input_bams = {sample: yamldata[sample]['input'] for sample in samples}

    # Per-sample output path templates; '{sample_id}' is filled in per chunk.
    output_bams = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics = os.path.join(outdir, '{sample_id}', '{sample_id}.txt')
    metrics_tar = os.path.join(outdir, '{sample_id}', '{sample_id}.tar')

    # Declare the 'sample_id' axis used by the managed files below.
    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name="realign",
        func=realign_bams,
        ctx=helpers.get_default_ctx(),
        args=(
            samples,
            mgd.InputFile("input.bam", 'sample_id', fnames=input_bams,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.bam", 'sample_id', template=output_bams,
                           extensions=['.bai', '.tdf'], axes_origin=[]),
            mgd.OutputFile("realigned.txt", 'sample_id', template=metrics,
                           extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile("realigned.tar", 'sample_id', template=metrics_tar,
                           extensions=['.bai'], axes_origin=[]),
            args['refdir'],
        ),
        kwargs={'single_node': args['single_node']}
    )

    # Concrete per-sample paths of everything the subworkflow produces,
    # passed on to the metadata job.
    outputted_filenames = helpers.expand_list([output_bams, metrics, metrics_tar], samples, 'sample_id')

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'realignment'}
        }
    )

    pyp.run(workflow)
Example #3
0
def align_sample_no_split(
        fastq_1,
        fastq_2,
        out_file,
        samtools_flagstat,
        sample_id,
        lane_id,
        sample_info,
        refdir,
        picard_mem=None):
    """Build a three-job workflow: align, sort, then index and flagstat.

    The reference genome path is looked up under ``['paths']['reference']``
    of ``config.refdir_data(refdir)``.  The ``align_bwa_mem`` task writes a
    temporary ``aligned.bam``, ``bam_sort`` turns it into ``out_file``, and
    ``index_and_flagstat`` is given ``out_file + '.bai'`` and
    ``samtools_flagstat`` as its outputs.

    Returns the populated pypeliner workflow.
    """
    managed = pypeliner.managed

    ref_genome = config.refdir_data(refdir)['paths']['reference']
    out_bai = out_file + '.bai'

    # NOTE(review): with the default picard_mem=None this evaluates to
    # 'NoneG' -- confirm callers always pass a value.
    sort_mem = '{}G'.format(picard_mem)

    # Alignment and sorting request the same resources.
    heavy_resources = {
        'memory': 8,
        'walltime': '48:00',
        'ncpus': '8',
        'disk': 300,
    }

    wf = pypeliner.workflow.Workflow()

    wf.transform(
        name='align_bwa_mem',
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        ctx=helpers.get_default_ctx(**heavy_resources),
        args=(
            managed.InputFile(fastq_1),
            managed.InputFile(fastq_2),
            ref_genome,
            managed.TempOutputFile('aligned.bam'),
            '8',
            sample_info,
        ),
        kwargs={
            'sample_id': sample_id,
            'lane_id': lane_id,
        },
    )

    wf.transform(
        name='sort',
        func='wgs.workflows.alignment.tasks.bam_sort',
        ctx=helpers.get_default_ctx(**heavy_resources),
        args=(
            managed.TempInputFile('aligned.bam'),
            managed.OutputFile(out_file),
            managed.TempSpace('bam_sort_tempdir'),
        ),
        kwargs={
            'threads': '8',
            'mem': sort_mem,
        },
    )

    wf.transform(
        name='index_and_flagstat',
        func='wgs.workflows.alignment.tasks.index_and_flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00', disk=200),
        args=(
            managed.InputFile(out_file),
            managed.OutputFile(out_bai),
            managed.OutputFile(samtools_flagstat),
        ),
    )

    return wf
Example #4
0
def create_somatic_consensus_workflow(
    mutect_snv_vcf,
    strelka_snv_vcf,
    strelka_indel_vcf,
    museq_snv_vcf,
    consensus_maf,
    chromosomes,
    reference_vep,
    normal_id,
    tumour_id,
):
    """Build the workflow that merges caller VCFs and produces a MAF.

    Jobs, in registration order:

    * ``snv_consensus`` reads the museq, strelka SNV, mutect and strelka
      indel VCFs and writes temporary ``consensus.vcf`` and ``counts.csv``.
    * ``consensus_maf`` (subworkflow) converts ``consensus.vcf`` into a
      temporary ``consensus.maf`` using ``reference_vep`` and the two ids.
    * ``maf_counts`` combines ``consensus.maf`` and ``counts.csv`` into
      ``consensus_maf``.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    # Both transform jobs ask for the same resources.
    job_resources = {'memory': 15, 'walltime': '8:00'}

    wf.transform(
        name='snv_consensus',
        func='wgs.workflows.somatic_calling_consensus.consensus.main',
        ctx=helpers.get_default_ctx(**job_resources),
        args=(
            mgd.InputFile(museq_snv_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(mutect_snv_vcf),
            mgd.InputFile(strelka_indel_vcf),
            mgd.TempOutputFile('consensus.vcf'),
            mgd.TempOutputFile('counts.csv'),
            chromosomes,
        ),
    )

    wf.subworkflow(
        name="consensus_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.TempInputFile('consensus.vcf'),
            mgd.TempOutputFile('consensus.maf'),
            reference_vep,
        ),
        kwargs={
            'normal_id': normal_id,
            'tumour_id': tumour_id,
        },
    )

    wf.transform(
        name='maf_counts',
        func='wgs.workflows.somatic_calling_consensus.tasks.update_maf_counts',
        ctx=helpers.get_default_ctx(**job_resources),
        args=(
            mgd.TempInputFile('consensus.maf'),
            mgd.TempInputFile('counts.csv'),
            mgd.OutputFile(consensus_maf),
        ),
    )

    return wf
Example #5
0
def create_consensus_workflow(
        destruct_breakpoints,
        lumpy_vcf,
        output,
        chromosomes
):
    """Build the breakpoint consensus workflow from destruct and lumpy calls.

    Each caller's result is first parsed into a temporary CSV
    (``lumpy.csv`` / ``destruct.csv``), restricted via the ``chromosomes``
    keyword.  The ``consensus_calls`` task then combines both CSVs into
    ``output`` (declared with a ``.yaml`` extension file).  Task parameters
    come from ``config.default_params('breakpoint_calling')``.

    Returns the populated pypeliner workflow.
    """
    params = config.default_params('breakpoint_calling')
    wf = pypeliner.workflow.Workflow()

    # One row per parser job:
    # (job name, task, caller output, parsed temp file, params key)
    parser_jobs = (
        ('parse_lumpy',
         'wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
         lumpy_vcf,
         'lumpy.csv',
         "parse_lumpy"),
        ('parse_destruct',
         'wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
         destruct_breakpoints,
         'destruct.csv',
         "parse_destruct"),
    )

    for job_name, task, calls_file, parsed_name, params_key in parser_jobs:
        wf.transform(
            name=job_name,
            func=task,
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            args=(
                mgd.InputFile(calls_file),
                mgd.TempOutputFile(parsed_name),
                params[params_key],
            ),
            kwargs={'chromosomes': chromosomes},
        )

    wf.transform(
        name='consensus_breakpoint_calling',
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            params['consensus'],
        ),
    )

    return wf
Example #6
0
def create_lumpy_workflow(
        lumpy_vcf,
        tumour_bam=None,
        normal_bam=None,
        single_node=False):
    """Build the lumpy workflow for a tumour BAM, a normal BAM, or both.

    For each BAM that is supplied, a ``lumpy_preprocess_workflow``
    subworkflow produces temporary sorted discordant and splitter BAMs, and
    the final job name gains a ``_normal`` / ``_tumour`` suffix.  The final
    ``run_lumpyexpress`` job writes ``lumpy_vcf``; inputs that were not
    supplied are passed as ``None``.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    job_name = 'run_lumpy'

    # Normal sample: wrap the inputs only when a BAM was given.
    normal_disc = normal_split = None
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        job_name += '_normal'

    # Tumour sample: same treatment.
    tumour_disc = tumour_split = None
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        job_name += '_tumour'

    # The preprocessing subworkflows are registered normal first, then
    # tumour.  At this point the *_bam names hold either the managed input
    # created above or the caller's original falsy value.
    if normal_bam:
        wf.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(
                normal_bam,
                mgd.TempOutputFile('normal.discordants.sorted.bam'),
                mgd.TempOutputFile('normal.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    if tumour_bam:
        wf.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(
                tumour_bam,
                mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                mgd.TempOutputFile('tumour.splitters.sorted.bam'),
            ),
            kwargs={'single_node': single_node},
        )

    wf.transform(
        name=job_name,
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        args=(
            mgd.OutputFile(lumpy_vcf),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy'),
        },
    )

    return wf
Example #7
0
def create_svaba_workflow(
    tumour_bam,
    normal_bam,
    svaba_vcf,
    reference,
):
    """Build a single-job workflow that runs the ``run_svaba`` task.

    Of the eight VCF outputs handed to the task, only the fourth is kept as
    a real output (``svaba_vcf``); the other seven are temporary files.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    # Positional arguments, in the order the task receives them.
    svaba_args = (
        mgd.InputFile(tumour_bam),
        mgd.InputFile(normal_bam),
        mgd.TempOutputFile('germline.indel.vcf.gz'),
        mgd.TempOutputFile('germline.sv.vcf.gz'),
        mgd.TempOutputFile('somatic.indel.vcf.gz'),
        mgd.OutputFile(svaba_vcf),
        mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
        mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'),
        reference,
        mgd.TempSpace('svaba_tempdir_full'),
    )

    wf.transform(
        name='run_svaba',
        func='wgs.workflows.svaba.tasks.run_svaba',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='72:00',
            ncpus='8',
            disk=300,
        ),
        args=svaba_args,
        kwargs={'ncores': 8},
    )

    return wf
Example #8
0
def circos_plot(
        titan_calls,
        remixt_calls,
        sample_id,
        breakpoints,
        circos_plot_remixt,
        circos_plot_titan):
    """Build the workflow that prepares titan/remixt calls and plots circos.

    Two preparation jobs write the temporary files ``titan_prepped`` and
    ``remixt_prepped``; the ``circos`` task then consumes both, together
    with ``sample_id`` and ``breakpoints``, and is given
    ``circos_plot_remixt`` and ``circos_plot_titan`` as outputs.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    wf.transform(
        name='prep_titan',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        ),
    )

    wf.transform(
        name='prep_remixt',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        ),
    )

    # Positional arguments of the plotting task, in call order.
    plot_args = (
        mgd.TempInputFile("titan_prepped"),
        mgd.TempInputFile("remixt_prepped"),
        sample_id,
        breakpoints,
        mgd.OutputFile(circos_plot_remixt),
        mgd.OutputFile(circos_plot_titan),
        mgd.TempSpace("circos"),
    )
    wf.transform(
        name='circos_plot',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.workflows.sample_qc.tasks.circos',
        args=plot_args,
    )

    return wf
Example #9
0
def create_sample_qc_workflow_normal_only(
        sample_id,
        refdir,
        normal_bam,
        roh,
        germline_calls,
        genome_wide_plot,
        normal_coverage,
        chromosomes,
        bins,
        mapping_qual_threshold,
        single_node=False
):
    """Build the sample QC workflow for a normal sample without a tumour.

    The ``get_coverage_data`` subworkflow writes ``normal_coverage`` from
    ``normal_bam``.  The ``genome_wide`` task then reads ``roh``,
    ``germline_calls`` and that coverage file and writes
    ``genome_wide_plot``; it is called with ``normal_only=True``.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    coverage_args = (
        mgd.InputFile(normal_bam),
        mgd.OutputFile(normal_coverage),
        refdir,
        chromosomes,
        mapping_qual_threshold,
        bins,
    )
    wf.subworkflow(
        name='coverage_normal_data',
        func=get_coverage_data,
        args=coverage_args,
        kwargs={'single_node': single_node},
    )

    plot_args = (
        sample_id,
        mgd.InputFile(roh),
        mgd.InputFile(germline_calls),
        mgd.InputFile(normal_coverage),
        chromosomes,
        mgd.OutputFile(genome_wide_plot),
    )
    wf.transform(
        name='generate_genome_wide_plot',
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        ctx=helpers.get_default_ctx(memory=10),
        args=plot_args,
        kwargs={"normal_only": True},
    )

    return wf
Example #10
0
def lumpy_preprocess_workflow(
        bamfile,
        discordants_sorted_bam,
        splitters_sorted_bam,
        single_node=False):
    """Build the workflow that derives lumpy's discordant/splitter BAMs.

    With ``single_node`` set, one ``run_lumpy_preprocess`` job writes both
    outputs.  Otherwise four jobs are registered: two extraction jobs write
    temporary unsorted BAMs, and two ``run_samtools_sort`` jobs turn them
    into ``discordants_sorted_bam`` and ``splitters_sorted_bam``.

    Returns the populated pypeliner workflow.
    """
    wf = pypeliner.workflow.Workflow()

    if single_node:
        # Everything in one job.
        wf.transform(
            name='run_lumpy_preprocess',
            func='wgs.workflows.lumpy.tasks.run_lumpy_preprocess',
            ctx=helpers.get_default_ctx(memory=10, walltime='96:00', disk=300),
            args=(
                mgd.InputFile(bamfile),
                mgd.OutputFile(discordants_sorted_bam),
                mgd.OutputFile(splitters_sorted_bam),
                mgd.TempSpace("lumpy_preprocess_temp"),
                config.default_params('breakpoint_calling')['lumpy_paths'],
            ),
            kwargs={
                'lumpy_docker_image': config.containers('lumpy'),
                'samtools_docker_image': config.containers('samtools'),
            },
        )
        return wf

    # Extraction step: unsorted discordants via the samtools view task.
    wf.transform(
        name='run_samtools_view_normal',
        func='wgs.workflows.lumpy.tasks.run_samtools_view',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        args=(
            mgd.InputFile(bamfile),
            mgd.TempOutputFile('normal.discordants.unsorted.bam'),
        ),
        kwargs={'docker_image': config.containers('samtools')},
    )

    # Extraction step: unsorted splitters via the lumpy split-read task.
    wf.transform(
        name='run_lumpy_extract_split_reads_bwamem_normal',
        func='wgs.workflows.lumpy.tasks.run_lumpy_extract_split_reads_bwamem',
        ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
        args=(
            mgd.InputFile(bamfile),
            mgd.TempOutputFile('normal.splitters.unsorted.bam'),
            config.default_params('breakpoint_calling')['lumpy_paths'],
        ),
        kwargs={'docker_image': config.containers('lumpy')},
    )

    # Sort step: (job name, unsorted temp file, sorted output).
    sort_jobs = (
        ('run_samtools_sort_discordants_normal',
         'normal.discordants.unsorted.bam',
         discordants_sorted_bam),
        ('run_samtools_sort_splitters_normal',
         'normal.splitters.unsorted.bam',
         splitters_sorted_bam),
    )
    for job_name, unsorted_name, sorted_bam in sort_jobs:
        wf.transform(
            name=job_name,
            func='wgs.workflows.lumpy.tasks.run_samtools_sort',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00'),
            args=(
                mgd.TempInputFile(unsorted_name),
                mgd.OutputFile(sorted_bam),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )

    return wf
Example #11
0
def get_coverage_data(
        input_bam, output, refdir, chromosomes,
        mapping_qual, bins, single_node=False
):
    """Build the workflow that computes coverage for ``input_bam``.

    The reference path is read from ``['paths']['reference']`` of
    ``config.refdir_data(refdir)``.

    With ``single_node`` set, one bed file is generated for all
    ``chromosomes`` and a single ``samtools_coverage`` job runs on it.
    Otherwise both jobs run once per chromosome and ``concatenate_csv``
    merges the per-chromosome results into ``output``.

    Returns the populated pypeliner workflow.
    """
    reference = config.refdir_data(refdir)['paths']['reference']

    wf = pypeliner.workflow.Workflow()

    if single_node:
        wf.transform(
            name='generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                bins,
            ),
        )
        # NOTE(review): this branch declares a 'chromosome' axis on the temp
        # output without defining that axis, and never writes ``output`` --
        # confirm whether the result should go to mgd.OutputFile(output).
        wf.transform(
            name='samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )
        return wf

    # Per-chromosome mode: declare the axis, fan out, then merge.
    wf.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    wf.transform(
        name='generate_coverage_bed',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
        axes=('chromosome',),
        args=(
            reference,
            mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
            mgd.InputInstance('chromosome'),
            bins,
        ),
    )

    wf.transform(
        name='samtools_coverage',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.workflows.sample_qc.tasks.samtools_coverage',
        axes=('chromosome',),
        args=(
            mgd.InputFile(input_bam),
            mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
            mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            mapping_qual,
        ),
    )

    wf.transform(
        name='merge_data',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
            mgd.OutputFile(output),
        ),
    )

    return wf
Example #12
0
def get_coverage_data(input_bam, output, refdir, single_node=False):
    """Build the workflow that computes coverage for ``input_bam``.

    The chromosome list and the chromosome-sizes file are read from
    ``config.refdir_data(refdir)`` (``['params']['chromosomes']`` and
    ``['paths']['chrom_sizes']``).

    With ``single_node`` set, one bed file is generated for all chromosomes
    and a single ``samtools_coverage`` job runs on it.  Otherwise both jobs
    run once per chromosome and ``concatenate_csv`` merges the
    per-chromosome results into ``output``.

    Returns the populated pypeliner workflow.
    """
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    chrom_sizes = config.refdir_data(refdir)['paths']['chrom_sizes']

    wf = pypeliner.workflow.Workflow()

    if single_node:
        wf.transform(
            name='generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            args=(
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                mgd.InputFile(chrom_sizes),
            ),
        )
        # NOTE(review): this branch declares a 'chromosome' axis on the temp
        # output without defining that axis, and never writes ``output`` --
        # confirm whether the result should go to mgd.OutputFile(output).
        wf.transform(
            name='samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )
        return wf

    # Per-chromosome mode: declare the axis, fan out, then merge.
    wf.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    wf.transform(
        name='generate_coverage_bed',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
        axes=('chromosome', ),
        args=(
            mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.InputFile(chrom_sizes),
        ),
    )

    wf.transform(
        name='samtools_coverage',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.workflows.postprocessing.tasks.samtools_coverage',
        axes=('chromosome', ),
        args=(
            mgd.InputFile(input_bam),
            mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
            mgd.TempOutputFile('per_interval.txt', 'chromosome'),
        ),
        kwargs={'docker_image': config.containers('samtools')},
    )

    wf.transform(
        name='merge_data',
        ctx=helpers.get_default_ctx(memory=5),
        func='wgs.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
            mgd.OutputFile(output),
        ),
    )

    return wf
Example #13
0
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf,
        sample_id, reference, chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    """Build the titan copy-number workflow for a tumour/normal BAM pair.

    Settings come from ``config.default_params('copynumber_calling')``.
    Every entry of its ``titan_intervals`` list contributes one
    ``(num_clusters, ploidy)`` chunk, and the titan jobs run once per chunk.

    Jobs, in registration order:

    * ``generate_intervals`` -- returns the ``interval`` chunks.
    * ``run_museq`` -- one job over all intervals when ``single_node`` is
      truthy; otherwise one job per interval followed by ``merge_vcfs``.
      Either way the result is ``museq_vcf``.
    * ``convert_museq_vcf2counts`` -- ``museq_vcf`` plus ``het_positions``
      into a temporary counts file.
    * ``run_readcounter_tumour`` / ``run_readcounter_normal`` -- one
      temporary wig file per BAM.
    * ``calc_correctreads_wig`` -- combines both wig files with ``gc_wig``,
      ``map_wig`` and the optional ``targets``.
    * Per chunk: ``run_titan``, ``plot_titan``, ``calc_cnsegments_titan``,
      ``annot_pygenes``, ``parse_titan``.
    * ``select_optimal_solution`` -- receives the per-chunk results and the
      output paths ``segs``, ``igv_segs``, ``params``, ``outfile``,
      ``parsed`` and ``plots``.
    * ``tar_all_data`` -- receives the per-chunk results and ``tar_outputs``.

    ``targets`` is wrapped as an input file when truthy and passed as
    ``None`` otherwise.

    Returns the populated pypeliner workflow.
    """
    cn_params = config.default_params('copynumber_calling')

    # One (num_clusters, ploidy) pair per configured titan interval.
    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    # Declare the two axes the per-chunk titan jobs run over.
    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    # The return value of this job defines the 'interval' axis.
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00', ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        # Single job: receives all interval chunks and writes museq_vcf.
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        # One job per interval, each writing a temporary VCF and log ...
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        # ... followed by a merge of the per-interval VCFs into museq_vcf.
        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00', ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    # Produces 'museq_postprocess.txt', the first input of run_titan.
    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    # Read counting, once per BAM, each into its own temporary wig file.
    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    # Produces 'correct_reads.txt', the second input of run_titan.
    # ``targets`` is either a managed input file or None (see above).
    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    # From here to parse_titan, every job runs once per
    # (numclusters, ploidy) chunk.
    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00', ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    # This job has no axes of its own and is handed the per-chunk files.
    # NOTE(review): 'titan_igv' is the only per-chunk input here without
    # axes_origin=[] -- confirm that the difference is intentional.
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    # Receives the same per-chunk inputs as select_optimal_solution and
    # writes tar_outputs.
    # NOTE(review): same 'titan_igv' axes_origin difference as above.
    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
Example #14
0
def create_museq_workflow(snv_vcf,
                          museqportrait_pdf,
                          reference,
                          chromosomes,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
    """Assemble the museq calling workflow and its museqportrait report.

    Jobs registered, in order: ``generate_intervals``, the museq run itself
    (a single job when ``single_node`` is truthy, otherwise one job per
    interval followed by ``merge_vcfs``), ``finalise_snvs`` and
    ``run_museqportrait``.

    :param snv_vcf: path of the finalised VCF output (registered together
        with its ``.tbi`` and ``.csi`` extensions)
    :param museqportrait_pdf: path of the museqportrait PDF output
    :param reference: reference passed to the interval and museq tasks
    :param chromosomes: chromosomes passed to ``generate_intervals``
    :param thousand_genomes: forwarded to ``run_museqportrait``
    :param dbsnp: forwarded to ``run_museqportrait``
    :param germline_refdata: forwarded to ``run_museqportrait``
    :param tumour_bam: optional tumour bam path (``.bai`` is tracked too)
    :param normal_bam: optional normal bam path (``.bai`` is tracked too)
    :param single_node: when truthy, run museq as one job instead of
        one job per interval
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    # The museq job name records which bams were supplied; only the
    # tumour+normal combination counts as a paired (non-single) run.
    job_name = 'run_museq'
    if tumour_bam:
        job_name += '_tumour'
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
    if normal_bam:
        job_name += '_normal'
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
    is_single_sample = job_name != 'run_museq_tumour_normal'

    vc_params = config.default_params('variant_calling')

    # Bam arguments shared by both flavours of the museq job.
    bam_kwargs = {'tumour_bam': tumour_bam, 'normal_bam': normal_bam}

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')},
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': vc_params['split_size']},
    )

    if single_node:
        # One job receives every interval and writes the merged VCF itself.
        workflow.transform(
            name=job_name,
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='48:00',
                ncpus='8',
                disk=600,
            ),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                vc_params['museq_params'],
            ),
            kwargs=dict(
                bam_kwargs,
                museq_docker_image=config.containers('mutationseq'),
                vcftools_docker_image=config.containers('vcftools'),
            ),
        )
    else:
        # One museq job per interval, then a separate merge job.
        workflow.transform(
            name=job_name,
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval', ),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                vc_params['museq_params'],
            ),
            kwargs=dict(
                bam_kwargs,
                docker_image=config.containers('mutationseq'),
            ),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')},
        )

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_image': config.containers('vcftools')},
    )

    portrait_kwargs = {
        'docker_image': config.containers('mutationseq'),
        'thousand_genomes': thousand_genomes,
        'dbsnp': dbsnp,
        'germline_refdata': germline_refdata,
        'germline_plot_threshold': vc_params['germline_portrait_threshold'],
    }

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(memory=5, walltime='8:00'),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            is_single_sample,
        ),
        kwargs=portrait_kwargs,
    )

    return workflow
Example #15
0
def variant_calling_workflow(args):
    """Build and run the top-level variant calling pipeline.

    Reads the input YAML named by ``args['input_yaml']``, derives the
    per-sample output paths under ``<out_dir>/variants/{sample_id}/`` and
    registers a single ``variant_calling`` subworkflow:

    * ``call_germlines_only`` when at least one sample has no (falsy)
      tumour entry;
    * ``call_variants`` otherwise, followed by a job that writes the
      metadata YAML and a copy of the input YAML.

    :param args: parsed command-line arguments; the keys read here are
        ``input_yaml``, ``out_dir``, ``refdir``, ``single_node`` and (paired
        mode only) ``is_exome``. The whole mapping is also handed to
        ``pypeliner.app.Pypeline`` as its config.
    :return: None; the workflow is executed via ``pyp.run``.
    """
    inputs = helpers.load_yaml(args['input_yaml'])

    out_dir = args['out_dir']
    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    var_dir = os.path.join(out_dir, 'variants')

    def sample_template(suffix):
        # Path template <var_dir>/{sample_id}/{sample_id}<suffix>.
        return os.path.join(var_dir, '{sample_id}', '{sample_id}' + suffix)

    def sample_output(name, template):
        # Per-sample managed output resolved from a path template.
        return mgd.OutputFile(name,
                              'sample_id',
                              template=template,
                              axes_origin=[])

    def sample_bam(name, fnames):
        # Per-sample managed bam input with its .bai index tracked as well.
        return mgd.InputFile(name,
                             'sample_id',
                             fnames=fnames,
                             extensions=['.bai'],
                             axes_origin=[])

    museq_vcf = sample_template('_museq_paired_annotated.vcf.gz')
    museq_ss_vcf = sample_template('_museq_single_annotated.vcf.gz')

    samtools_germline_vcf = sample_template('_samtools_germline.vcf.gz')
    samtools_roh = sample_template('_roh.csv')

    strelka_snv_vcf = sample_template('_strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = sample_template('_strelka_indel_annotated.vcf.gz')
    museq_paired_pdf = sample_template('_paired_museqportrait.pdf')
    museq_single_pdf = sample_template('_single_museqportrait.pdf')

    somatic_csv = sample_template('_consensus_somatic.csv.gz')
    somatic_snpeff = sample_template('_consensus_somatic_snpeff.csv.gz')
    somatic_ma = sample_template('_consensus_somatic_ma.csv.gz')
    somatic_ids = sample_template('_consensus_somatic_ids.csv.gz')

    indel_csv = sample_template('_indel.csv.gz')
    indel_snpeff = sample_template('_indel_snpeff.csv.gz')
    indel_ma = sample_template('_indel_ma.csv.gz')
    indel_ids = sample_template('_indel_ids.csv.gz')

    germline_csv = sample_template('_germline.csv.gz')
    germline_snpeff = sample_template('_germline_snpeff.csv.gz')
    germline_ma = sample_template('_germline_ma.csv.gz')
    germline_ids = sample_template('_germline_ids.csv.gz')

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if not all(tumours.values()):
        # At least one sample lacks a tumour: germline-only calling.
        # NOTE(review): no metadata job is registered on this path --
        # confirm that this is intentional.
        workflow.subworkflow(
            name='variant_calling',
            func=call_germlines_only,
            args=(
                samples,
                sample_bam("normal.bam", normals),
                sample_output('museq_ss', museq_ss_vcf),
                sample_output('samtools_germline', samtools_germline_vcf),
                sample_output('samtools_roh', samtools_roh),
                sample_output('museq_single_pdf', museq_single_pdf),
                args['refdir'],
            ),
            kwargs={'single_node': args['single_node']})
    else:
        workflow.subworkflow(
            name='variant_calling',
            func=call_variants,
            args=(
                samples,
                sample_output('somatic_csv', somatic_csv),
                sample_output('somatic_snpeff', somatic_snpeff),
                sample_output('somatic_ma', somatic_ma),
                sample_output('somatic_ids', somatic_ids),
                sample_output('indel_csv', indel_csv),
                sample_output('indel_snpeff', indel_snpeff),
                sample_output('indel_ma', indel_ma),
                sample_output('indel_ids', indel_ids),
                sample_output('germline_csv', germline_csv),
                sample_output('germline_snpeff', germline_snpeff),
                sample_output('germline_ma', germline_ma),
                sample_output('germline_ids', germline_ids),
                sample_bam("tumour.bam", tumours),
                sample_bam("normal.bam", normals),
                sample_output('museq', museq_vcf),
                sample_output('museq_ss', museq_ss_vcf),
                sample_output('samtools_germline', samtools_germline_vcf),
                sample_output('roh_calls', samtools_roh),
                sample_output('strelka_snv', strelka_snv_vcf),
                sample_output('strelka_indel', strelka_indel_vcf),
                sample_output('museq_paired_pdf', museq_paired_pdf),
                sample_output('museq_single_pdf', museq_single_pdf),
                args['refdir'],
            ),
            kwargs={
                'single_node': args['single_node'],
                'is_exome': args['is_exome'],
            })

        # NOTE(review): samtools_germline_vcf and samtools_roh are produced
        # above but are not listed here -- confirm whether they should be
        # included in the metadata.
        filenames = [
            somatic_csv, somatic_snpeff, somatic_ma, somatic_ids, indel_csv,
            indel_snpeff, indel_ma, indel_ids, germline_csv, germline_snpeff,
            germline_ma, germline_ids, museq_vcf, museq_ss_vcf,
            strelka_snv_vcf, strelka_indel_vcf, museq_paired_pdf,
            museq_single_pdf
        ]

        outputted_filenames = helpers.expand_list(filenames, samples,
                                                  "sample_id")

        workflow.transform(
            name='generate_meta_files_results',
            func='wgs.utils.helpers.generate_and_upload_metadata',
            args=(sys.argv[0:], out_dir, outputted_filenames,
                  mgd.OutputFile(meta_yaml)),
            kwargs={
                # Reuse the YAML already parsed at the top of the function
                # instead of reading the file from disk a second time.
                'input_yaml_data': inputs,
                'input_yaml': mgd.OutputFile(input_yaml_blob),
                'metadata': {
                    'type': 'variant_calling'
                }
            })

    pyp.run(workflow)
Example #16
0
def create_annotation_workflow(
    input_vcf,
    annotated_vcf,
    snpeff,
    mutationassessor,
    dbsnp,
    thousand_genomes,
    cosmic,
    mappability,
):
    """Build the VCF annotation workflow.

    The jobs form a linear chain through temporary files, each one reading
    the previous job's output: ``run_snpeff`` -> ``run_mutation_assessor``
    -> ``run_DBSNP`` -> ``run_1000gen`` -> ``run_cosmic`` ->
    ``low_mappability_flag`` -> ``finalize``.

    :param input_vcf: path of the VCF to annotate
    :param annotated_vcf: path of the final output (registered together
        with its ``.csi`` and ``.tbi`` extensions)
    :param snpeff: value stored as ``snpeff_params.snpeff_config``
    :param mutationassessor: value stored as ``mutation_assessor_params.db``
    :param dbsnp: value stored as ``dbsnp_params.db``
    :param thousand_genomes: value stored as ``thousandgen_params.db``
    :param cosmic: value stored as ``cosmic_params.db``
    :param mappability: value stored as ``mappability_ref`` and passed to
        the low-mappability flagging task
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    # Single bundle of database settings; the annotation tasks each receive
    # the whole dict.
    databases = {
        'snpeff_params': {
            'snpeff_config': snpeff,
        },
        'mutation_assessor_params': {
            'db': mutationassessor
        },
        'dbsnp_params': {
            'db': dbsnp
        },
        'thousandgen_params': {
            'db': thousand_genomes
        },
        'cosmic_params': {
            'db': cosmic
        },
        'mappability_ref': mappability
    }

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_snpeff',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.vcf_annotation.tasks.run_snpeff',
        args=(
            mgd.InputFile(input_vcf),
            mgd.TempOutputFile('annotSnpEff.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_mutation_assessor',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='8:00',
        ),
        func='wgs.workflows.vcf_annotation.tasks.run_mutation_assessor',
        args=(
            mgd.TempInputFile('annotSnpEff.vcf'),
            mgd.TempOutputFile('annotMA.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_DBSNP',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.vcf_annotation.tasks.run_DBSNP',
        args=(
            mgd.TempInputFile('annotMA.vcf'),
            mgd.TempOutputFile('flagDBsnp.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_1000gen',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.vcf_annotation.tasks.run_1000gen',
        args=(
            mgd.TempInputFile('flagDBsnp.vcf'),
            mgd.TempOutputFile('flag1000gen.vcf'),
            databases,
        ),
    )

    workflow.transform(
        name='run_cosmic',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.vcf_annotation.tasks.run_cosmic',
        args=(
            mgd.TempInputFile('flag1000gen.vcf'),
            mgd.TempOutputFile('cosmic.vcf'),
            databases,
        ),
    )

    # Unlike the steps above, this task only receives the mappability
    # reference rather than the whole databases dict.
    workflow.transform(
        name='low_mappability_flag',
        func='wgs.workflows.vcf_annotation.tasks.flag_low_mappability',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        args=(
            mgd.TempInputFile('cosmic.vcf'),
            mgd.TempOutputFile('low_mapp.vcf'),
            databases['mappability_ref'],
        ),
    )

    workflow.transform(
        name='finalize',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('low_mapp.vcf'),
            mgd.OutputFile(annotated_vcf, extensions=['.csi', '.tbi']),
        ),
    )

    return workflow
Example #17
0
def create_postprocessing_workflow(normal_bam,
                                   tumour_bam,
                                   titan,
                                   remixt,
                                   breakpoints_consensus,
                                   roh,
                                   germline_calls,
                                   somatic_calls,
                                   circos_plot_remixt,
                                   circos_plot_titan,
                                   genome_wide_plot,
                                   refdir,
                                   sample_id,
                                   single_node=False):
    """Build the per-sample postprocessing (plotting) workflow.

    Registers coverage subworkflows for the normal and tumour bams, a
    ``parse_roh`` job, and then the ``generate_genome_wide_plot`` and
    ``generate_circos_plot`` jobs. When a truthy remixt entry exists for
    ``sample_id`` it is passed to both plotting jobs as an extra keyword
    argument; otherwise the plots are registered without remixt input.

    :param normal_bam: normal bam path (coverage input)
    :param tumour_bam: tumour bam path (coverage input)
    :param titan: mapping indexed by ``sample_id`` giving the titan calls
    :param remixt: mapping indexed by ``sample_id`` giving the remixt calls
        (a falsy entry disables the remixt inputs)
    :param breakpoints_consensus: mapping indexed by ``sample_id`` giving
        the SV calls
    :param roh: mapping indexed by ``sample_id`` giving the ROH calls
    :param germline_calls: mapping indexed by ``sample_id``
    :param somatic_calls: mapping indexed by ``sample_id``
    :param circos_plot_remixt: name of the remixt circos plot output
    :param circos_plot_titan: name of the titan circos plot output
    :param genome_wide_plot: path of the genome-wide plot output
    :param refdir: reference directory; supplies the ideogram path and the
        chromosome list via ``config.refdir_data``
    :param sample_id: sample to pull out of the per-sample mappings
    :param single_node: forwarded to the coverage subworkflows
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    # Look the reference metadata up once and reuse it for both sections.
    refdir_data = config.refdir_data(refdir)
    refdir_paths = refdir_data['paths']
    refdir_params = refdir_data['params']

    ideogram = refdir_paths["ideogram"]

    titan_calls = titan[sample_id]
    remixt_calls = remixt[sample_id]
    sv_calls = breakpoints_consensus[sample_id]
    roh_calls = roh[sample_id]
    germline_vcf = germline_calls[sample_id]
    somatic_calls = somatic_calls[sample_id]
    chromosomes = refdir_params['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(name='coverage_normal_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(normal_bam),
                             mgd.TempOutputFile('normal_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.subworkflow(name='coverage_tumour_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(tumour_bam),
                             mgd.TempOutputFile('tumour_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.transform(
        name='parse_roh',
        ctx=helpers.get_default_ctx(memory=5),
        func="wgs.workflows.postprocessing.tasks.parse_roh",
        args=(
            mgd.InputFile(roh_calls),
            mgd.TempOutputFile("ROH_parsed"),
        ),
    )

    # Both plotting jobs are the same with or without remixt; only their
    # keyword arguments differ, so each job is described once and the remixt
    # inputs are added afterwards when available.
    genome_wide_job = dict(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10, ),
        func="wgs.workflows.postprocessing.tasks.genome_wide",
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempInputFile("ROH_parsed"),
            mgd.InputFile(germline_vcf),
            mgd.InputFile(somatic_calls),
            mgd.TempInputFile('tumour_coverage'),
            mgd.TempInputFile('normal_coverage'),
            mgd.InputFile(sv_calls),
            mgd.InputFile(ideogram),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
    )

    # NOTE(review): both circos plots are registered as temp outputs even
    # though they arrive as parameters -- confirm whether mgd.OutputFile was
    # intended.
    circos_job = dict(
        name='generate_circos_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.postprocessing.tasks.circos",
        args=(
            mgd.InputFile(titan_calls),
            sample_id,
            mgd.InputFile(sv_calls),
            mgd.TempOutputFile(circos_plot_remixt),
            mgd.TempOutputFile(circos_plot_titan),
            mgd.TempSpace('circos'),
        ),
        kwargs={'docker_image': config.containers('circos')},
    )

    if remixt_calls:
        # Without remixt the genome-wide job is registered with no kwargs at
        # all, exactly as before.
        genome_wide_job['kwargs'] = {
            "remixt": mgd.InputFile(remixt_calls),
            "remixt_label": sample_id
        }
        circos_job['kwargs']['remixt_calls'] = mgd.InputFile(remixt_calls)

    workflow.transform(**genome_wide_job)
    workflow.transform(**circos_job)

    return workflow
Example #18
0
def collect_bam_metrics(bam,
                        markdups_metrics,
                        sample_id,
                        refdir,
                        metrics,
                        picard_insert_metrics,
                        picard_insert_pdf,
                        flagstat_metrics,
                        picard_gc_metrics,
                        picard_gc_summary,
                        picard_gc_pdf,
                        picard_wgs_metrics,
                        bam_tdf,
                        picard_mem=8):
    '''
    Build the workflow that collects QC metrics for a single bam.

    Jobs registered:
    1. picard insert metrics (this job also writes the flagstat output)
    2. picard GC metrics
    3. picard wgs metrics
    4. igvtools tdf
    5. collection of the flagstat, insert, wgs and markdups metrics into
       one metrics file

    :param bam: input bam path
    :param markdups_metrics: existing markdups metrics file (input to the
        collection job)
    :param sample_id: sample identifier passed to the collection job
    :param refdir: reference directory; supplies the reference genome path
        and the reference type via ``config.refdir_data``
    :param metrics: collected metrics output (registered together with its
        ``.yaml`` extension)
    :param picard_insert_metrics: picard insert metrics output
    :param picard_insert_pdf: picard insert metrics pdf output
    :param flagstat_metrics: flagstat output
    :param picard_gc_metrics: picard GC metrics output
    :param picard_gc_summary: picard GC summary output
    :param picard_gc_pdf: picard GC pdf output
    :param picard_wgs_metrics: picard wgs metrics output
    :param bam_tdf: igvtools tdf output
    :param picard_mem: memory handed to the picard tasks, formatted as
        ``'<picard_mem>G'``
    :return: the populated ``pypeliner.workflow.Workflow``
    '''

    ref_genome = config.refdir_data(refdir)['paths']['reference']

    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']

    reftype = config.refdir_data(refdir)['params']['reference_type']

    # Same memory string for every picard job.
    picard_mem_arg = '{}G'.format(picard_mem)

    # Fetch the dtype tables once; two of their entries are used below.
    metrics_dtypes = dtypes()

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': picard_mem_arg})

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_gc_metrics),
            mgd.OutputFile(picard_gc_summary),
            mgd.OutputFile(picard_gc_pdf),
            mgd.TempSpace('picard_gc'),
        ),
        kwargs={'mem': picard_mem_arg})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(
            mgd.InputFile(bam),
            ref_genome,
            mgd.OutputFile(picard_wgs_metrics),
            picard_wgs_params,
            mgd.TempSpace('picard_wgs'),
        ),
        kwargs={'mem': picard_mem_arg})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(bam_tdf),
            reftype,
        ),
    )

    # Consumes the outputs of the insert and wgs jobs above plus the
    # externally supplied markdups metrics.
    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(
            mgd.InputFile(flagstat_metrics),
            mgd.InputFile(picard_insert_metrics),
            mgd.InputFile(picard_wgs_metrics),
            mgd.InputFile(markdups_metrics),
            mgd.OutputFile(metrics, extensions=['.yaml']),
            sample_id,
        ),
        kwargs={
            'main_dtypes': metrics_dtypes['metrics'],
            'insert_dtypes': metrics_dtypes['insert_metrics']
        })

    return workflow
Example #19
0
def align_sample_split(fastq_1,
                       fastq_2,
                       out_file,
                       samtools_flagstat,
                       sample_id,
                       lane_id,
                       sample_info,
                       refdir,
                       picard_mem=2):
    """Build the split-and-align workflow for one pair of fastq files.

    Both fastqs are split along a ``split`` axis, each chunk pair is passed
    to the ``align_bwa_mem`` task and then sorted, and the sorted chunks are
    merged into ``out_file``. The merged bam is then indexed and
    flagstat-ed via ``samtools`` command-line jobs.

    :param fastq_1: first fastq path
    :param fastq_2: second fastq path
    :param out_file: merged bam output path; the index is written to
        ``out_file + '.bai'``
    :param samtools_flagstat: path receiving the ``samtools flagstat`` output
    :param sample_id: forwarded to the alignment task
    :param lane_id: forwarded to the alignment task
    :param sample_info: forwarded to the alignment task
    :param refdir: reference directory; supplies the reference genome path
        via ``config.refdir_data``
    :param picard_mem: memory value given to the sort and merge tasks
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    managed = pypeliner.managed

    reference = config.refdir_data(refdir)['paths']['reference']
    chunk_size = config.default_params('alignment')['split_size']
    bam_index = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            managed.InputFile(fastq_1),
            managed.TempOutputFile('read_1', 'split'),
            chunk_size,
        ),
    )

    # Unlike read_1, the read_2 output is declared with axes_origin=[].
    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00'),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            managed.InputFile(fastq_2),
            managed.TempOutputFile('read_2', 'split', axes_origin=[]),
            chunk_size,
        ),
    )

    workflow.transform(
        name='align_bwa_mem',
        axes=('split', ),
        ctx=helpers.get_default_ctx(memory=8, walltime='16:00', ncpus=8),
        func='wgs.workflows.alignment.tasks.align_bwa_mem',
        args=(
            managed.TempInputFile('read_1', 'split'),
            managed.TempInputFile('read_2', 'split'),
            reference,
            managed.TempOutputFile('aligned.bam', 'split'),
            '8',
            sample_info,
        ),
        kwargs={'sample_id': sample_id, 'lane_id': lane_id},
    )

    workflow.transform(
        name='sort',
        axes=('split', ),
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(
            managed.TempInputFile('aligned.bam', 'split'),
            managed.TempOutputFile('sorted.bam', 'split'),
            managed.TempSpace('bam_sort_by_split', 'split'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)},
    )

    # NOTE(review): 'mem' is passed unformatted here but as '<n>G' to the
    # sort job above -- confirm which form merge_bams expects.
    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(memory=8, walltime='72:00'),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(
            managed.TempInputFile('sorted.bam', 'split'),
            managed.OutputFile(out_file),
            managed.TempSpace('bam_merge_by_split'),
        ),
        kwargs={'mem': picard_mem},
    )

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=(
            'samtools',
            'index',
            managed.InputFile(out_file),
            managed.OutputFile(bam_index),
        ),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='16:00'),
        args=(
            'samtools',
            'flagstat',
            managed.InputFile(out_file),
            '>',
            managed.OutputFile(samtools_flagstat),
        ),
    )

    return workflow
Example #20
0
def align_samples(
    fastqs_r1,
    fastqs_r2,
    bam_outputs,
    metrics_outputs,
    metrics_tar,
    bam_tdf,
    sample_info,
    refdir,
    single_node=False,
    picard_mem=8,
):
    """Build the workflow that aligns every lane and finalises each sample.

    Jobs are registered in this order: per-lane FastQC, per-lane alignment,
    per-sample lane merge, duplicate marking, BAM metrics collection and a
    tarball of the metric and FastQC reports.

    Args:
        fastqs_r1: mapping keyed by ``(sample_id, lane_id)`` to read-1 fastqs.
        fastqs_r2: mapping with the same keys to read-2 fastqs.
        bam_outputs: per-sample filenames for the duplicate-marked BAM.
        metrics_outputs: per-sample filenames for the metrics output.
        metrics_tar: per-sample filenames for the report tarball.
        bam_tdf: per-sample filenames for the ``out.bam.tdf`` output.
        sample_info: value stored as the ``sampleinfo`` temp object.
        refdir: forwarded to the alignment and metrics subworkflows.
        single_node: selects ``align_sample_no_split`` when true, otherwise
            ``align_sample_split``.
        picard_mem: forwarded to the alignment subworkflow and to the merge
            task as-is, and to the markdups task formatted as ``'<n>G'``.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    align_func = align_sample_no_split if single_node else align_sample_split

    if not isinstance(bam_outputs, dict):
        # Non-dict lookups are narrowed to plain dicts covering exactly the
        # samples that appear in the fastq keys.
        samples = sorted({lane_key[0] for lane_key in fastqs_r1.keys()})

        def only_known_samples(lookup):
            return {sample: lookup[sample] for sample in samples}

        bam_outputs = only_known_samples(bam_outputs)
        metrics_outputs = only_known_samples(metrics_outputs)
        metrics_tar = only_known_samples(metrics_tar)
        bam_tdf = only_known_samples(bam_tdf)

    lane_axes = ('sample_id', 'lane_id')
    fastqc_reports = (
        'fastqc_R1.html',
        'fastqc_R1.pdf',
        'fastqc_R2.html',
        'fastqc_R2.pdf',
    )
    metric_reports = (
        'picard_insert_metrics.txt',
        'picard_insert_metrics.pdf',
        'flagstat_metrics.txt',
        'picard_gc_metrics.txt',
        'picard_gc_summary.txt',
        'picard_gc.pdf',
        'picard_wgs_metrics.txt',
    )

    def lane_fastq(label, fnames):
        # A fresh managed input is built for every job that consumes it.
        return mgd.InputFile(label, *lane_axes, fnames=fnames)

    workflow = pypeliner.workflow.Workflow()

    sample_info_obj = mgd.TempOutputObj('sampleinfo',
                                        'sample_id',
                                        axes_origin=[])
    workflow.setobj(obj=sample_info_obj, value=sample_info)

    workflow.setobj(
        obj=mgd.OutputChunks(*lane_axes),
        value=list(fastqs_r1.keys()),
    )

    fastqc_args = (
        lane_fastq('input.r1.fastq.gz', fastqs_r1),
        lane_fastq('input.r2.fastq.gz', fastqs_r2),
    ) + tuple(
        mgd.TempOutputFile(report, *lane_axes) for report in fastqc_reports)
    workflow.subworkflow(
        name='fastqc_workflow',
        func=fastqc_workflow,
        axes=lane_axes,
        args=fastqc_args,
    )

    align_args = (
        lane_fastq('input.r1.fastq.gz', fastqs_r1),
        lane_fastq('input.r2.fastq.gz', fastqs_r2),
        mgd.TempOutputFile('aligned_lanes.bam', *lane_axes),
        mgd.TempOutputFile('samtools_flagstat.txt', *lane_axes),
        mgd.InputInstance("sample_id"),
        mgd.InputInstance("lane_id"),
        mgd.TempInputObj('sampleinfo', 'sample_id'),
        refdir,
    )
    workflow.subworkflow(
        name='align_samples',
        func=align_func,
        axes=lane_axes,
        args=align_args,
        kwargs={'picard_mem': picard_mem},
    )

    # Collapse the lane axis: one merged BAM per sample.
    # NOTE(review): this TempSpace carries no 'sample_id' axis although the
    # job runs per sample -- confirm the directory is not shared.
    merge_ctx = helpers.get_default_ctx(memory=10, walltime='24:00', disk=400)
    workflow.transform(
        name='merge_tumour_lanes',
        ctx=merge_ctx,
        func="wgs.workflows.alignment.tasks.merge_bams",
        axes=('sample_id', ),
        args=(
            mgd.TempInputFile('aligned_lanes.bam', *lane_axes),
            mgd.TempOutputFile('merged_lanes.bam',
                               'sample_id',
                               extensions=['.bai']),
            mgd.TempSpace('merge_tumour_lanes_tempdir'),
        ),
        kwargs={'mem': picard_mem},
    )

    markdups_ctx = helpers.get_default_ctx(memory=12,
                                           walltime='24:00',
                                           ncpus=1,
                                           disk=300)
    workflow.transform(
        name='markdups',
        ctx=markdups_ctx,
        func='wgs.workflows.alignment.tasks.markdups',
        axes=('sample_id', ),
        args=(
            mgd.TempInputFile('merged_lanes.bam',
                              'sample_id',
                              extensions=['.bai']),
            mgd.OutputFile('markdups.bam',
                           'sample_id',
                           fnames=bam_outputs,
                           extensions=['.bai']),
            mgd.TempOutputFile('markdups_metrics', 'sample_id'),
            pypeliner.managed.TempSpace("temp_markdups", "sample_id"),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)},
    )

    metrics_args = (
        mgd.InputFile('markdups.bam',
                      'sample_id',
                      fnames=bam_outputs,
                      extensions=['.bai']),
        mgd.TempInputFile('markdups_metrics', 'sample_id'),
        mgd.InputInstance('sample_id'),
        refdir,
        mgd.OutputFile('metrics_output',
                       'sample_id',
                       fnames=metrics_outputs,
                       extensions=['.yaml']),
    ) + tuple(
        mgd.TempOutputFile(report, 'sample_id') for report in metric_reports
    ) + (mgd.OutputFile('out.bam.tdf', 'sample_id', fnames=bam_tdf), )
    workflow.subworkflow(
        name='metrics',
        func=collect_bam_metrics,
        axes=('sample_id', ),
        args=metrics_args,
    )

    # Tarball contents: the metric reports, the markdups metrics, then the
    # per-lane FastQC reports, in that order.
    # NOTE(review): 'wgs_metrics' TempSpace also has no 'sample_id' axis.
    tar_target = mgd.OutputFile('metrics_tar', 'sample_id', fnames=metrics_tar)
    tar_members = [
        mgd.TempInputFile(report, 'sample_id') for report in metric_reports
    ]
    tar_members.append(mgd.TempInputFile('markdups_metrics', 'sample_id'))
    tar_members.extend(
        mgd.TempInputFile(report, *lane_axes) for report in fastqc_reports)
    workflow.transform(
        name='tar',
        func='wgs.utils.helpers.make_tar_from_files',
        axes=('sample_id', ),
        args=(tar_target, tar_members, mgd.TempSpace('wgs_metrics')),
    )

    return workflow
Example #21
0
def create_consensus_workflow(museq_germline, museq_snv, strelka_snv,
                              strelka_indel, somatic_calls, somatic_snpeff,
                              somatic_ma, somatic_ids, indel_calls,
                              indel_snpeff, indel_ma, indel_ids,
                              germline_calls, germline_snpeff, germline_ma,
                              germline_ids, refdir):
    """Build the workflow that parses caller VCFs and merges the SNV tables.

    Four ``parse_vcf`` jobs are registered first (Museq germline, Strelka
    indel, Museq SNV, Strelka SNV), followed by four ``merge_overlap`` jobs
    that combine the Strelka and Museq SNV tables.

    Args:
        museq_germline, museq_snv, strelka_snv, strelka_indel: input VCFs,
            each declared with ``.csi`` and ``.tbi`` companion files.
        somatic_calls, somatic_snpeff, somatic_ma, somatic_ids: outputs of
            the four merge jobs.
        indel_calls, indel_snpeff, indel_ma, indel_ids: outputs of parsing
            the Strelka indel VCF.
        germline_calls, germline_snpeff, germline_ma, germline_ids: outputs
            of parsing the Museq germline VCF.
        refdir: reference directory used to look up the chromosome list.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """
    params = config.default_params('variant_calling')
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    parse_func = 'wgs.workflows.variant_calling_consensus.tasks.parse_vcf'
    merge_func = 'wgs.workflows.variant_calling_consensus.tasks.merge_overlap'

    def job_ctx():
        # Every job in this workflow requests the same resources.
        return helpers.get_default_ctx(memory=15, walltime='8:00')

    def final_table(path):
        return mgd.OutputFile(path, extensions=['.yaml'])

    def temp_table(label):
        return mgd.TempOutputFile(label, extensions=['.yaml'])

    def add_parse_job(job_name, vcf_path, make_table, targets, params_key,
                      tempdir):
        # targets are ordered: calls, snpeff, ma, ids.
        workflow.transform(
            name=job_name,
            ctx=job_ctx(),
            func=parse_func,
            args=(mgd.InputFile(vcf_path, extensions=['.csi', '.tbi']), ) +
            tuple(make_table(target) for target in targets) +
            (params[params_key], chromosomes, mgd.TempSpace(tempdir)),
        )

    def add_merge_job(job_name, temp_labels, merged_path, on=None):
        # The 'on' kwarg is only passed when a join key list is given.
        extra = {}
        if on is not None:
            extra['kwargs'] = {'on': list(on)}
        workflow.transform(
            name=job_name,
            ctx=job_ctx(),
            func=merge_func,
            args=(
                [
                    mgd.TempInputFile(label, extensions=['.yaml'])
                    for label in temp_labels
                ],
                final_table(merged_path),
            ),
            **extra)

    add_parse_job(
        'parse_museq_germlines', museq_germline, final_table,
        (germline_calls, germline_snpeff, germline_ma, germline_ids),
        "parse_museq", "tempdir_parse_germlines")

    add_parse_job(
        'parse_strelka_indel', strelka_indel, final_table,
        (indel_calls, indel_snpeff, indel_ma, indel_ids),
        "parse_strelka", "tempdir_strelka_indel")

    add_parse_job(
        'parse_museq_snv', museq_snv, temp_table,
        ('museq_snv.csv', 'museq_snpeff.csv', 'museq_ma.csv',
         'museq_ids.csv'),
        "parse_museq", "tempdir_parse_museq_snv")

    add_parse_job(
        'parse_strelka_snv', strelka_snv, temp_table,
        ('strelka_snv.csv', 'strelka_snv_snpeff.csv', 'strelka_snv_ma.csv',
         'strelka_snv_ids.csv'),
        "parse_strelka", "tempdir_parse_strelka_snv")

    chrom_pos = ('chrom', 'pos')

    add_merge_job('merge_snvs', ('strelka_snv.csv', 'museq_snv.csv'),
                  somatic_calls)
    add_merge_job('merge_snpeff',
                  ('strelka_snv_snpeff.csv', 'museq_snpeff.csv'),
                  somatic_snpeff,
                  on=chrom_pos)
    add_merge_job('merge_ma', ('strelka_snv_ma.csv', 'museq_ma.csv'),
                  somatic_ma,
                  on=chrom_pos)
    add_merge_job('merge_ids', ('strelka_snv_ids.csv', 'museq_ids.csv'),
                  somatic_ids,
                  on=chrom_pos)

    return workflow
Example #22
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    """Build the Strelka somatic calling workflow for one tumour/normal pair.

    The workflow generates calling regions, runs Strelka either as a single
    job or once per region, normalizes and filters the SNV and indel VCFs,
    and finally converts both filtered VCFs to MAF.

    Args:
        normal_bam_file: normal BAM path (declared with a ``.bai`` companion).
        tumour_bam_file: tumour BAM path (declared with a ``.bai`` companion).
        snv_vcf_file: output path for the filtered SNV VCF.
        snv_maf_file: output path for the SNV MAF.
        indel_vcf_file: output path for the filtered indel VCF.
        indel_maf_file: output path for the indel MAF.
        reference: reference genome, passed to interval generation, base
            counting, calling and normalization.
        reference_vep: passed to the vcf2maf subworkflows.
        chromosomes: chromosomes passed to interval generation and to the
            known-chromosome-size lookup.
        normal_id: forwarded to the vcf2maf subworkflows.
        tumour_id: forwarded to the vcf2maf subworkflows.
        single_node: when true, call everything in one ``strelka_one_node``
            job instead of one job per region.
        is_exome: forwarded to the calling task in both execution modes.

    Returns:
        The populated workflow.
    """
    params = config.default_params('variant_calling')

    workflow = Workflow(ctx=helpers.get_default_ctx(memory=5,
                                                    walltime='4:00'), )

    # Split the genome into the 'regions' axis used by the calling jobs.
    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        # One job receives all regions and writes the merged, compressed
        # VCFs directly.
        workflow.transform(name='strelka_one_node',
                           func="wgs.workflows.strelka.tasks.strelka_one_node",
                           args=(
                               pypeliner.managed.InputFile(normal_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               pypeliner.managed.InputFile(tumour_bam_file,
                                                           extensions=['.bai'
                                                                       ]),
                               reference,
                               mgd.TempOutputFile('indels.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempOutputFile('snvs.vcf.gz',
                                                  extensions=['.tbi', '.csi']),
                               mgd.TempSpace('call_genome_segment_tmp'),
                               mgd.InputChunks('regions'),
                               mgd.TempInputObj('known_sizes'),
                           ),
                           kwargs={
                               'is_exome': is_exome,
                           })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                # Previously hard-coded to False, which silently dropped the
                # caller's flag unless single_node was set.
                'is_exome': is_exome,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    # Both branches leave 'snvs.vcf.gz' and 'indels.vcf.gz' as temp files;
    # everything below is shared post-processing.
    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    # The filtered VCFs written above are re-read here as inputs for the
    # MAF conversion.
    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
Example #23
0
def call_variants(samples,
                  somatic_calls,
                  somatic_snpeff,
                  somatic_ma,
                  somatic_ids,
                  indel_calls,
                  indel_snpeff,
                  indel_ma,
                  indel_ids,
                  germline_calls,
                  germline_snpeff,
                  germline_ma,
                  germline_ids,
                  tumours,
                  normals,
                  museq_vcf,
                  museq_ss_vcf,
                  samtools_germlines_vcf,
                  roh_calls,
                  strelka_snv_vcf,
                  strelka_indel_vcf,
                  museq_paired_pdf,
                  museq_single_pdf,
                  refdir,
                  single_node=False,
                  is_exome=False):
    """Build the per-sample variant calling workflow.

    Subworkflows are registered in this order: paired Museq, single-sample
    Museq, samtools germline, Strelka, five VCF annotation runs, and
    consensus calling.

    Args:
        samples: sample ids; used as the ``sample_id`` axis chunks and to
            narrow every per-sample output mapping below.
        somatic_*, indel_*, germline_*: per-sample consensus outputs.
        tumours, normals: per-sample BAM filename lookups.
        museq_vcf, museq_ss_vcf, samtools_germlines_vcf, strelka_snv_vcf,
            strelka_indel_vcf: per-sample annotated VCF outputs.
        roh_calls: per-sample ``roh_calls.csv.gz`` outputs.
        museq_paired_pdf, museq_single_pdf: per-sample Museq PDF outputs.
        refdir: reference directory; its ``params`` and ``paths`` sections
            are read here and it is forwarded to consensus calling.
        single_node: forwarded to the Museq, samtools and Strelka
            subworkflows.
        is_exome: forwarded to the Strelka subworkflow.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.
    """

    def per_sample(lookup):
        # Plain dict restricted to exactly the requested sample ids.
        return {sampid: lookup[sampid] for sampid in samples}

    strelka_snv_vcf = per_sample(strelka_snv_vcf)
    strelka_indel_vcf = per_sample(strelka_indel_vcf)
    museq_vcf = per_sample(museq_vcf)
    museq_ss_vcf = per_sample(museq_ss_vcf)
    samtools_germlines_vcf = per_sample(samtools_germlines_vcf)
    roh_calls = per_sample(roh_calls)

    museq_paired_pdf = per_sample(museq_paired_pdf)
    museq_single_pdf = per_sample(museq_single_pdf)

    somatic_calls = per_sample(somatic_calls)
    somatic_snpeff = per_sample(somatic_snpeff)
    somatic_ma = per_sample(somatic_ma)
    somatic_ids = per_sample(somatic_ids)

    indel_calls = per_sample(indel_calls)
    indel_snpeff = per_sample(indel_snpeff)
    indel_ma = per_sample(indel_ma)
    indel_ids = per_sample(indel_ids)

    germline_calls = per_sample(germline_calls)
    germline_snpeff = per_sample(germline_snpeff)
    germline_ma = per_sample(germline_ma)
    germline_ids = per_sample(germline_ids)

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    default_ctx = helpers.get_default_ctx(
        docker_image=config.containers('wgs'))
    workflow = pypeliner.workflow.Workflow(ctx=default_ctx)

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    sample_axis = ('sample_id', )

    def normal_bam():
        # A fresh managed input for each subworkflow that reads the normal.
        return mgd.InputFile("normal.bam",
                             'sample_id',
                             fnames=normals,
                             extensions=['.bai'],
                             axes_origin=[])

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=sample_axis,
        args=(
            mgd.TempOutputFile("museq_snv.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_paired_pdf',
                           'sample_id',
                           fnames=museq_paired_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam",
                                        'sample_id',
                                        fnames=tumours,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'normal_bam': normal_bam(),
            'single_node': single_node,
        })

    # Single-sample mode: no tumour BAM, plus germline reference data.
    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=sample_axis,
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf',
                           'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': normal_bam(),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.'
        'create_samtools_germline_workflow',
        axes=sample_axis,
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv.gz", 'sample_id',
                           fnames=roh_calls),
            normal_bam(),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=sample_axis,
        args=(
            mgd.InputFile('normal_bam',
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam',
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai']),
            mgd.TempOutputFile('strelka_indel.vcf.gz', 'sample_id'),
            mgd.TempOutputFile('strelka_snv.vcf.gz', 'sample_id'),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome,
        },
    )

    # (job name, raw temp VCF, annotated output label, output filenames)
    annotation_jobs = (
        ("annotate_paired_museq", "museq_snv.vcf.gz",
         'museq_snv_ann.vcf.gz', museq_vcf),
        ("annotate_germline_museq", "museq_germlines.vcf.gz",
         'museq_germlines_ann.vcf.gz', museq_ss_vcf),
        ("annotate_germline_samtools", "samtools_germlines.vcf.gz",
         "samtools_germlines_ann.vcf.gz", samtools_germlines_vcf),
        ("annotate_strelka", "strelka_snv.vcf.gz",
         'strelka_snv_ann.vcf.gz', strelka_snv_vcf),
        ("annotate_strelka_indel", "strelka_indel.vcf.gz",
         'strelka_indel_ann.vcf.gz', strelka_indel_vcf),
    )
    for job_name, raw_vcf, annotated_vcf, annotated_fnames in annotation_jobs:
        # NOTE(review): snpeff_docker is given the 'vcftools' container --
        # confirm that is intended.
        workflow.subworkflow(
            name=job_name,
            func='wgs.workflows.vcf_annotation.create_annotation_workflow',
            axes=sample_axis,
            args=(
                mgd.TempInputFile(raw_vcf, 'sample_id'),
                mgd.OutputFile(annotated_vcf,
                               'sample_id',
                               extensions=['.csi', '.tbi'],
                               fnames=annotated_fnames),
                paths_refdir['snpeff_config'],
                paths_refdir['mutation_assessor'],
                paths_refdir['dbsnp'],
                paths_refdir['thousand_genomes'],
                paths_refdir['cosmic'],
                paths_refdir['blacklist'],
            ),
            kwargs={
                'vcftools_docker': config.containers('vcftools'),
                'snpeff_docker': config.containers('vcftools'),
            })

    # Consensus calling re-reads the annotated VCFs written by the loop.
    consensus_inputs = (
        ("museq_germlines_ann.vcf.gz", museq_ss_vcf),
        ("museq_snv_ann.vcf.gz", museq_vcf),
        ("strelka_snv_ann.vcf.gz", strelka_snv_vcf),
        ("strelka_indel_ann.vcf.gz", strelka_indel_vcf),
    )
    consensus_outputs = (
        ('somatic_csv', somatic_calls),
        ('somatic_snpeff', somatic_snpeff),
        ('somatic_ma', somatic_ma),
        ('somatic_ids', somatic_ids),
        ('indel_csv', indel_calls),
        ('indel_snpeff', indel_snpeff),
        ('indel_ma', indel_ma),
        ('indel_ids', indel_ids),
        ('germline_csv', germline_calls),
        ('germline_snpeff', germline_snpeff),
        ('germline_ma', germline_ma),
        ('germline_ids', germline_ids),
    )
    consensus_args = tuple(
        mgd.InputFile(label, 'sample_id', fnames=fnames)
        for label, fnames in consensus_inputs
    ) + tuple(
        mgd.OutputFile(label, 'sample_id', fnames=fnames)
        for label, fnames in consensus_outputs
    ) + (refdir, )

    workflow.subworkflow(
        name="consensus_calling",
        func='wgs.workflows.variant_calling_consensus.'
        'create_consensus_workflow',
        axes=sample_axis,
        args=consensus_args,
    )

    return workflow
Example #24
0
def create_destruct_wgs_workflow(
        tumour_bam,
        normal_bam,
        raw_breakpoints,
        raw_library,
        breakpoints,
        library,
        reads,
        sample_id,
        reference,
        destruct_refdata,
        gtf,
        mappability,
        single_node=False,
):
    """Assemble the destruct breakpoint workflow for one tumour/normal pair.

    Jobs are registered in this order: build the destruct config, run
    destruct (one local job when ``single_node`` is truthy, otherwise the
    destruct package's own subworkflow), filter/annotate the raw breakpoints
    using ``sample_id + 'N'`` as the control id, flag mappability, and pass
    the four tables through ``finalize_csv``.

    :param tumour_bam: tumour BAM path, registered as an input file
    :param normal_bam: normal BAM path, registered as an input file
    :param raw_breakpoints: output path for the finalized raw breakpoints
    :param raw_library: output path for the finalized raw library table
    :param breakpoints: output path for the finalized, mappability-flagged
        breakpoints
    :param library: output path for the finalized filtered library table
    :param reads: output path handed to destruct for its reads file
    :param sample_id: tumour id; the normal is labelled ``sample_id + 'N'``
    :param reference: genome fasta; ``reference + '.fai'`` is used as its index
    :param destruct_refdata: reference data argument forwarded to destruct
    :param gtf: stored in the destruct config as ``gtf_filename``
    :param mappability: forwarded to the mappability flagging task
    :param single_node: run destruct as one local job instead of a subworkflow
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    destruct_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
        'gtf_filename': gtf,
    }

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')},
    )

    workflow.transform(
        name="get_destruct_config",
        func="destruct.defaultconfig.get_config",
        ctx=helpers.get_default_ctx(
            docker_image=config.containers('destruct'),
            walltime="48:00",
        ),
        ret=mgd.TempOutputObj("destruct_config"),
        args=(destruct_refdata, destruct_config),
    )

    if not single_node:
        workflow.subworkflow(
            name='destruct_parallel',
            ctx=helpers.get_default_ctx(
                docker_image=config.containers('destruct'),
                walltime="48:00",
            ),
            # refers to the separate destruct package
            func='destruct.workflow.create_destruct_workflow',
            args=(
                {
                    sample_id: mgd.InputFile(tumour_bam),
                    sample_id + 'N': mgd.InputFile(normal_bam),
                },
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ),
        )
    else:
        # NOTE(review): the task is told to use 16 cpus but the job ctx does
        # not request any -- confirm whether ctx should also carry ncpus.
        workflow.transform(
            name='destruct_local',
            ctx=helpers.get_default_ctx(walltime='120:00', disk=800),
            func='wgs.workflows.destruct_wgs.tasks.run_destruct_local',
            args=(
                mgd.TempSpace("destruct_local_temp"),
                mgd.InputFile(tumour_bam),
                mgd.InputFile(normal_bam),
                sample_id,
                mgd.TempOutputFile("raw_breakpoints"),
                mgd.TempOutputFile("raw_library"),
                mgd.OutputFile(reads),
                mgd.TempInputObj("destruct_config"),
                destruct_refdata,
            ),
            kwargs={
                'ncpus': 16,
                'docker_image': config.containers('destruct'),
            },
        )

    workflow.commandline(
        name='filter_annotate_breakpoints',
        ctx=helpers.get_default_ctx(
            docker_image=config.containers('destruct'),
            memory=8,
            walltime='8:00',
        ),
        args=(
            'filter_annotate_breakpoints.py',
            '--breakpoints', mgd.TempInputFile("raw_breakpoints"),
            '--library', mgd.TempInputFile("raw_library"),
            '--control_ids', sample_id + 'N',
            '--out_breakpoints',
            mgd.TempOutputFile("filter_annotate_breakpoints_output"),
            '--out_library', mgd.TempOutputFile("library"),
        ),
    )

    workflow.transform(
        name='mappability_annotate_breakpoints',
        ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
        func='wgs.workflows.destruct_wgs.flag_mappability.main',
        args=(
            mgd.TempInputFile("filter_annotate_breakpoints_output"),
            mgd.TempOutputFile("breakpoints"),
            mappability,
        ),
    )

    # (job name, temp file produced upstream, final destination); the order
    # here is the order in which the jobs are registered.
    finalize_jobs = (
        ('finalize_raw_breakpoints', "raw_breakpoints", raw_breakpoints),
        ('finalize_raw_library', "raw_library", raw_library),
        ('finalize_breakpoints', "breakpoints", breakpoints),
        ('finalize_library', "library", library),
    )
    for job_name, temp_name, final_path in finalize_jobs:
        workflow.transform(
            name=job_name,
            ctx=helpers.get_default_ctx(memory=8, walltime='8:00'),
            func="wgs.utils.csvutils.finalize_csv",
            args=(
                mgd.TempInputFile(temp_name),
                mgd.OutputFile(final_path, extensions=['.yaml']),
            ),
        )

    return workflow
Example #25
0
def call_germlines_only(
        samples,
        normals,
        museq_ss_vcf,
        samtools_germline_vcf,
        roh_calls,
        museq_single_pdf,
        refdir,
        single_node=False,
):
    """Build the germline-only calling workflow over the ``sample_id`` axis.

    For every sample the workflow runs museq with only a normal BAM
    (``tumour_bam`` is ``None``), runs the samtools germline subworkflow, and
    annotates both resulting VCFs.

    :param samples: sample ids; they become the chunks of the ``sample_id``
        axis and select the entries kept from the per-sample dicts below
    :param normals: per-sample normal BAM paths (used as ``fnames``)
    :param museq_ss_vcf: per-sample output paths for the annotated museq VCF
    :param samtools_germline_vcf: per-sample output paths for the annotated
        samtools VCF
    :param roh_calls: per-sample output paths for the ROH calls
    :param museq_single_pdf: per-sample output paths for the museq PDF
    :param refdir: reference directory resolved via ``config.refdir_data``
    :param single_node: forwarded to the museq and samtools subworkflows
    :return: the populated ``pypeliner.workflow.Workflow``
    :raises KeyError: if a sample id is missing from one of the path dicts
    """
    # Restrict every per-sample mapping to the requested samples.
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    museq_single_pdf = {
        sampid: museq_single_pdf[sampid] for sampid in samples
    }
    samtools_germline_vcf = {
        sampid: samtools_germline_vcf[sampid] for sampid in samples
    }
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(
        ctx=helpers.get_default_ctx(docker_image=config.containers('wgs')),
    )

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile(
                'museq_single_pdf', 'sample_id', fnames=museq_single_pdf,
            ),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam': None,
            'normal_bam': mgd.InputFile(
                "normal.bam",
                'sample_id',
                fnames=normals,
                extensions=['.bai'],
                axes_origin=[],
            ),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        },
    )

    workflow.subworkflow(
        name="samtools_germline",
        func=(
            'wgs.workflows.samtools_germline'
            '.create_samtools_germline_workflow'
        ),
        axes=('sample_id',),
        args=(
            mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile("roh_calls.csv", 'sample_id', fnames=roh_calls),
            mgd.InputFile(
                "normal.bam",
                'sample_id',
                fnames=normals,
                extensions=['.bai'],
                axes_origin=[],
            ),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={'single_node': single_node},
    )

    # (job name, raw temp VCF, managed name of annotated VCF, output paths)
    annotation_jobs = (
        (
            "annotate_germline_museq",
            "museq_germlines.vcf.gz",
            'museq_germlines_ann.vcf.gz',
            museq_ss_vcf,
        ),
        (
            "annotate_germline_samtools",
            "samtools_germlines.vcf.gz",
            "samtools_germlines_anno.vcf.gz",
            samtools_germline_vcf,
        ),
    )
    for job_name, raw_vcf, annotated_vcf, annotated_paths in annotation_jobs:
        workflow.subworkflow(
            name=job_name,
            func='wgs.workflows.vcf_annotation.create_annotation_workflow',
            axes=('sample_id',),
            args=(
                mgd.TempInputFile(raw_vcf, 'sample_id'),
                mgd.OutputFile(
                    annotated_vcf,
                    'sample_id',
                    extensions=['.csi', '.tbi'],
                    fnames=annotated_paths,
                ),
                paths_refdir['snpeff_config'],
                paths_refdir['mutation_assessor'],
                paths_refdir['dbsnp'],
                paths_refdir['thousand_genomes'],
                paths_refdir['cosmic'],
                paths_refdir['blacklist'],
            ),
            kwargs={
                'vcftools_docker': config.containers('vcftools'),
                # NOTE(review): snpeff_docker is set from the 'vcftools'
                # container -- confirm this is intended.
                'snpeff_docker': config.containers('vcftools'),
            },
        )

    return workflow
Example #26
0
def create_hmmcopy_workflow(
        bam_file,
        sample_id,
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        hmmcopy_table,
        pygenes_table,
        chromosomes,
        map_wig,
        gc_wig,
        pygenes_gtf,
):
    """Build the five-step hmmcopy workflow for a single BAM.

    Steps, in registration order: read counting into a wig file, correction,
    the hmmcopy run itself, plotting, and annotation of the segments.
    Parameters for the tasks come from the ``copynumber_calling`` section of
    ``config.default_params()``.

    :param bam_file: input BAM path (a ``.bai`` companion is declared)
    :param sample_id: forwarded to the ``run_hmmcopy`` task
    :param bias_pdf: output path, produced by the plotting task
    :param correction_pdf: output path, produced by the plotting task
    :param hmmcopy_pdf: output path, produced by the plotting task
    :param hmmcopy_table: output path, produced by the ``run_hmmcopy`` task
    :param pygenes_table: output path, produced by the annotation task
    :param chromosomes: forwarded to the read counter task
    :param map_wig: forwarded to the correction task
    :param gc_wig: forwarded to the correction task
    :param pygenes_gtf: forwarded to the annotation task
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    cn_params = config.default_params()['copynumber_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='hmmcopy_readcounter',
        ctx=helpers.get_default_ctx(memory=5, walltime='2:00'),
        func='wgs.workflows.hmmcopy.tasks.hmmcopy_readcounter',
        args=(
            mgd.InputFile(bam_file, extensions=['.bai']),
            mgd.TempOutputFile('infile.wig'),
            chromosomes,
            cn_params['readcounter'],
        ),
    )

    # The remaining jobs pass no explicit ctx, as in the original code.
    workflow.transform(
        name='calc_corr',
        func='wgs.workflows.hmmcopy.tasks.calc_corr',
        args=(
            mgd.TempInputFile('infile.wig'),
            mgd.TempOutputFile('infile_copy.txt'),
            mgd.TempOutputFile('infile_copy.obj'),
            gc_wig,
            map_wig,
            cn_params['map_cutoff'],
        ),
    )

    workflow.transform(
        name='run_hmmcopy',
        func='wgs.workflows.hmmcopy.tasks.run_hmmcopy',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('infile_copy.txt'),
            mgd.TempOutputFile('hmmcopy_res.obj'),
            mgd.TempOutputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(hmmcopy_table),
            sample_id,
            cn_params['hmmcopy_params'],
        ),
    )

    workflow.transform(
        name='plot_hmm',
        func='wgs.workflows.hmmcopy.tasks.plot_hmm',
        args=(
            mgd.TempInputFile('infile_copy.obj'),
            mgd.TempInputFile('hmmcopy_res.obj'),
            mgd.TempSpace('correction_plots_dir'),
            mgd.TempSpace('hmmcopy_plots_dir'),
            mgd.OutputFile(bias_pdf),
            mgd.OutputFile(correction_pdf),
            mgd.OutputFile(hmmcopy_pdf),
        ),
    )

    workflow.transform(
        name='annot_hmm',
        func='wgs.workflows.hmmcopy.tasks.annot_hmm',
        args=(
            mgd.TempInputFile('hmmcopy_segments.txt'),
            mgd.OutputFile(pygenes_table),
            pygenes_gtf,
        ),
    )

    return workflow
Example #27
0
def create_mutect_workflow(
        normal_bam,
        tumour_bam,
        snv_vcf,
        snv_maf,
        reference,
        reference_vep,
        chromosomes,
        normal_id,
        tumour_id,
        single_node=None,
):
    """Build the mutect calling workflow for one tumour/normal pair.

    Intervals are generated first and become the ``interval`` axis. Calling
    then runs either once per interval followed by a merge, or as a single
    job covering all intervals when ``single_node`` is truthy. The merged VCF
    is normalized, finalised to ``snv_vcf`` and converted to ``snv_maf``.

    :param normal_bam: normal BAM path, registered as an input file
    :param tumour_bam: tumour BAM path, registered as an input file
    :param snv_vcf: output path for the finalised VCF (``.tbi``/``.csi``
        companions are declared)
    :param snv_maf: output path of the vcf2maf subworkflow
    :param reference: forwarded to interval generation, calling and
        normalization
    :param reference_vep: forwarded to the vcf2maf subworkflow
    :param chromosomes: forwarded to interval generation
    :param normal_id: forwarded to the vcf2maf subworkflow
    :param tumour_id: forwarded to the vcf2maf subworkflow
    :param single_node: truthy to call all intervals in one job
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutect.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(memory=5, walltime='1:00'),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']},
    )

    if not single_node:
        # One calling job per interval, then merge the per-interval VCFs.
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(memory=15, walltime='24:00'),
            axes=('interval',),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(
                mgd.TempOutputFile('mutect.vcf', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
                mgd.TempSpace('mutect_temp', 'interval'),
            ),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(memory=15, walltime='8:00'),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )
    else:
        # A single job receives every interval and writes the merged VCF.
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(
                memory=15, walltime='48:00', ncpus=8, disk=600,
            ),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(
                mgd.TempSpace("run_mutect_temp"),
                mgd.TempOutputFile('merged.vcf'),
                reference,
                mgd.InputChunks('interval'),
                mgd.InputFile(normal_bam),
                mgd.InputFile(tumour_bam),
            ),
        )

    workflow.transform(
        name='bcftools_normalize',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcfutils.bcftools_normalize',
        args=(
            mgd.TempInputFile('merged.vcf'),
            mgd.TempOutputFile('normalized.vcf'),
            reference,
        ),
    )

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00'),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    # NOTE(review): the job name mentions strelka indels although this job
    # converts the mutect SNV VCF; kept unchanged -- confirm before renaming.
    workflow.subworkflow(
        name="strelka_indel_maf",
        func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_maf),
            reference_vep,
        ),
        kwargs={
            'tumour_id': tumour_id,
            'normal_id': normal_id,
        },
    )

    return workflow
Example #28
0
def realign_bam_files(
        inputs,
        outputs,
        metrics_output,
        metrics_tar,
        refdir,
        samples,
        single_node=False,
        ignore_bamtofastq_exception=False,
        picard_mem=8,
):
    """Build the workflow that realigns existing BAMs sample by sample.

    Each input BAM is split into per-readgroup fastq pairs, its read group
    info is extracted, and everything is handed to
    ``alignment.align_samples``.

    :param inputs: per-sample input BAM paths
    :param outputs: per-sample output BAM paths; the tdf output path is the
        BAM path with ``'.tdf'`` appended
    :param metrics_output: per-sample metrics csv output paths
    :param metrics_tar: per-sample metrics tar output paths
    :param refdir: forwarded to the alignment subworkflow
    :param samples: sample ids; they become the ``sample_id`` chunks and
        select the entries kept from the dicts above
    :param single_node: forwarded to the alignment subworkflow
    :param ignore_bamtofastq_exception: forwarded to the split task
    :param picard_mem: forwarded to the alignment subworkflow
    :return: the populated ``pypeliner.workflow.Workflow``
    :raises KeyError: if a sample id is missing from one of the path dicts
    """
    # Restrict every per-sample mapping to the requested samples.
    inputs = {sample: inputs[sample] for sample in samples}
    outputs = {sample: outputs[sample] for sample in samples}
    outputs_tdf = {sample: outputs[sample] + '.tdf' for sample in samples}

    metrics_output = {sample: metrics_output[sample] for sample in samples}
    metrics_tar = {sample: metrics_tar[sample] for sample in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.transform(
        name='bam_to_fastq',
        ctx=helpers.get_default_ctx(walltime='96:00', disk=500),
        func="wgs.workflows.realignment.tasks.split_by_rg",
        axes=('sample_id',),
        args=(
            mgd.InputFile('input.bam', 'sample_id', fnames=inputs),
            mgd.TempOutputFile(
                "inputdata_read1.fastq.gz", 'sample_id', "readgroup",
            ),
            # Only read2 passes axes_origin=[]; read1 keeps the default.
            mgd.TempOutputFile(
                "inputdata_read2.fastq.gz",
                'sample_id',
                "readgroup",
                axes_origin=[],
            ),
            mgd.TempSpace("bamtofastq", 'sample_id'),
            ignore_bamtofastq_exception,
        ),
    )

    workflow.transform(
        name='get_sample_info',
        func="wgs.workflows.realignment.tasks.get_read_group",
        axes=('sample_id',),
        ret=mgd.TempOutputObj('sample_info', 'sample_id'),
        args=(
            mgd.InputFile('input.bam', 'sample_id', fnames=inputs),
        ),
    )

    workflow.subworkflow(
        name='align_samples',
        func=alignment.align_samples,
        args=(
            mgd.TempInputFile(
                "inputdata_read1.fastq.gz",
                "sample_id",
                "readgroup",
                axes_origin=[],
            ),
            mgd.TempInputFile(
                "inputdata_read2.fastq.gz",
                "sample_id",
                "readgroup",
                axes_origin=[],
            ),
            mgd.OutputFile(
                'output.bam',
                'sample_id',
                fnames=outputs,
                extensions=['.bai'],
                axes_origin=[],
            ),
            mgd.OutputFile(
                'output_metrics.csv',
                'sample_id',
                fnames=metrics_output,
                extensions=['.yaml'],
                axes_origin=[],
            ),
            mgd.OutputFile(
                'output_metrics.tar',
                'sample_id',
                fnames=metrics_tar,
                axes_origin=[],
            ),
            mgd.OutputFile(
                'output.bam.tdf',
                'sample_id',
                fnames=outputs_tdf,
                axes_origin=[],
            ),
            mgd.TempInputObj('sample_info', 'sample_id', axes_origin=[]),
            refdir,
        ),
        kwargs={
            'single_node': single_node,
            'picard_mem': picard_mem,
        },
    )

    return workflow
Example #29
0
def create_remixt_workflow(
        tumour_path,
        normal_path,
        breakpoints,
        sample_id,
        remixt_results_filename,
        remixt_brk_cn_csv,
        remixt_cn_csv,
        remixt_minor_modes_csv,
        remixt_mix_csv,
        remixt_read_depth_csv,
        remixt_stats_csv,
        remixt_refdata,
        reference,
        single_node=False,
):
    """Build the remixt workflow for one tumour/normal pair.

    A filtered breakpoint file is prepared first (from ``breakpoints``, or
    from an empty list when ``breakpoints`` is ``None``). Remixt then runs
    either as one local job (``single_node`` truthy) or through the remixt
    package's own subworkflow, and its results file is parsed into six CSVs.

    :param tumour_path: tumour BAM path (a ``.bai`` companion is declared)
    :param normal_path: normal BAM path (a ``.bai`` companion is declared)
    :param breakpoints: breakpoint file to filter, or ``None``
    :param sample_id: tumour id; the normal is labelled ``sample_id + 'N'``
    :param remixt_results_filename: output path of the remixt results file
    :param remixt_brk_cn_csv: output path for the ``/brk_cn`` table
    :param remixt_cn_csv: output path for the ``/cn`` table
    :param remixt_minor_modes_csv: output path for the ``/minor_modes`` table
    :param remixt_mix_csv: output path for the ``/mix`` table
    :param remixt_read_depth_csv: output path for the ``/read_depth`` table
    :param remixt_stats_csv: output path for the ``/stats`` table
    :param remixt_refdata: reference data argument forwarded to remixt
    :param reference: genome fasta; ``reference + '.fai'`` is used as its index
    :param single_node: run remixt as one local job instead of a subworkflow
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    workflow_ctx = {'docker_image': config.containers('wgs')}

    remixt_params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=workflow_ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is not None:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(
                mgd.InputFile(breakpoints),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
                remixt_params['min_num_reads'],
            ),
        )
    else:
        # No breakpoints supplied: write the filtered file from an empty list.
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    if not single_node:
        workflow.subworkflow(
            name='remixt',
            func="remixt.workflow.create_remixt_bam_workflow",
            ctx={
                'docker_image': config.containers('remixt'),
                'walltime': '48:00',
            },
            args=(
                mgd.TempInputFile('filtered_breakpoints.csv'),
                {
                    sample_id: mgd.InputFile(
                        tumour_path, extensions=['.bai'],
                    ),
                    sample_id + 'N': mgd.InputFile(
                        normal_path, extensions=['.bai'],
                    ),
                },
                {sample_id: mgd.OutputFile(remixt_results_filename)},
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
            kwargs={'normal_id': sample_id + 'N'},
        )
    else:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(
                memory=15, walltime='120:00', ncpus=8,
            ),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )

    # Output paths and table keys are listed in the same order so that each
    # key lines up positionally with its destination file.
    parsed_csv_paths = (
        remixt_brk_cn_csv,
        remixt_cn_csv,
        remixt_minor_modes_csv,
        remixt_mix_csv,
        remixt_read_depth_csv,
        remixt_stats_csv,
    )
    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(
            mgd.InputFile(remixt_results_filename),
            [
                mgd.OutputFile(csv_path, extensions=['.yaml'])
                for csv_path in parsed_csv_paths
            ],
            [
                '/brk_cn',
                '/cn',
                '/minor_modes',
                '/mix',
                '/read_depth',
                '/stats',
            ],
            mgd.TempSpace('tempdir_parse'),
        ),
    )

    return workflow
Example #30
0
def create_sample_qc_workflow(
    sample_id,
    refdir,
    normal_bam,
    tumour_bam,
    titan,
    remixt,
    breakpoints_consensus,
    roh,
    germline_calls,
    somatic_calls,
    genome_wide_plot,
    normal_coverage,
    tumour_coverage,
    chromosomes,
    bins,
    mapping_qual_threshold,
    single_node=False,
):
    """Build the per-sample QC workflow ending in the genome-wide plot.

    Coverage is computed for the normal and then the tumour BAM via
    ``get_coverage_data``; both coverage files are then inputs to the
    ``genome_wide`` plotting task together with the call files.

    :param sample_id: forwarded to the plotting task
    :param refdir: forwarded to both coverage subworkflows
    :param normal_bam: normal BAM path, input to the normal coverage job
    :param tumour_bam: tumour BAM path, input to the tumour coverage job
    :param titan: input file passed to the plot as ``titan``
    :param remixt: input file passed to the plot as ``remixt``
    :param breakpoints_consensus: input file passed to the plot as
        ``breakpoints``
    :param roh: input file passed positionally to the plot
    :param germline_calls: input file passed positionally to the plot
    :param somatic_calls: input file passed to the plot as ``somatic``
    :param genome_wide_plot: output path of the plot
    :param normal_coverage: coverage output for the normal, reused as a plot
        input
    :param tumour_coverage: coverage output for the tumour, passed to the
        plot as ``tumour``
    :param chromosomes: forwarded to the coverage jobs and the plot
    :param bins: forwarded to the coverage jobs
    :param mapping_qual_threshold: forwarded to the coverage jobs
    :param single_node: forwarded to the coverage jobs
    :return: the populated ``pypeliner.workflow.Workflow``
    """
    workflow = pypeliner.workflow.Workflow()

    # Normal first, then tumour: this is the job registration order.
    coverage_jobs = (
        ('coverage_normal_data', normal_bam, normal_coverage),
        ('coverage_tumour_data', tumour_bam, tumour_coverage),
    )
    for job_name, bam_path, coverage_path in coverage_jobs:
        workflow.subworkflow(
            name=job_name,
            func=get_coverage_data,
            args=(
                mgd.InputFile(bam_path),
                mgd.OutputFile(coverage_path),
                refdir,
                chromosomes,
                mapping_qual_threshold,
                bins,
            ),
            kwargs={'single_node': single_node},
        )

    workflow.transform(
        name='generate_genome_wide_plot',
        ctx=helpers.get_default_ctx(memory=10),
        func="wgs.workflows.sample_qc.tasks.genome_wide",
        args=(
            sample_id,
            mgd.InputFile(roh),
            mgd.InputFile(germline_calls),
            mgd.InputFile(normal_coverage),
            chromosomes,
            mgd.OutputFile(genome_wide_plot),
        ),
        kwargs={
            "titan": mgd.InputFile(titan),
            "somatic": mgd.InputFile(somatic_calls),
            "remixt": mgd.InputFile(remixt),
            "tumour": mgd.InputFile(tumour_coverage),
            "breakpoints": mgd.InputFile(breakpoints_consensus),
        },
    )

    return workflow