Example #1
def align_sample_no_split(fastq_1,
                          fastq_2,
                          out_file,
                          samtools_flagstat,
                          sample_id,
                          lane_id,
                          sample_info,
                          refdir,
                          picard_mem=None):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='align_bwa_mem',
                       ctx=helpers.get_default_ctx(memory=8,
                                                   walltime='48:00',
                                                   ncpus='8',
                                                   disk=300),
                       func='wgs.workflows.alignment.tasks.align_bwa_mem',
                       args=(
                           pypeliner.managed.InputFile(fastq_1),
                           pypeliner.managed.InputFile(fastq_2),
                           ref_genome,
                           pypeliner.managed.TempOutputFile('aligned.bam'),
                           '8',
                           sample_info,
                       ),
                       kwargs={
                           'sample_id': sample_id,
                           'lane_id': lane_id,
                       })

    workflow.transform(name='sort',
                       ctx=helpers.get_default_ctx(memory=8,
                                                   walltime='48:00',
                                                   ncpus='8',
                                                   disk=300),
                       func='wgs.workflows.alignment.tasks.bam_sort',
                       args=(pypeliner.managed.TempInputFile('aligned.bam'),
                             pypeliner.managed.OutputFile(out_file),
                             pypeliner.managed.TempSpace('bam_sort_tempdir')),
                       kwargs={
                           'threads': '8',
                           'mem': '{}G'.format(picard_mem)
                       })

    workflow.transform(
        name='index_and_flagstat',
        func='wgs.workflows.alignment.tasks.index_and_flagstat',
        ctx=helpers.get_default_ctx(memory=4, walltime='24:00', disk=200),
        args=(pypeliner.managed.InputFile(out_file),
              pypeliner.managed.OutputFile(out_bai),
              pypeliner.managed.OutputFile(samtools_flagstat)),
    )

    return workflow
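
A minimal usage sketch for the factory above (not part of the pipeline source): it assumes the wgs package is importable, /refdir is a prepared wgs reference directory, and that sample_info carries the read-group fields expected by align_bwa_mem; all paths, ids and settings below are placeholders.

import pypeliner
import pypeliner.app

# Assumed minimal pypeliner options; real runs pass the parsed CLI argument dict.
pyp = pypeliner.app.Pypeline(config={'tmpdir': 'pypeliner_tmp'})

workflow = align_sample_no_split(
    'SAMPLE1_L001_R1.fastq.gz',   # placeholder fastq paths
    'SAMPLE1_L001_R2.fastq.gz',
    'SAMPLE1_L001.bam',
    'SAMPLE1_L001_flagstat.txt',
    'SAMPLE1',
    'L001',
    {'ID': 'SAMPLE1_L001', 'SM': 'SAMPLE1', 'PL': 'ILLUMINA'},  # assumed sample_info layout
    '/refdir',
    picard_mem=8,  # pass explicitly so bam_sort receives a concrete '8G' value
)
pyp.run(workflow)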
Example #2
def breakpoint_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir,
                                    '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir,
                                  '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir,
                              '{sample_id}_filtered_consensus_calls.csv.gz')

    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("tumour.bam",
                            'sample_id',
                            fnames=tumours,
                            extensions=['.bai'],
                            axes_origin=[]),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]),
              mgd.OutputFile('destruct_raw_breakpoints',
                             'sample_id',
                             template=destruct_raw_breakpoints),
              mgd.OutputFile('destruct_raw_library',
                             'sample_id',
                             template=destruct_raw_library),
              mgd.OutputFile('destruct_breakpoints',
                             'sample_id',
                             template=destruct_breakpoints),
              mgd.OutputFile('destruct_library',
                             'sample_id',
                             template=destruct_library),
              mgd.OutputFile('destruct_reads',
                             'sample_id',
                             template=destruct_reads),
              mgd.InputInstance('sample_id'), refdir_paths['reference'],
              refdir_paths['refdata_destruct'], refdir_paths['gtf'],
              refdir_paths['blacklist_destruct']),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id', ),
        args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf), ),
        kwargs={
            'tumour_bam':
            mgd.InputFile("tumour.bam",
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai'],
                          axes_origin=[]),
            'normal_bam':
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            'single_node':
            single_node
        },
    )

    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam",
                              'sample_id',
                              fnames=tumours,
                              extensions=['.bai'],
                              axes_origin=[]),
                mgd.InputFile("normal.bam",
                              'sample_id',
                              fnames=normals,
                              extensions=['.bai'],
                              axes_origin=[]),
                mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile('destruct_breakpoints',
                            'sample_id',
                            template=destruct_breakpoints),
              mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
              mgd.OutputFile('consensus_calls',
                             'sample_id',
                             template=parsed_csv,
                             extensions=['.yaml']), chromosomes),
    )

    filenames = [
        destruct_breakpoints, destruct_library, destruct_raw_breakpoints,
        destruct_raw_library, destruct_reads, lumpy_vcf, parsed_csv
    ]

    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func=helpers.generate_and_upload_metadata,
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'breakpoint_calling'
                           }
                       })

    pyp.run(workflow)
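
The entry point above is driven by a plain argument dictionary; a hedged sketch of the keys it reads is shown below. The input YAML layout is inferred from get_values_from_input and is an assumption, as are all concrete values.

args = {
    'input_yaml': 'input.yaml',   # e.g. {SAMPLE1: {tumour: tumour.bam, normal: normal.bam}}
    'out_dir': 'results/breakpoints',
    'refdir': '/refdir',
    'single_node': False,
    'svaba': False,
    # ...plus the pypeliner scheduler options consumed by Pypeline(config=args)
}
breakpoint_calling_workflow(args)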
Example #3
def call_germlines_only(samples,
                        normals,
                        museq_ss_vcf,
                        samtools_germline_vcf,
                        roh_calls,
                        museq_single_pdf,
                        refdir,
                        single_node=False):
    museq_ss_vcf = dict([(sampid, museq_ss_vcf[sampid]) for sampid in samples])
    museq_single_pdf = dict([(sampid, museq_single_pdf[sampid])
                             for sampid in samples])
    samtools_germline_vcf = dict([(sampid, samtools_germline_vcf[sampid])
                                  for sampid in samples])
    roh_calls = dict([(sampid, roh_calls[sampid]) for sampid in samples])

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(
            mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
            mgd.OutputFile('museq_single_pdf',
                           'sample_id',
                           fnames=museq_single_pdf),
            paths_refdir['reference'],
            chromosomes,
        ),
        kwargs={
            'tumour_bam':
            None,
            'normal_bam':
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            'single_node':
            single_node,
            'germline_refdata':
            paths_refdir['germline_portrait_ref'],
            'thousand_genomes':
            paths_refdir['thousand_genomes'],
            'dbsnp':
            paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func=
        'wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("roh_calls.csv", 'sample_id', fnames=roh_calls),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]), paths_refdir['reference'],
              chromosomes),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_germlines_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_ss_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("samtools_germlines_anno.vcf.gz",
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=samtools_germline_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    return workflow
Example #4
def call_variants(samples,
                  somatic_calls,
                  somatic_snpeff,
                  somatic_ma,
                  somatic_ids,
                  indel_calls,
                  indel_snpeff,
                  indel_ma,
                  indel_ids,
                  germline_calls,
                  germline_snpeff,
                  germline_ma,
                  germline_ids,
                  tumours,
                  normals,
                  museq_vcf,
                  museq_ss_vcf,
                  samtools_germlines_vcf,
                  roh_calls,
                  strelka_snv_vcf,
                  strelka_indel_vcf,
                  museq_paired_pdf,
                  museq_single_pdf,
                  refdir,
                  single_node=False,
                  is_exome=False):
    strelka_snv_vcf = dict([(sampid, strelka_snv_vcf[sampid])
                            for sampid in samples])
    strelka_indel_vcf = dict([(sampid, strelka_indel_vcf[sampid])
                              for sampid in samples])
    museq_vcf = dict([(sampid, museq_vcf[sampid]) for sampid in samples])
    museq_ss_vcf = dict([(sampid, museq_ss_vcf[sampid]) for sampid in samples])
    samtools_germlines_vcf = dict([(sampid, samtools_germlines_vcf[sampid])
                                   for sampid in samples])
    roh_calls = dict([(sampid, roh_calls[sampid]) for sampid in samples])

    museq_paired_pdf = dict([(sampid, museq_paired_pdf[sampid])
                             for sampid in samples])
    museq_single_pdf = dict([(sampid, museq_single_pdf[sampid])
                             for sampid in samples])

    somatic_calls = dict([(sampid, somatic_calls[sampid])
                          for sampid in samples])
    somatic_snpeff = dict([(sampid, somatic_snpeff[sampid])
                           for sampid in samples])
    somatic_ma = dict([(sampid, somatic_ma[sampid]) for sampid in samples])
    somatic_ids = dict([(sampid, somatic_ids[sampid]) for sampid in samples])

    indel_calls = dict([(sampid, indel_calls[sampid]) for sampid in samples])
    indel_snpeff = dict([(sampid, indel_snpeff[sampid]) for sampid in samples])
    indel_ma = dict([(sampid, indel_ma[sampid]) for sampid in samples])
    indel_ids = dict([(sampid, indel_ids[sampid]) for sampid in samples])

    germline_calls = dict([(sampid, germline_calls[sampid])
                           for sampid in samples])
    germline_snpeff = dict([(sampid, germline_snpeff[sampid])
                            for sampid in samples])
    germline_ma = dict([(sampid, germline_ma[sampid]) for sampid in samples])
    germline_ids = dict([(sampid, germline_ids[sampid]) for sampid in samples])

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("museq_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_paired_pdf',
                             'sample_id',
                             fnames=museq_paired_pdf),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'tumour_bam':
            mgd.InputFile("tumour.bam",
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai'],
                          axes_origin=[]),
            'normal_bam':
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            'single_node':
            single_node,
        })

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_single_pdf',
                             'sample_id',
                             fnames=museq_single_pdf),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'tumour_bam':
            None,
            'normal_bam':
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            'single_node':
            single_node,
            'germline_refdata':
            paths_refdir['germline_portrait_ref'],
            'thousand_genomes':
            paths_refdir['thousand_genomes'],
            'dbsnp':
            paths_refdir['dbsnp'],
        })

    workflow.subworkflow(
        name="samtools_germline",
        func=
        'wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id', ),
        args=(mgd.TempOutputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("roh_calls.csv.gz", 'sample_id',
                             fnames=roh_calls),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]), paths_refdir['reference'],
              chromosomes),
        kwargs={
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id', ),
        args=(mgd.InputFile('normal_bam',
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai']),
              mgd.InputFile('tumour_bam',
                            'sample_id',
                            fnames=tumours,
                            extensions=['.bai']),
              mgd.TempOutputFile('strelka_indel.vcf.gz', 'sample_id'),
              mgd.TempOutputFile('strelka_snv.vcf.gz', 'sample_id'),
              paths_refdir['reference'], chromosomes),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    workflow.subworkflow(
        name="annotate_paired_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_snv_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_vcf), paths_refdir['snpeff_config'],
              paths_refdir['mutation_assessor'], paths_refdir['dbsnp'],
              paths_refdir['thousand_genomes'], paths_refdir['cosmic'],
              paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_museq",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("museq_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile('museq_germlines_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=museq_ss_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_germline_samtools",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("samtools_germlines.vcf.gz", 'sample_id'),
              mgd.OutputFile("samtools_germlines_ann.vcf.gz",
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=samtools_germlines_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("strelka_snv.vcf.gz", 'sample_id'),
              mgd.OutputFile('strelka_snv_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=strelka_snv_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="annotate_strelka_indel",
        func='wgs.workflows.vcf_annotation.create_annotation_workflow',
        axes=('sample_id', ),
        args=(mgd.TempInputFile("strelka_indel.vcf.gz", 'sample_id'),
              mgd.OutputFile('strelka_indel_ann.vcf.gz',
                             'sample_id',
                             extensions=['.csi', '.tbi'],
                             fnames=strelka_indel_vcf),
              paths_refdir['snpeff_config'], paths_refdir['mutation_assessor'],
              paths_refdir['dbsnp'], paths_refdir['thousand_genomes'],
              paths_refdir['cosmic'], paths_refdir['blacklist']),
        kwargs={
            'vcftools_docker': config.containers('vcftools'),
            'snpeff_docker': config.containers('vcftools'),
        })

    workflow.subworkflow(
        name="consensus_calling",
        func=
        'wgs.workflows.variant_calling_consensus.create_consensus_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile("museq_germlines_ann.vcf.gz",
                          'sample_id',
                          fnames=museq_ss_vcf),
            mgd.InputFile("museq_snv_ann.vcf.gz",
                          'sample_id',
                          fnames=museq_vcf),
            mgd.InputFile("strelka_snv_ann.vcf.gz",
                          'sample_id',
                          fnames=strelka_snv_vcf),
            mgd.InputFile("strelka_indel_ann.vcf.gz",
                          'sample_id',
                          fnames=strelka_indel_vcf),
            mgd.OutputFile('somatic_csv', 'sample_id', fnames=somatic_calls),
            mgd.OutputFile('somatic_snpeff',
                           'sample_id',
                           fnames=somatic_snpeff),
            mgd.OutputFile('somatic_ma', 'sample_id', fnames=somatic_ma),
            mgd.OutputFile('somatic_ids', 'sample_id', fnames=somatic_ids),
            mgd.OutputFile('indel_csv', 'sample_id', fnames=indel_calls),
            mgd.OutputFile('indel_snpeff', 'sample_id', fnames=indel_snpeff),
            mgd.OutputFile('indel_ma', 'sample_id', fnames=indel_ma),
            mgd.OutputFile('indel_ids', 'sample_id', fnames=indel_ids),
            mgd.OutputFile('germline_csv', 'sample_id', fnames=germline_calls),
            mgd.OutputFile('germline_snpeff',
                           'sample_id',
                           fnames=germline_snpeff),
            mgd.OutputFile('germline_ma', 'sample_id', fnames=germline_ma),
            mgd.OutputFile('germline_ids', 'sample_id', fnames=germline_ids),
            refdir,
        ),
    )

    return workflow
Example #5
def create_somatic_calling_workflow(samples,
                                    tumours,
                                    normals,
                                    museq_vcf,
                                    museq_maf,
                                    museq_paired_pdf,
                                    strelka_snv_vcf,
                                    strelka_snv_maf,
                                    strelka_indel_vcf,
                                    strelka_indel_maf,
                                    mutect_vcf,
                                    mutect_maf,
                                    somatic_consensus_maf,
                                    refdir,
                                    normal_ids,
                                    tumour_ids,
                                    single_node=False,
                                    is_exome=False):
    strelka_snv_vcf = dict([(sampid, strelka_snv_vcf[sampid])
                            for sampid in samples])
    strelka_indel_vcf = dict([(sampid, strelka_indel_vcf[sampid])
                              for sampid in samples])
    strelka_snv_maf = dict([(sampid, strelka_snv_maf[sampid])
                            for sampid in samples])
    strelka_indel_maf = dict([(sampid, strelka_indel_maf[sampid])
                              for sampid in samples])

    museq_vcf = dict([(sampid, museq_vcf[sampid]) for sampid in samples])
    museq_maf = dict([(sampid, museq_maf[sampid]) for sampid in samples])
    museq_paired_pdf = dict([(sampid, museq_paired_pdf[sampid])
                             for sampid in samples])

    mutect_vcf = dict([(sampid, mutect_vcf[sampid]) for sampid in samples])
    mutect_maf = dict([(sampid, mutect_maf[sampid]) for sampid in samples])

    somatic_consensus_maf = dict([(sampid, somatic_consensus_maf[sampid])
                                  for sampid in samples])

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.setobj(obj=mgd.TempOutputObj('normal_id',
                                          'sample_id',
                                          axes_origin=[]),
                    value={v: normal_ids[v]
                           for v in samples})

    workflow.setobj(obj=mgd.TempOutputObj('tumour_id',
                                          'sample_id',
                                          axes_origin=[]),
                    value={v: tumour_ids[v]
                           for v in samples})

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(
            mgd.OutputFile('museq_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=museq_vcf),
            mgd.OutputFile('museq_snv_ann.maf', 'sample_id', fnames=museq_maf),
            mgd.OutputFile('museq_paired_pdf',
                           'sample_id',
                           fnames=museq_paired_pdf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
        ),
        kwargs={
            'normal_id':
            mgd.TempInputObj('normal_id', 'sample_id'),
            'tumour_id':
            mgd.TempInputObj('tumour_id', 'sample_id'),
            'tumour_bam':
            mgd.InputFile("tumour.bam",
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai'],
                          axes_origin=[]),
            'normal_bam':
            mgd.InputFile("normal.bam",
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai'],
                          axes_origin=[]),
            'single_node':
            single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('normal_bam',
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam',
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai']),
            mgd.OutputFile('strelka_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_snv_vcf),
            mgd.OutputFile('strelka_snv_ann.maf',
                           'sample_id',
                           fnames=strelka_snv_maf),
            mgd.OutputFile('strelka_indel_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_indel_vcf),
            mgd.OutputFile('strelka_indel_ann.maf',
                           'sample_id',
                           fnames=strelka_indel_maf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    workflow.subworkflow(
        name="mutect",
        func='wgs.workflows.mutect.create_mutect_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('normal_bam',
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam',
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai']),
            mgd.OutputFile('mutect_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=mutect_vcf),
            mgd.OutputFile('mutect_snv_ann.maf',
                           'sample_id',
                           fnames=mutect_maf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        },
    )

    workflow.subworkflow(
        name="somatic_consensus",
        func=
        'wgs.workflows.somatic_calling_consensus.create_somatic_consensus_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('mutect_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=mutect_vcf),
            mgd.InputFile('strelka_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=strelka_snv_vcf),
            mgd.InputFile('strelka_indel_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=strelka_indel_vcf),
            mgd.InputFile('museq_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=museq_vcf),
            mgd.OutputFile("somatic_consensus.maf",
                           'sample_id',
                           fnames=somatic_consensus_maf),
            chromosomes,
            paths_refdir['reference_vep'],
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
    )

    return workflow
Example #6
def create_consensus_workflow(museq_germline, museq_snv, strelka_snv,
                              strelka_indel, somatic_calls, somatic_snpeff,
                              somatic_ma, somatic_ids, indel_calls,
                              indel_snpeff, indel_ma, indel_ids,
                              germline_calls, germline_snpeff, germline_ma,
                              germline_ids, refdir):
    params = config.default_params('variant_calling')
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_museq_germlines',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_germline, extensions=['.csi', '.tbi']),
              mgd.OutputFile(germline_calls, extensions=['.yaml']),
              mgd.OutputFile(germline_snpeff, extensions=['.yaml']),
              mgd.OutputFile(germline_ma, extensions=['.yaml']),
              mgd.OutputFile(germline_ids,
                             extensions=['.yaml']), params["parse_museq"],
              chromosomes, mgd.TempSpace("tempdir_parse_germlines")),
    )

    workflow.transform(
        name='parse_strelka_indel',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_indel, extensions=['.csi', '.tbi']),
              mgd.OutputFile(indel_calls, extensions=['.yaml']),
              mgd.OutputFile(indel_snpeff, extensions=['.yaml']),
              mgd.OutputFile(indel_ma, extensions=['.yaml']),
              mgd.OutputFile(indel_ids,
                             extensions=['.yaml']), params["parse_strelka"],
              chromosomes, mgd.TempSpace("tempdir_strelka_indel")),
    )

    workflow.transform(
        name='parse_museq_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(museq_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('museq_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_snpeff.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('museq_ids.csv',
                                 extensions=['.yaml']), params["parse_museq"],
              chromosomes, mgd.TempSpace("tempdir_parse_museq_snv")),
    )

    workflow.transform(
        name='parse_strelka_snv',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.parse_vcf',
        args=(mgd.InputFile(strelka_snv, extensions=['.csi', '.tbi']),
              mgd.TempOutputFile('strelka_snv.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_snpeff.csv',
                                 extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ma.csv', extensions=['.yaml']),
              mgd.TempOutputFile('strelka_snv_ids.csv', extensions=['.yaml']),
              params["parse_strelka"], chromosomes,
              mgd.TempSpace("tempdir_parse_strelka_snv")),
    )

    workflow.transform(
        name='merge_snvs',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_snv.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_calls, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_snpeff',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_snpeff.csv',
                                  extensions=['.yaml']),
                mgd.TempInputFile('museq_snpeff.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_snpeff, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ma',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ma.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ma.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ma, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    workflow.transform(
        name='merge_ids',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.variant_calling_consensus.tasks.merge_overlap',
        args=(
            [
                mgd.TempInputFile('strelka_snv_ids.csv', extensions=['.yaml']),
                mgd.TempInputFile('museq_ids.csv', extensions=['.yaml'])
            ],
            mgd.OutputFile(somatic_ids, extensions=['.yaml']),
        ),
        kwargs={'on': ['chrom', 'pos']})

    return workflow
Example #7
def get_coverage_data(
        input_bam, output, refdir, chromosomes,
        mapping_qual, bins, single_node=False
):
    reference = config.refdir_data(refdir)['paths']['reference']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )

    else:
        workflow.setobj(
            obj=mgd.OutputChunks('chromosome'),
            value=chromosomes
        )
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            axes=('chromosome',),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            axes=('chromosome',),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )

        workflow.transform(
            name='merge_data',
            func='wgs.utils.csvutils.concatenate_csv',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
                mgd.OutputFile(output),
            )
        )

    return workflow
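
Like the postprocessing variant in Example #10, this factory is meant to be attached as a subworkflow of a parent pipeline; the sketch below assumes the caller supplies the chromosome list, mapping-quality cutoff and bin setting, and all literal values are illustrative.

import pypeliner.workflow
import pypeliner.managed as mgd

parent = pypeliner.workflow.Workflow()
parent.subworkflow(
    name='coverage_tumour_data',
    func=get_coverage_data,
    args=(
        mgd.InputFile('tumour.bam'),            # placeholder bam
        mgd.OutputFile('tumour_coverage.csv'),  # placeholder output
        '/refdir',
        ['1', '2', 'X'],                        # chromosomes
        20,                                     # mapping_qual
        1000,                                   # bins
    ),
    kwargs={'single_node': False},
)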
Example #8
def collect_bam_metrics(bam,
                        markdups_metrics,
                        sample_id,
                        refdir,
                        metrics,
                        picard_insert_metrics,
                        picard_insert_pdf,
                        flagstat_metrics,
                        picard_gc_metrics,
                        picard_gc_summary,
                        picard_gc_pdf,
                        picard_wgs_metrics,
                        bam_tdf,
                        picard_mem=8):
    '''
    Collect alignment metrics for a single bam:
    1. picard insert size metrics (plus samtools flagstat)
    2. picard GC bias metrics
    3. picard wgs metrics
    4. igvtools TDF coverage track
    and merge them into a single per-sample metrics csv.

    :param bam: input bam file
    :param markdups_metrics: mark duplicates metrics produced during alignment
    :param sample_id: sample id written into the merged metrics table
    :param refdir: wgs reference data directory
    :param metrics: output csv of merged metrics
    :param picard_mem: memory in GB given to the picard tasks
    '''

    ref_genome = config.refdir_data(refdir)['paths']['reference']

    picard_wgs_params = config.default_params('alignment')['picard_wgs_params']

    reftype = config.refdir_data(refdir)['params']['reference_type']

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="calc_picard_insert_metrics",
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        func='wgs.workflows.alignment.tasks.bam_collect_insert_metrics',
        args=(
            mgd.InputFile(bam),
            mgd.OutputFile(flagstat_metrics),
            mgd.OutputFile(picard_insert_metrics),
            mgd.OutputFile(picard_insert_pdf),
            mgd.TempSpace('picard_insert'),
        ),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_gc_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_gc_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_gc_metrics),
              mgd.OutputFile(picard_gc_summary), mgd.OutputFile(picard_gc_pdf),
              mgd.TempSpace('picard_gc')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name="calc_picard_wgs_metrics",
        func='wgs.workflows.alignment.tasks.bam_collect_wgs_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='72:00', disk=400),
        args=(mgd.InputFile(bam), ref_genome,
              mgd.OutputFile(picard_wgs_metrics), picard_wgs_params,
              mgd.TempSpace('picard_wgs')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='igvtools_tdf',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.get_igvtools_count',
        args=(pypeliner.managed.InputFile(bam),
              pypeliner.managed.OutputFile(bam_tdf), reftype),
    )

    workflow.transform(
        name='collect_metrics',
        func='wgs.workflows.alignment.tasks.bam_collect_all_metrics',
        ctx=helpers.get_default_ctx(memory=10, walltime='4:00', disk=400),
        args=(mgd.InputFile(flagstat_metrics),
              mgd.InputFile(picard_insert_metrics),
              mgd.InputFile(picard_wgs_metrics),
              mgd.InputFile(markdups_metrics),
              mgd.OutputFile(metrics, extensions=['.yaml']), sample_id),
        kwargs={
            'main_dtypes': dtypes()['metrics'],
            'insert_dtypes': dtypes()['insert_metrics']
        })

    return workflow
Example #9
def align_sample_split(fastq_1,
                       fastq_2,
                       out_file,
                       samtools_flagstat,
                       sample_id,
                       lane_id,
                       sample_info,
                       refdir,
                       picard_mem=2):
    ref_genome = config.refdir_data(refdir)['paths']['reference']

    split_size = config.default_params('alignment')['split_size']

    out_bai = out_file + '.bai'

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_fastq_1',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_1),
            pypeliner.managed.TempOutputFile('read_1', 'split'),
            split_size,
        ),
    )

    workflow.transform(
        name='split_fastq_2',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='24:00',
        ),
        func='biowrappers.components.io.fastq.tasks.split_fastq',
        args=(
            pypeliner.managed.InputFile(fastq_2),
            pypeliner.managed.TempOutputFile('read_2', 'split',
                                             axes_origin=[]),
            split_size,
        ),
    )

    workflow.transform(name='align_bwa_mem',
                       axes=('split', ),
                       ctx=helpers.get_default_ctx(
                           memory=8,
                           walltime='16:00',
                           ncpus=8,
                       ),
                       func='wgs.workflows.alignment.tasks.align_bwa_mem',
                       args=(
                           pypeliner.managed.TempInputFile('read_1', 'split'),
                           pypeliner.managed.TempInputFile('read_2', 'split'),
                           ref_genome,
                           pypeliner.managed.TempOutputFile(
                               'aligned.bam', 'split'),
                           '8',
                           sample_info,
                       ),
                       kwargs={
                           'sample_id': sample_id,
                           'lane_id': lane_id,
                       })

    workflow.transform(
        name='sort',
        axes=('split', ),
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        func='wgs.workflows.alignment.tasks.bam_sort',
        args=(pypeliner.managed.TempInputFile('aligned.bam', 'split'),
              pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
              pypeliner.managed.TempSpace('bam_sort_by_split', 'split')),
        kwargs={'mem': '{}G'.format(picard_mem)})

    workflow.transform(
        name='merge',
        ctx=helpers.get_default_ctx(
            memory=8,
            walltime='72:00',
        ),
        func="wgs.workflows.alignment.tasks.merge_bams",
        args=(pypeliner.managed.TempInputFile('sorted.bam', 'split'),
              pypeliner.managed.OutputFile(out_file),
              pypeliner.managed.TempSpace('bam_merge_by_split')),
        kwargs={'mem': picard_mem})

    workflow.commandline(
        name='index',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        args=('samtools', 'index', pypeliner.managed.InputFile(out_file),
              pypeliner.managed.OutputFile(out_bai)),
    )

    workflow.commandline(
        name='flagstat',
        ctx=helpers.get_default_ctx(
            memory=4,
            walltime='16:00',
        ),
        args=('samtools', 'flagstat', pypeliner.managed.InputFile(out_file),
              '>', pypeliner.managed.OutputFile(samtools_flagstat)),
    )

    return workflow
Example #10
def create_postprocessing_workflow(normal_bam,
                                   tumour_bam,
                                   titan,
                                   remixt,
                                   breakpoints_consensus,
                                   roh,
                                   germline_calls,
                                   somatic_calls,
                                   circos_plot_remixt,
                                   circos_plot_titan,
                                   genome_wide_plot,
                                   refdir,
                                   sample_id,
                                   single_node=False):

    refdir_paths = config.refdir_data(refdir)['paths']
    refdir_params = config.refdir_data(refdir)['params']

    ideogram = refdir_paths["ideogram"]

    titan_calls = titan[sample_id]
    remixt_calls = remixt[sample_id]
    sv_calls = breakpoints_consensus[sample_id]
    roh_calls = roh[sample_id]
    germline_vcf = germline_calls[sample_id]
    somatic_calls = somatic_calls[sample_id]
    chromosomes = refdir_params['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.subworkflow(name='coverage_normal_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(normal_bam),
                             mgd.TempOutputFile('normal_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.subworkflow(name='coverage_tumour_data',
                         func=get_coverage_data,
                         args=(
                             mgd.InputFile(tumour_bam),
                             mgd.TempOutputFile('tumour_coverage'),
                             refdir,
                         ),
                         kwargs={'single_node': single_node})

    workflow.transform(
        name='parse_roh',
        ctx=helpers.get_default_ctx(memory=5),
        func="wgs.workflows.postprocessing.tasks.parse_roh",
        args=(
            mgd.InputFile(roh_calls),
            mgd.TempOutputFile("ROH_parsed"),
        ),
    )

    if remixt_calls:

        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
            kwargs={
                "remixt": mgd.InputFile(remixt_calls),
                "remixt_label": sample_id
            })
        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.TempOutputFile(circos_plot_remixt),
                mgd.TempOutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={
                'docker_image': config.containers('circos'),
                'remixt_calls': mgd.InputFile(remixt_calls)
            },
        )
    else:

        workflow.transform(
            name='generate_genome_wide_plot',
            ctx=helpers.get_default_ctx(memory=10, ),
            func="wgs.workflows.postprocessing.tasks.genome_wide",
            args=(
                mgd.InputFile(titan_calls),
                mgd.TempInputFile("ROH_parsed"),
                mgd.InputFile(germline_vcf),
                mgd.InputFile(somatic_calls),
                mgd.TempInputFile('tumour_coverage'),
                mgd.TempInputFile('normal_coverage'),
                mgd.InputFile(sv_calls),
                mgd.InputFile(ideogram),
                chromosomes,
                mgd.OutputFile(genome_wide_plot),
            ),
        )

        workflow.transform(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10),
            func="wgs.workflows.postprocessing.tasks.circos",
            args=(
                mgd.InputFile(titan_calls),
                sample_id,
                mgd.InputFile(sv_calls),
                mgd.TempOutputFile(circos_plot_remixt),
                mgd.TempOutputFile(circos_plot_titan),
                mgd.TempSpace('circos'),
            ),
            kwargs={'docker_image': config.containers('circos')})

    return workflow
Example #11
def get_coverage_data(input_bam, output, refdir, single_node=False):
    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    chrom_sizes = config.refdir_data(refdir)['paths']['chrom_sizes']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
            ),
            kwargs={'docker_image': config.containers('samtools')},
        )

    else:

        workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.postprocessing.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                mgd.InputFile(chrom_sizes),
            ))
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.postprocessing.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(memory=5),
            axes=('chromosome', ),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                # mgd.InputInstance('chromosome'),
                # refdir_paths['reference'],
            ),
            kwargs={'docker_image': config.containers('samtools')})
        workflow.transform(name='merge_data',
                           func='wgs.utils.csvutils.concatenate_csv',
                           ctx=helpers.get_default_ctx(memory=5),
                           args=(
                               mgd.TempInputFile('per_interval.txt',
                                                 'chromosome',
                                                 axes_origin=[]),
                               mgd.OutputFile(output),
                           ))

    return workflow
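
A minimal driver sketch for the coverage helper above, assuming the same pypeliner setup used throughout these examples; the BAM and output paths are placeholders and the args dict is assumed to carry the usual command-line settings (refdir, single_node, scheduler options).

import pypeliner


def run_coverage_example(args):
    # hypothetical driver; the paths below are placeholders, not pipeline defaults
    pyp = pypeliner.app.Pypeline(config=args)

    workflow = get_coverage_data(
        '/data/TUMOUR.bam',              # indexed input BAM (placeholder)
        '/results/TUMOUR_coverage.tsv',  # merged coverage table (placeholder)
        args['refdir'],
        single_node=args.get('single_node', False),
    )

    pyp.run(workflow)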
Example #12
def single_sample_copynumber_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    bams = helpers.get_values_from_input(inputs, 'bam')
    samples = list(bams.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    hmmcopy_raw_dir = os.path.join(cna_outdir, 'hmmcopy')
    bias_pdf = os.path.join(hmmcopy_raw_dir, 'plots', '{sample_id}_bias.pdf')
    correction_pdf = os.path.join(hmmcopy_raw_dir, 'plots',
                                  '{sample_id}_correction.pdf')
    hmmcopy_pdf = os.path.join(hmmcopy_raw_dir, 'plots',
                               '{sample_id}_hmmcopy.pdf')
    correction_table = os.path.join(hmmcopy_raw_dir,
                                    '{sample_id}_correctreads_with_state.txt')
    pygenes = os.path.join(hmmcopy_raw_dir, '{sample_id}_hmmcopy.seg.pygenes')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='hmmcopy',
        func=hmmcopy.create_hmmcopy_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("sample.bam",
                            'sample_id',
                            fnames=bams,
                            extensions=['.bai']),
              mgd.InputInstance('sample_id'),
              mgd.OutputFile('bias', 'sample_id', template=bias_pdf),
              mgd.OutputFile('correction',
                             'sample_id',
                             template=correction_pdf),
              mgd.OutputFile('hmmcopy', 'sample_id', template=hmmcopy_pdf),
              mgd.OutputFile('correction_table',
                             'sample_id',
                             template=correction_table),
              mgd.OutputFile('pygenes', 'sample_id', template=pygenes),
              chromosomes, refdir_paths['map_wig'], refdir_paths['gc_wig'],
              refdir_paths['gtf']),
    )

    filenames = [
        bias_pdf,
        correction_pdf,
        hmmcopy_pdf,
        correction_table,
        pygenes,
    ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'single_sample_copynumber_calling'
                           }
                       })

    pyp.run(workflow)
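
The exact schema of the input YAML is not shown in these snippets; judging from helpers.get_values_from_input(inputs, 'bam'), a plausible shape of the loaded data is sketched below. The sample ids and paths are invented, and the comprehension only mirrors the assumed behaviour of the helper.

# Hypothetical result of helpers.load_yaml(args['input_yaml']) for this
# workflow; sample ids and paths are placeholders.
inputs = {
    'SAMPLE_001': {'bam': '/data/SAMPLE_001.bam'},
    'SAMPLE_002': {'bam': '/data/SAMPLE_002.bam'},
}

# helpers.get_values_from_input(inputs, 'bam') is assumed to collapse this
# to one value per sample id:
bams = {sample_id: entry['bam'] for sample_id, entry in inputs.items()}
samples = list(bams.keys())  # ['SAMPLE_001', 'SAMPLE_002']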
Example #13
def sample_qc_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    normal_only = args['normal_only']
    samples = list(inputs.keys())

    # inputs
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']
    files = make_inputs(inputs, normal_only=normal_only)

    # outputs
    out_dir = args['out_dir']
    normal_coverage = os.path.join(out_dir, '{sample_id}',
                                   '{sample_id}_normal_coverage.tsv')
    genome_wide_plot = os.path.join(out_dir, '{sample_id}',
                                    '{sample_id}_genome_wide.pdf')

    if not normal_only:
        circos_plot_remixt = os.path.join(out_dir, '{sample_id}',
                                          '{sample_id}_circos_remixt.pdf')
        circos_plot_titan = os.path.join(out_dir, '{sample_id}',
                                         '{sample_id}_circos_titan.pdf')
        tumour_coverage = os.path.join(out_dir, '{sample_id}',
                                       '{sample_id}_tumour_coverage.tsv')

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if normal_only:
        workflow.subworkflow(
            name="normal_sample_qc",
            func=sample_qc.create_sample_qc_workflow_normal_only,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'), args["refdir"],
                  mgd.InputFile('normal.bam',
                                'sample_id',
                                fnames=files["normal"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls',
                                'sample_id',
                                fnames=files["germline"]),
                  mgd.OutputFile('genome_wide_plot.pdf',
                                 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov',
                                 'sample_id',
                                 template=normal_coverage), chromosomes,
                  args['bins'], args['mapping_qual_threshold']),
        )
        outputted_filenames = helpers.expand_list(
            [normal_coverage, genome_wide_plot], samples, "sample_id")
    else:
        workflow.subworkflow(
            name="sample_qc",
            func=sample_qc.create_sample_qc_workflow,
            ctx=helpers.get_default_ctx(),
            axes=('sample_id', ),
            args=(mgd.InputInstance('sample_id'), args["refdir"],
                  mgd.InputFile('normal.bam',
                                'sample_id',
                                fnames=files["normal"]),
                  mgd.InputFile('tumour.bam',
                                'sample_id',
                                fnames=files["tumor"]),
                  mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                  mgd.InputFile('remixt', 'sample_id', fnames=files["remixt"]),
                  mgd.InputFile('breakpoints_consensus',
                                'sample_id',
                                fnames=files["breakpoints"]),
                  mgd.InputFile('roh', 'sample_id', fnames=files["roh"]),
                  mgd.InputFile('germline_calls',
                                'sample_id',
                                fnames=files["germline"]),
                  mgd.InputFile('somatic_calls',
                                'sample_id',
                                fnames=files["somatic"]),
                  mgd.OutputFile('genome_wide_plot.pdf',
                                 'sample_id',
                                 template=genome_wide_plot),
                  mgd.OutputFile('normcov',
                                 'sample_id',
                                 template=normal_coverage),
                  mgd.OutputFile('tumcov',
                                 'sample_id',
                                 template=tumour_coverage), chromosomes,
                  args['bins'], args['mapping_qual_threshold']),
            kwargs={'single_node': args['single_node']})
        workflow.subworkflow(
            name='generate_circos_plot',
            ctx=helpers.get_default_ctx(memory=10, walltime='24:00', disk=400),
            axes=('sample_id', ),
            func=sample_qc.circos_plot,
            args=(
                mgd.InputFile('titan', 'sample_id', fnames=files["titan"]),
                mgd.InputFile('remixt', 'sample_id', fnames=files["remixt"]),
                mgd.InputInstance("sample_id"),
                mgd.InputFile('breakpoints_consensus',
                              'sample_id',
                              fnames=files["breakpoints"]),
                mgd.OutputFile('circos_remixt',
                               'sample_id',
                               template=circos_plot_remixt),
                mgd.OutputFile('circos_titan',
                               'sample_id',
                               template=circos_plot_titan),
            ),
        )
        outputted_filenames = helpers.expand_list([
            circos_plot_remixt, circos_plot_titan, normal_coverage,
            tumour_coverage, genome_wide_plot
        ], samples, "sample_id")

    meta_yaml = os.path.join(out_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(out_dir, 'input.yaml')

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'sample_qc'
                           }
                       })

    pyp.run(workflow)
Example #14
def copynumber_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    run_hmmcopy = args['hmmcopy']
    run_titan = args['titan']
    run_remixt = args['remixt']

    if not run_hmmcopy and not run_titan and not run_remixt:
        run_hmmcopy = True
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')

    titan_outfile = os.path.join(titan_raw_dir,
                                 '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir,
                                  '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir,
                                '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir,
                                     '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    hmmcopy_normal_raw_dir = os.path.join(cna_outdir, 'hmmcopy_normal')
    normal_bias_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    normal_correction_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    normal_hmmcopy_pdf = os.path.join(hmmcopy_normal_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    normal_correction_table = os.path.join(
        hmmcopy_normal_raw_dir, '{sample_id}_correctreads_with_state.txt')
    normal_pygenes = os.path.join(hmmcopy_normal_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    hmmcopy_tumour_raw_dir = os.path.join(cna_outdir, 'hmmcopy_tumour')
    tumour_bias_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                   '{sample_id}_bias.pdf')
    tumour_correction_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                         '{sample_id}_correction.pdf')
    tumour_hmmcopy_pdf = os.path.join(hmmcopy_tumour_raw_dir, 'plots',
                                      '{sample_id}_hmmcopy.pdf')
    tumour_correction_table = os.path.join(
        hmmcopy_tumour_raw_dir, '{sample_id}_correctreads_with_state.txt')
    tumour_pygenes = os.path.join(hmmcopy_tumour_raw_dir,
                                  '{sample_id}_hmmcopy.seg.pygenes')

    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')

    remixt_brk_cn_csv = os.path.join(remixt_outdir,
                                     '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir,
                                  '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(
        remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir,
                                    '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('wgs')))

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam",
                              'sample_id',
                              fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam",
                              'sample_id',
                              fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5',
                               'sample_id',
                               template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv',
                               'sample_id',
                               template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv',
                               'sample_id',
                               template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv',
                               'sample_id',
                               template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv',
                               'sample_id',
                               template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv',
                               'sample_id',
                               template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv',
                               'sample_id',
                               template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']})

    if run_titan:
        workflow.subworkflow(name='titan',
                             func=titan.create_titan_workflow,
                             axes=('sample_id', ),
                             args=(
                                 mgd.InputFile("tumour.bam",
                                               'sample_id',
                                               fnames=tumours,
                                               extensions=['.bai']),
                                 mgd.InputFile("normal.bam",
                                               'sample_id',
                                               fnames=normals,
                                               extensions=['.bai']),
                                 mgd.InputFile("target_list",
                                               'sample_id',
                                               fnames=targets),
                                 mgd.OutputFile('outfile',
                                                'sample_id',
                                                template=titan_outfile),
                                 mgd.OutputFile('params',
                                                'sample_id',
                                                template=titan_params),
                                 mgd.OutputFile('segs',
                                                'sample_id',
                                                template=titan_segs),
                                 mgd.OutputFile('igv_segs',
                                                'sample_id',
                                                template=titan_igv_segs),
                                 mgd.OutputFile('parsed',
                                                'sample_id',
                                                template=titan_parsed),
                                 mgd.OutputFile('plots',
                                                'sample_id',
                                                template=titan_plots),
                                 mgd.OutputFile('tar_outputs',
                                                'sample_id',
                                                template=titan_tar_outputs),
                                 mgd.OutputFile('museq.vcf',
                                                'sample_id',
                                                template=museq_vcf),
                                 mgd.InputInstance('sample_id'),
                                 refdir_paths['reference'],
                                 chromosomes,
                                 refdir_paths['het_positions_titan'],
                                 refdir_paths['map_wig'],
                                 refdir_paths['gc_wig'],
                                 refdir_paths['gtf'],
                             ),
                             kwargs={'single_node': args['single_node']})

    if run_hmmcopy:
        workflow.subworkflow(
            name='hmmcopy_normal',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("normal.bam",
                                'sample_id',
                                fnames=normals,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('normal_bias',
                                 'sample_id',
                                 template=normal_bias_pdf),
                  mgd.OutputFile('normal_correction',
                                 'sample_id',
                                 template=normal_correction_pdf),
                  mgd.OutputFile('normal_hmmcopy',
                                 'sample_id',
                                 template=normal_hmmcopy_pdf),
                  mgd.OutputFile('normal_correction_table',
                                 'sample_id',
                                 template=normal_correction_table),
                  mgd.OutputFile('normal_pygenes',
                                 'sample_id',
                                 template=normal_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

        workflow.subworkflow(
            name='hmmcopy_tumour',
            func=hmmcopy.create_hmmcopy_workflow,
            axes=('sample_id', ),
            args=(mgd.InputFile("tumour.bam",
                                'sample_id',
                                fnames=tumours,
                                extensions=['.bai']),
                  mgd.InputInstance('sample_id'),
                  mgd.OutputFile('tumour_bias',
                                 'sample_id',
                                 template=tumour_bias_pdf),
                  mgd.OutputFile('tumour_correction',
                                 'sample_id',
                                 template=tumour_correction_pdf),
                  mgd.OutputFile('tumour_hmmcopy',
                                 'sample_id',
                                 template=tumour_hmmcopy_pdf),
                  mgd.OutputFile('tumour_correction_table',
                                 'sample_id',
                                 template=tumour_correction_table),
                  mgd.OutputFile('tumour_pygenes',
                                 'sample_id',
                                 template=tumour_pygenes), chromosomes,
                  refdir_paths['map_wig'], refdir_paths['gc_wig'],
                  refdir_paths['gtf']),
        )

    filenames = []
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]
    if run_hmmcopy:
        filenames += [
            normal_bias_pdf, normal_correction_pdf, normal_hmmcopy_pdf,
            normal_correction_table, normal_pygenes, tumour_bias_pdf,
            tumour_correction_pdf, tumour_hmmcopy_pdf, tumour_correction_table,
            tumour_pygenes
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data':
                           helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'copynumber_calling'
                           }
                       })

    pyp.run(workflow)
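
helpers.expand_list turns the templated paths above into concrete per-sample filenames for the metadata step. Its implementation is not shown here; the sketch below is only a functionally plausible stand-in that substitutes each sample id into the '{sample_id}' placeholder.

def expand_list_sketch(templates, samples, key):
    # assumed behaviour of wgs.utils.helpers.expand_list, not the library code:
    # fill the named placeholder in every template with every sample id
    return [
        template.format(**{key: sample})
        for template in templates
        for sample in samples
    ]

# expand_list_sketch(['/out/{sample_id}/{sample_id}_titan_segs.csv.gz'],
#                    ['SAMPLE_001'], 'sample_id')
# -> ['/out/SAMPLE_001/SAMPLE_001_titan_segs.csv.gz']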
Example #15
def create_germline_calling_workflow(
        samples,
        normals,
        museq_ss_vcf,
        museq_ss_maf,
        museq_single_pdf,
        samtools_germline_vcf,
        samtools_germline_maf,
        roh_calls,
        freebayes_germline_vcf,
        freebayes_germline_maf,
        rtg_germline_vcf,
        rtg_germline_maf,
        consensus_germline_maf,
        refdir,
        normal_ids,
        single_node=False
):
    # restrict each per-sample file mapping to the samples being processed
    museq_ss_vcf = {sampid: museq_ss_vcf[sampid] for sampid in samples}
    museq_ss_maf = {sampid: museq_ss_maf[sampid] for sampid in samples}
    museq_single_pdf = {sampid: museq_single_pdf[sampid] for sampid in samples}

    samtools_germline_vcf = {
        sampid: samtools_germline_vcf[sampid] for sampid in samples}
    samtools_germline_maf = {
        sampid: samtools_germline_maf[sampid] for sampid in samples}
    roh_calls = {sampid: roh_calls[sampid] for sampid in samples}

    freebayes_germline_vcf = {
        sampid: freebayes_germline_vcf[sampid] for sampid in samples}
    freebayes_germline_maf = {
        sampid: freebayes_germline_maf[sampid] for sampid in samples}

    rtg_germline_vcf = {sampid: rtg_germline_vcf[sampid] for sampid in samples}
    rtg_germline_maf = {sampid: rtg_germline_maf[sampid] for sampid in samples}

    consensus_germline_maf = {
        sampid: consensus_germline_maf[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.setobj(
        obj=mgd.TempOutputObj('normal_id', 'sample_id', axes_origin=[]),
        value={v: normal_ids[v] for v in samples})

    workflow.subworkflow(
        name="mutationseq_single",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile(
                'museq_germlines.vcf.gz', 'sample_id',
                extensions=['.csi', '.tbi'],
                fnames=museq_ss_vcf
            ),
            mgd.OutputFile(
                'museq_germlines.maf', 'sample_id',
                fnames=museq_ss_maf
            ),
            mgd.OutputFile('museq_single_pdf', 'sample_id', fnames=museq_single_pdf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
        ),
        kwargs={
            'tumour_id': None,
            'normal_id': mgd.TempInputObj('normal_id', 'sample_id'),
            'tumour_bam': None,
            'normal_bam': mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                                        extensions=['.bai'], axes_origin=[]),
            'single_node': single_node,
            'germline_refdata': paths_refdir['germline_portrait_ref'],
            'thousand_genomes': paths_refdir['thousand_genomes'],
            'dbsnp': paths_refdir['dbsnp'],
        }
    )

    workflow.subworkflow(
        name="samtools_germline",
        func='wgs.workflows.samtools_germline.create_samtools_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("samtools_germlines_anno.vcf.gz", 'sample_id', extensions=['.csi', '.tbi'],
                           fnames=samtools_germline_vcf),
            mgd.OutputFile("samtools_germlines_anno.maf", 'sample_id',
                           fnames=samtools_germline_maf),
            mgd.OutputFile("roh_calls.csv.gz", 'sample_id',
                           fnames=roh_calls, extensions=['.yaml']),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    workflow.subworkflow(
        name="freebayes_germline",
        func='wgs.workflows.freebayes.create_freebayes_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("freebayes_germlines_anno.vcf.gz", 'sample_id', extensions=['.csi', '.tbi'],
                           fnames=freebayes_germline_vcf),
            mgd.OutputFile("freebayes_germlines_anno.maf", 'sample_id', extensions=['.csi', '.tbi'],
                           fnames=freebayes_germline_maf),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    workflow.subworkflow(
        name="rtg_germline",
        func='wgs.workflows.rtg_germline.create_rtg_germline_workflow',
        axes=('sample_id',),
        args=(
            mgd.OutputFile("rtg_germlines_anno.vcf.gz", 'sample_id', extensions=['.csi', '.tbi'],
                           fnames=rtg_germline_vcf),
            mgd.OutputFile("rtg_germlines_anno.maf", 'sample_id', extensions=['.csi', '.tbi'],
                           fnames=rtg_germline_maf),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            paths_refdir['reference'],
            paths_refdir['reference_sdf'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        }
    )

    workflow.subworkflow(
        name="germline_consensus",
        func='wgs.workflows.germline_calling_consensus.create_germline_consensus_workflow',
        axes=('sample_id',),
        args=(
            mgd.InputFile('museq_germlines.vcf.gz', 'sample_id', fnames=museq_ss_vcf),
            mgd.InputFile("samtools_germlines_anno.vcf.gz", 'sample_id',
                          fnames=samtools_germline_vcf),
            mgd.InputFile("rtg_germlines_anno.vcf.gz", 'sample_id',
                          fnames=rtg_germline_vcf),
            mgd.InputFile("freebayes_germlines_anno.vcf.gz", 'sample_id',
                          fnames=freebayes_germline_vcf),
            mgd.OutputFile("germlines_consensus.maf", 'sample_id',
                           fnames=consensus_germline_maf),
            chromosomes,
            paths_refdir['reference_vep'],
            mgd.TempInputObj('normal_id', 'sample_id')
        ),
    )

    return workflow
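
The per-sample subsetting at the top of this function repeats the same comprehension for each filename argument; a small hypothetical helper (not part of wgs.utils.helpers) could centralise it, as sketched below.

def subset_by_sample(samples, *filename_maps):
    # restrict each {sample_id: path} mapping to the requested samples
    return [
        {sample_id: fmap[sample_id] for sample_id in samples}
        for fmap in filename_maps
    ]

# usage sketch inside create_germline_calling_workflow:
# museq_ss_vcf, museq_ss_maf, museq_single_pdf = subset_by_sample(
#     samples, museq_ss_vcf, museq_ss_maf, museq_single_pdf)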