Example 1
def create_mutect_workflow(normal_bam,
                           tumour_bam,
                           snv_vcf,
                           snv_maf,
                           reference,
                           reference_vep,
                           chromosomes,
                           normal_id,
                           tumour_id,
                           single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='generate_intervals',
                       func='wgs.workflows.mutect.tasks.generate_intervals',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='1:00',
                       ),
                       ret=mgd.OutputChunks('interval'),
                       args=(reference, chromosomes),
                       kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(mgd.TempSpace("run_mutect_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam)),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(mgd.TempOutputFile('mutect.vcf', 'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam),
                  mgd.TempSpace('mutect_temp', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
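
A factory like create_mutect_workflow only builds the workflow object; running it is left to a driver. A minimal driver sketch, assuming hypothetical paths and scheduler settings, using the same Pypeline/run pattern that appears in Example 8:

import pypeliner
import pypeliner.app

# Hypothetical scheduler settings; the real pipelines build this dict from CLI args.
args = {'tmpdir': './pypeliner_tmp', 'maxjobs': 4, 'submit': 'local'}
pyp = pypeliner.app.Pypeline(config=args)

workflow = create_mutect_workflow(
    'normal.bam', 'tumour.bam',               # hypothetical input BAMs
    'somatic_snv.vcf.gz', 'somatic_snv.maf',  # hypothetical outputs
    'ref.fasta', 'vep_ref_dir',               # reference fasta and VEP reference (assumed layout)
    ['1', '2', '3'],                          # chromosomes to call on
    'NORMAL_ID', 'TUMOUR_ID',
    single_node=False,
)
pyp.run(workflow)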
Example 2
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf,
        sample_id, reference, chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    cn_params = config.default_params('copynumber_calling')

    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00', ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00', ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus=8),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00', ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
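
Note how select_optimal_solution and tar_all_data consume temp files that still carry the ('numclusters', 'ploidy') axes from an axis-less job: as I read pypeliner's merge semantics, each such argument arrives in the task as a dict keyed by the axis chunk tuple. A toy stand-in (not the pipeline's actual task; the scoring rule is an assumption) showing the shapes involved:

import shutil

def pick_best_solution(chunks, params_files, segs_files, segs_out):
    # chunks: [(num_clusters, ploidy), ...] as built from titan_intervals
    # params_files, segs_files: {(num_clusters, ploidy): path, ...}
    def score(chunk):
        # Hypothetical: assume each params file starts with a numeric
        # validity index; the real selection criterion may differ.
        with open(params_files[chunk]) as infile:
            return float(infile.readline().strip())
    best = min(chunks, key=score)
    shutil.copyfile(segs_files[best], segs_out)
    return best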
Example 3
def alignment_workflow(args):
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted({v[1] for v in fastq1_files.keys()})
    cells = sorted({v[0] for v in fastq1_files.keys()})

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups',
                           'cell_id',
                           template=bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups',
                           'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam',
                           'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template':
            (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
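
For reference, the dicts returned by inpututils.get_fastqs are keyed by (cell_id, lane) tuples, which is what lets setobj seed both axes straight from .keys() and lets the cell and lane lists fall out by projection. A sketch with hypothetical paths:

fastq1_files = {
    ('SA123-A01', 'L001'): '/data/SA123-A01_L001_R1.fastq.gz',
    ('SA123-A01', 'L002'): '/data/SA123-A01_L002_R1.fastq.gz',
    ('SA123-B02', 'L001'): '/data/SA123-B02_L001_R1.fastq.gz',
}

# the same projections computed in the workflow above
cells = sorted({cell for cell, _ in fastq1_files})  # ['SA123-A01', 'SA123-B02']
lanes = sorted({lane for _, lane in fastq1_files})  # ['L001', 'L002']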
Example 4
def create_museq_workflow(snv_vcf,
                          museqportrait_pdf,
                          reference,
                          chromosomes,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(memory=15,
                                                       walltime='48:00',
                                                       ncpus=8,
                                                       disk=600),
                           func='wgs.utils.museq_utils.run_museq_one_job',
                           args=(
                               mgd.TempSpace("run_museq_temp"),
                               mgd.TempOutputFile('merged.vcf'),
                               reference,
                               mgd.InputChunks('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'museq_docker_image': config.containers('mutationseq'),
                               'vcftools_docker_image': config.containers('vcftools')
                           })
    else:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(
                               memory=15,
                               walltime='24:00',
                           ),
                           axes=('interval', ),
                           func='wgs.utils.museq_utils.run_museq',
                           args=(
                               mgd.TempOutputFile('museq.vcf', 'interval'),
                               mgd.TempOutputFile('museq.log', 'interval'),
                               reference,
                               mgd.InputInstance('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'docker_image': config.containers('mutationseq'),
                           })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(snv_vcf, extensions=['.tbi',
                                                               '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
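
The name/single bookkeeping at the top of this factory encodes three calling modes. Extracted as a standalone helper for clarity (not part of the pipeline itself):

def museq_mode(tumour_bam, normal_bam):
    name = 'run_museq'
    if tumour_bam:
        name += '_tumour'
    if normal_bam:
        name += '_normal'
    # single-sample mode unless both bams are present
    single = name != 'run_museq_tumour_normal'
    return name, single

assert museq_mode('t.bam', 'n.bam') == ('run_museq_tumour_normal', False)
assert museq_mode('t.bam', None) == ('run_museq_tumour', True)
assert museq_mode(None, 'n.bam') == ('run_museq_normal', True)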
Example 5
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(ctx=helpers.get_default_ctx(memory=5,
                                                    walltime='4:00'))

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={'is_exome': is_exome})
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': False,
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
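
Most of the callers in these examples share one scatter/gather skeleton: a transform whose return value defines the interval chunks, a per-interval transform on that axis, and a merge step that consumes the axed temp file from an axis-less job. Distilled into a minimal sketch (the task paths are placeholders, not real modules):

import pypeliner.workflow
import pypeliner.managed as mgd

def scatter_gather_skeleton(input_bam, output_vcf, reference, chromosomes):
    workflow = pypeliner.workflow.Workflow()
    # fan out: the task's return value becomes the chunk list for 'interval'
    workflow.transform(
        name='generate_intervals',
        func='mypkg.tasks.generate_intervals',  # placeholder
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes))
    # one job per interval, each writing its own temp chunk
    workflow.transform(
        name='call_interval',
        axes=('interval',),
        func='mypkg.tasks.call_variants',  # placeholder
        args=(mgd.InputFile(input_bam),
              mgd.InputInstance('interval'),
              mgd.TempOutputFile('chunk.vcf', 'interval')))
    # fan in: consuming the axed temp file from an axis-less job merges it
    workflow.transform(
        name='merge',
        func='mypkg.tasks.merge_vcfs',  # placeholder
        args=(mgd.TempInputFile('chunk.vcf', 'interval'),
              mgd.OutputFile(output_vcf)))
    return workflow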
Example 6
def merge_bams_workflow(args):
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id', ),
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam',
                                 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.TempInputFile('bam_rm_softclipped.bam',
                                               'cell_id',
                                               extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai'],
                                            template=merge_out_template),
                             mgd.InputChunks("region"),
                             config,
                         ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template':
            (mgd.InputChunks('region'), merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
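
The template mechanism used above is plain string substitution over the axis chunks: each chunk of the 'region' axis is formatted into merge_out_template to yield a concrete path. The equivalent plain-Python view, with hypothetical region names:

merge_out_template = '/out/{region}.bam'
regions = ['1-1-10000000', '1-10000001-20000000']  # hypothetical chunk ids
paths = [merge_out_template.format(region=r) for r in regions]
# ['/out/1-1-10000000.bam', '/out/1-10000001-20000000.bam']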
Example 7
def split_bam_workflow(workflow, args):

    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam',
                             'info.yaml')
    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "reads" if by_reads else "region"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]

        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )

    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(name="split_normal",
                         func=split_bams.create_split_workflow,
                         args=(
                             mgd.InputFile(args['wgs_bam']),
                             mgd.InputFile(args['wgs_bam'] + ".bai"),
                             mgd.OutputFile("normal.split.bam",
                                            splitkeyword,
                                            template=split_bam_template,
                                            axes_origin=[]),
                             mgd.OutputFile("normal.split.bam.bai",
                                            splitkeyword,
                                            template=split_bai_template,
                                            axes_origin=[]),
                             pypeliner.managed.TempInputObj(splitkeyword),
                             config,
                         ),
                         kwargs={"by_reads": by_reads})

    regions = (mgd.InputChunks('reads') if by_reads
               else pypeliner.managed.TempInputObj('region'))
    # resolve against whichever axis was actually set up above,
    # rather than hardcoding 'region'
    workflow.transform(name="get_files",
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(
                           mem=config['memory']['med'],
                           pool_id=config['pools']['standard'],
                       ),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example 8
def copy_number_calling_workflow(args):

    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': config['docker']['single_cell_pipeline'],
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    assert '{region}' in normal_wgs
    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
              remixt_config,
              config['ref_data_dir'],
              'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
              remixt_config,
              'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
              remixt_config,
              'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
              remixt_config,
              'bam_check_proper_pair'
        )
    )
    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_cells,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'region',
                template=normal_wgs,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("normal.h5", "region"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            list(tumour_cells.keys()),
            mgd.InputChunks('region'),
            cloneid
        ),
    )

    pyp.run(workflow)
Example 9
def create_freebayes_germline_workflow(germline_vcf,
                                       germline_maf,
                                       bam_file,
                                       reference,
                                       reference_vep,
                                       chromosomes,
                                       normal_id,
                                       single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='generate_intervals',
                       func='wgs.workflows.freebayes.tasks.generate_intervals',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='1:00',
                       ),
                       ret=mgd.OutputChunks('interval'),
                       args=(reference, chromosomes),
                       kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='freebayes_one_node',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.freebayes.tasks.run_freebayes_one_job',
            args=(mgd.TempSpace("run_freebayes_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(bam_file)))
    else:
        workflow.transform(
            name='freebayes',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.freebayes.tasks.run_freebayes_germline',
            args=(mgd.TempOutputFile('freebayes_germline.vcf',
                                     'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(bam_file),
                  mgd.TempSpace('tempdir_freebayes', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('freebayes_germline.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(germline_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="freebayes_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(germline_vcf,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(germline_maf),
                             reference_vep,
                         ),
                         kwargs={'normal_id': normal_id})

    return workflow
Example 10
def split_bam_workflow(args):
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam",
                           'region',
                           template=split_bam_template,
                           axes_origin=[]),
            pypeliner.managed.InputChunks('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'wgs_regionbams'
            },
            'template':
            (mgd.InputChunks('region'), split_bam_template, 'region'),
        })

    return workflow
Example 11
def create_samtools_germline_workflow(germline_vcf,
                                      germline_roh,
                                      bam_file,
                                      reference,
                                      chromosomes,
                                      single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.samtools_germline.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline_one_job',
            args=(mgd.TempSpace("run_samtools_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(bam_file)),
            kwargs={
                'samtools_docker_image': config.containers('samtools'),
                'vcftools_docker_image': config.containers('vcftools')
            })
    else:
        workflow.transform(
            name='samtools_germline',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.samtools_germline.tasks.run_samtools_germline',
            args=(mgd.TempOutputFile('germline.vcf.gz', 'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(bam_file)),
            kwargs={'docker_image': config.containers('samtools')})

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('germline.vcf.gz', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(germline_vcf,
                                          extensions=['.tbi', '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='roh_calling',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.workflows.samtools_germline.tasks.roh_calling',
        args=(mgd.InputFile(germline_vcf, extensions=['.tbi', '.csi']),
              mgd.OutputFile(germline_roh)),
        kwargs={'docker_image': config.containers('vcftools')})

    return workflow
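
A note on the ctx dicts used throughout these examples: the workflow-level ctx (here the wgs docker image) acts as a set of defaults that each transform's own ctx overlays, so per-job memory and walltime win while the image is inherited. A rough dict-merge model of that resolution (my reading of pypeliner; the image tag is hypothetical):

workflow_ctx = {'docker_image': 'wgs:v1.0'}    # hypothetical tag
job_ctx = {'memory': 15, 'walltime': '24:00'}  # e.g. from helpers.get_default_ctx
effective_ctx = {**workflow_ctx, **job_ctx}
# {'docker_image': 'wgs:v1.0', 'memory': 15, 'walltime': '24:00'}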