Example #1
def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
    if normal_id is None:
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
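
The axis fan-out above recurs in nearly every example that follows, so here is a minimal, self-contained sketch of the pattern on its own, assuming the usual pypeliner imports; the paths and the tasks.process_sample task are hypothetical, not taken from any example here.

import pypeliner.managed as mgd
from pypeliner.workflow import Workflow

seqdata = {'T1': '/data/T1.h5', 'T2': '/data/T2.h5'}  # hypothetical inputs

workflow = Workflow()

# Declare the chunks of the 'sample_id' axis from the dict keys.
workflow.setobj(
    obj=mgd.OutputChunks('sample_id'),
    value=list(seqdata.keys()),
)

# One job per chunk: fnames maps each sample_id to its input path, and
# template renders one output path per sample_id.
workflow.transform(
    name='process_sample',
    axes=('sample_id',),
    func='tasks.process_sample',  # hypothetical task
    args=(
        mgd.InputFile('seqdata', 'sample_id', fnames=seqdata),
        mgd.OutputFile('result', 'sample_id',
                       template='/out/result_{sample_id}.txt'),
    ),
)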
Example #2
def create_samtools_germline_workflow(
        normal_bam_files,
        normal_bai_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        chromosomes=default_chromosomes,
        base_docker=None,
        samtools_docker=None,
        vcftools_docker=None
):

    ctx = {'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard'],
           'mem_retry_increment': 2,
           'ncpus': 1}
    if base_docker:
        ctx.update(base_docker)

    regions = list(normal_bam_files.keys())

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        ctx=ctx,
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files),
            pypeliner.managed.InputFile('normal.split.bam.bai', 'regions', fnames=normal_bai_files),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )
  
    workflow.transform(
        name='concatenate_variants',
        ctx=ctx,
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    return workflow
Example #3
def create_ascat_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow
Example #4
def create_samtools_germline_workflow(normal_bam_files,
                                      ref_genome_fasta_file,
                                      vcf_file,
                                      config,
                                      samtools_docker=None,
                                      vcftools_docker=None):
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': config["memory"]['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': baseimage
    }

    regions = list(normal_bam_files.keys())

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        axes=('regions', ),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam',
                                        'regions',
                                        fnames=normal_bam_files,
                                        extensions=['.bai']),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )

    workflow.transform(
        name='concatenate_variants',
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker})

    return workflow
Example #5
def create_download_workflow(url, file_name):

    workflow = Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('url'), value=url)

    workflow.transform(name='download',
                       ctx={'local': True},
                       func=tasks.download_from_url,
                       args=(pypeliner.managed.TempInputObj('url'),
                             pypeliner.managed.OutputFile(file_name)))

    return workflow
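
A workflow returned by a factory like the one above is executed by a Pypeline object; a minimal sketch, modelled on Example #14 below, with a placeholder URL and output name. The ctx={'local': True} used above appears to request in-process execution rather than submission to a scheduler.

import pypeliner

pyp = pypeliner.app.Pypeline(config={'tmpdir': './pipeline'})
pyp.run(create_download_workflow('http://example.com/ref.fa', 'ref.fa'))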
Example #6
def call_and_annotate_pipeline(config,
                               normal_bam_path,
                               tumour_bam_paths,
                               raw_data_dir,
                               results_file,
                               chromosomes=default_chromosomes):

    workflow = Workflow()

    workflow.setobj(
        pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0]),
        list(tumour_bam_paths.keys()))

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)

    tumour_bam_files = pypeliner.managed.File('tumour_bams',
                                              'tumour_sample_id',
                                              fnames=tumour_bam_paths)

    ref_genome_fasta_file = pypeliner.managed.File(
        config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(), [
                    pypeliner.managed.InputFile(x)
                    for x in tumour_bam_paths.values()
                ], ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output()),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(normal_bam_file.as_input(), [
                tumour_bam_files.as_input(),
            ], ref_genome_fasta_file.as_input(),
                  variant_files['snv']['vcf']['nuseq'].as_output()),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  config['databases']['cosmic']['local_path'],
                  config['databases']['dbsnp']['local_path'],
                  variant_files['snv']['vcf']['mutect'].as_output()),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  variant_files['indel']['vcf']['strelka'].as_output(),
                  variant_files['snv']['vcf']['strelka'].as_output()),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id', ),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(variant_files[var_type]['vcf'][prog].as_input(),
                      variant_files[var_type]['hdf'][prog].as_output(),
                      pypeliner.managed.Template(
                          '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(
                              prog=prog, var_type=var_type),
                          'tumour_sample_id')),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=([x.as_input() for x in variant_files['indel']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.indel.vcf')))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.indel.vcf'),
              pypeliner.managed.TempOutputFile('all.indel.vcf.gz')))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=([x.as_input() for x in variant_files['snv']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.snv.vcf')))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.snv.vcf'),
              pypeliner.managed.TempOutputFile('all.snv.vcf.gz')))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'],
                          '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id', ),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(tumour_bam_files.as_input(),
              pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'snv', 'counts',
                               '{tumour_sample_id}.h5'), 'tumour_sample_id')),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}',
                                       'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts',
                         '{tumour_sample_id}.h5'), 'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(tables, pypeliner.managed.OutputFile(results_file)),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
Example #7
def create_theta_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results',
                                    'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2',
                                        'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(bicseq2_seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            pypeliner.managed.OutputFile('bicseq2_seg',
                                         'sample_id',
                                         template=bicseq2_seg_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            config,
            pypeliner.managed.TempSpace('bicseq2_work',
                                        'sample_id',
                                        cleanup=None),
        ),
    )

    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.InputFile('bicseq2_seg',
                                        'sample_id',
                                        template=bicseq2_seg_template),
            config,
            pypeliner.managed.TempSpace('theta_work',
                                        'sample_id',
                                        cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_template),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example #8
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            config,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7),
                            use_depth_thresholds=True):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
        'docker_image': config['docker']['single_cell_pipeline']
    }

    strelka_docker = {'docker_image': config['docker']['strelka']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
              strelka_docker))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(pypeliner.managed.InputFile("normal.split.bam",
                                          "region",
                                          fnames=normal_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.InputFile("merged_bam",
                                          "region",
                                          fnames=tumour_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.TempInputObj('known_sizes'),
              ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
              pypeliner.managed.InputInstance("region"), strelka_docker),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'region'),
              pypeliner.managed.TempInputFile('strelka.stats', 'region'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.InputInstance("chrom"),
              pypeliner.managed.TempInputObj('known_sizes'), regions),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                            'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                             'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"), vcftools_docker))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                              'chrom'),
              pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempSpace("merge_snvs_temp"), vcftools_docker))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
              pypeliner.managed.OutputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
              pypeliner.managed.OutputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    return workflow
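
This example and Example #11 construct the workflow with a shared default context, Workflow(ctx=ctx), then give individual jobs their own ctx. Below is a minimal sketch of that layering with hypothetical task names; that a per-job ctx overrides only the keys it names is an assumption here, not confirmed by these examples.

import pypeliner.managed as mgd
from pypeliner.workflow import Workflow

ctx = {'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1}

workflow = Workflow(ctx=ctx)

workflow.transform(
    name='small_job',
    ctx=dict(mem=2),  # assumed to override just the 'mem' default above
    func='tasks.small_task',  # hypothetical
    args=(
        mgd.InputFile('in.txt'),
        mgd.OutputFile('out.txt'),
    ),
)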
Example #9
def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=list(tumour_bam_paths.keys()),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data,
                                                 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(
            destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(
            delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(
            lumpysv_results_filename)

    workflow.transform(name='merge_results',
                       ctx={'mem': 8},
                       func=hdf5_tasks.merge_hdf5,
                       args=(
                           merge_inputs,
                           pypeliner.managed.OutputFile(results_file),
                       ))

    return workflow
Example #10
def realignment_pipeline(
        config,
        in_file,
        out_file,
        read_group_info=None):

    if read_group_info is None:
        read_group_info = config.get('read_group', {})

    if 'ID' not in read_group_info:
        read_group_info['ID'] = hash(in_file) % int(1e6)

    ref_genome = pypeliner.managed.InputFile(config['ref_genome']['file'])

    read_1 = pypeliner.managed.TempFile('read_1', 'split')

    read_2 = pypeliner.managed.TempFile('read_2', 'split')

    read_1_sai = pypeliner.managed.TempFile('read_1.sai', 'split')

    read_2_sai = pypeliner.managed.TempFile('read_2.sai', 'split')

    read_group_config = pypeliner.managed.TempObj('read_group_config')

    workflow = Workflow()

    if 'read_group' in config:
        workflow.setobj(
            obj=read_group_config.as_output(),
            value=read_group_info,
        )

    else:
        workflow.transform(
            name='get_read_group_config',
            ctx={'local': True},
            func=tasks.get_read_group_config,
            ret=read_group_config.as_output(),
            args=(
                pypeliner.managed.InputFile(in_file),
            )
        )

    workflow.transform(
        name='bam_to_fastq',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.convert_to_fastqs,
        args=(
            pypeliner.managed.InputFile(in_file),
            {
                1: read_1.as_output(),
                2: read_2.as_output(),
            },
            pypeliner.managed.TempSpace('bam_to_fastq'),
        ),
        kwargs={
            'split_size': config['split_size']
        },
    )

    workflow.transform(
        name='aln_read_1',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_1.as_input(),
            ref_genome,
            read_1_sai.as_output(),
        ),
    )

    workflow.transform(
        name='aln_read_2',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_2.as_input(),
            ref_genome,
            read_2_sai.as_output(),
        ),
    )

    workflow.transform(
        name='sampe',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_sampe,
        args=(
            read_1.as_input(),
            read_2.as_input(),
            read_1_sai.as_input(),
            read_2_sai.as_input(),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
        ),
        kwargs={
            'read_group_info': read_group_config.as_input()
        },
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.sort,
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
    )

    workflow.transform(
        name='write_header_file',
        axes=(),
        ctx={'local': True},
        func=tasks.write_header_file,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.TempOutputFile('header.sam'),
            config['ref_genome']['header']
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.merge,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'header_file': pypeliner.managed.TempInputFile('header.sam'),
        },
    )

    return workflow
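
The pipeline above is a scatter-gather: convert_to_fastqs writes its outputs on a new 'split' axis (implicitly defining the chunks), the alignment steps fan out per split, and the final merge consumes all splits at once. A minimal sketch of that shape with hypothetical task names; a job that reads an axis it does not itself carry receives a dict keyed by chunk.

import pypeliner.managed as mgd
from pypeliner.workflow import Workflow

workflow = Workflow()

# Scatter: writing a temp file on the new 'split' axis defines its chunks.
workflow.transform(
    name='split',
    func='tasks.split_input',  # hypothetical: writes one file per chunk
    args=(
        mgd.InputFile('input.bam'),
        mgd.TempOutputFile('chunk.txt', 'split'),
    ),
)

# Fan out: one job per 'split' chunk.
workflow.transform(
    name='per_chunk',
    axes=('split',),
    func='tasks.process_chunk',  # hypothetical
    args=(
        mgd.TempInputFile('chunk.txt', 'split'),
        mgd.TempOutputFile('processed.txt', 'split'),
    ),
)

# Gather: no axes here, so the input resolves to a dict of all chunks.
workflow.transform(
    name='gather',
    func='tasks.merge_chunks',  # hypothetical
    args=(
        mgd.TempInputFile('processed.txt', 'split'),
        mgd.OutputFile('output.bam'),
    ),
)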
Example #11
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            snv_csv_file,
                            chromosomes=default_chromosomes,
                            use_depth_thresholds=True):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3
    }

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(
            pypeliner.managed.InputFile("normal.split.bam",
                                        "region",
                                        fnames=normal_bam_file,
                                        extensions=['.bai']),
            pypeliner.managed.InputFile("merged_bam",
                                        "region",
                                        fnames=tumour_bam_file,
                                        extensions=['.bai']),
            pypeliner.managed.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                             'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                             'region'),
            pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
            pypeliner.managed.InputInstance("region"),
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'region'),
              pypeliner.managed.TempInputFile('strelka.stats', 'region'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.InputInstance("chrom"),
              pypeliner.managed.TempInputObj('known_sizes'), regions),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                            'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                             'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"),
        ))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_snvs_temp"),
        ))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
            pypeliner.managed.OutputFile(indel_vcf_file,
                                         extensions=['.tbi', '.csi']),
        ))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
            pypeliner.managed.OutputFile(snv_vcf_file,
                                         extensions=['.tbi', '.csi']),
        ))

    workflow.transform(
        name='convert_strelka_to_csv',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_csv",
        ctx=ctx,
        args=(
            pypeliner.managed.InputFile(snv_vcf_file),
            pypeliner.managed.TempOutputFile('strelka_snv.csv'),
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        })

    workflow.transform(
        name='prep_strelka_csv',
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(pypeliner.managed.TempInputFile('strelka_snv.csv'),
              pypeliner.managed.OutputFile(snv_csv_file,
                                           extensions=['.yaml'])),
        kwargs={'dtypes': dtypes()['snv_strelka']})

    return workflow
Example #12
def create_mutect_workflow(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        cosmic_vcf_file,
        dbsnp_vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7)):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('regions', 'regions'),
        value=utils.get_bam_regions(tumour_bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.transform(
        name='run_classify',
        axes=('regions',),
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 2, 'io': 1},
        func=tasks.run_mutect,
        args=(
            pypeliner.managed.InputFile(normal_bam_file),
            pypeliner.managed.InputFile(tumour_bam_file),
            pypeliner.managed.InputFile(ref_genome_fasta_file),
            pypeliner.managed.InputFile(cosmic_vcf_file),
            pypeliner.managed.InputFile(dbsnp_vcf_file),
            pypeliner.managed.TempInputObj('regions', 'regions'),
            pypeliner.managed.TempOutputFile('classified.vcf', 'regions')
        ),
    )

    workflow.transform(
        name='merge_vcf',
        ctx={'mem': 16, 'num_retry': 3, 'mem_retry_increment': 8},
        func=vcf_tasks.concatenate_vcf,
        args=(
            pypeliner.managed.TempInputFile('classified.vcf', 'regions'),
            pypeliner.managed.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'bcf_index_file': pypeliner.managed.TempOutputFile('merged.vcf.gz.csi'),
            'vcf_index_file': pypeliner.managed.TempOutputFile('merged.vcf.gz.tbi'),
        }
    )

    workflow.transform(
        name='filter_snvs',
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2},
        func=vcf_tasks.filter_vcf,
        args=(
            pypeliner.managed.TempInputFile('merged.vcf.gz'),
            pypeliner.managed.TempOutputFile('merged.filtered.vcf')
        )
    )

    workflow.transform(
        name='finalise',
        func=vcf_tasks.finalise_vcf,
        args=(
            pypeliner.managed.TempInputFile('merged.filtered.vcf'),
            pypeliner.managed.OutputFile(out_file)
        )
    )

    return workflow
Example #13
def create_titan_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv',
                                             'sample_id'),
            config,
        ),
    )

    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.create_intialization_parameters,
        ret=pypeliner.managed.TempOutputObj('init_params', 'sample_id',
                                            'init_param_id'),
        args=(config, ),
    )

    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.run_titan,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'sample_id',
                                             'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'sample_id',
                                             'init_param_id'),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.select_solution,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_files),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_segments.tsv'), 'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_cn_igv.tsv'),
                'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'),
                'sample_id'),
            config,
            pypeliner.managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    workflow.setobj(obj=pypeliner.managed.OutputChunks('sample_id',
                                                       'chromosome'),
                    value=config.get('chromosomes', default_chromosomes),
                    axes=('sample_id', ))

    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_cn_loci.tsv'), 'sample_id'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', '{sample_id}_params.tsv'),
                'sample_id'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output',
                             '{sample_id}_chr_{chromosome}.png'), 'sample_id',
                'chromosome'),
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example #14
def main(args):
    biowrappers.components.utils.make_directory(args.out_dir)

    with open(args.config_file) as config_file:
        config_text = config_file.read()
    config_text = config_text.format(out_dir=args.out_dir, ref_db_dir=args.ref_db_dir)
    config = yaml.load(config_text, Loader=yaml.SafeLoader)

    pypeliner_args = vars(args)
    pypeliner_args['tmpdir'] = os.path.join(args.out_dir, 'pipeline')

    pyp = pypeliner.app.Pypeline(modules=[tasks], config=pypeliner_args)

    download_urls = {}

    for sample in ('tumour', 'normal'):
        lanes = config['lanes'][sample]

        for lane in lanes:
            download_urls[(sample, lane)] = config['lanes'][sample][lane]['url']

    raw_lane_template = os.path.join(args.out_dir, 'lanes', 'raw', '{lane}.bam')

    realigned_lane_template = os.path.join(args.out_dir, 'lanes', 'realigned', '{lane}.bam')
    sample_bam_template = os.path.join(args.out_dir, '{sample}.bam')

    workflow = Workflow(default_ctx={'mem': 8})

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('url', 'sample', 'lane'),
        value=download_urls,
    )

    workflow.subworkflow(
        name='download_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            pypeliner.managed.TempInputObj('url', 'sample', 'lane'),
            pypeliner.managed.OutputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
        )
    )

    workflow.subworkflow(
        name='realign_lanes',
        axes=('sample', 'lane'),
        func=biowrappers.pipelines.realignment.realignment_pipeline,
        args=(
            config['realignment'],
            pypeliner.managed.InputFile('raw_lane', 'sample', 'lane', template=raw_lane_template),
            pypeliner.managed.OutputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
        )
    )

    workflow.transform(
        name='merge_and_markdups',
        axes=('sample',),
        func=biowrappers.components.io.bam.tasks.mark_duplicates,
        args=(
            pypeliner.managed.InputFile('realigned_lane', 'sample', 'lane', template=realigned_lane_template),
            pypeliner.managed.OutputFile('bam', 'sample', template=sample_bam_template),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp', 'sample')
        }
    )

    pyp.run(workflow)

    normal_bam_file = sample_bam_template.format(sample='normal')
    tumour_bam_file = sample_bam_template.format(sample='tumour')

    workflow = Workflow(default_ctx={'mem': 8})

    breakpoint_raw_data_dir = os.path.join(args.out_dir, 'breakpoints', 'raw')
    breakpoint_results_file = os.path.join(args.out_dir, 'breakpoints', 'results.h5')

    workflow.subworkflow(
        name='breakpoint_call_and_annotate',
        func=biowrappers.pipelines.breakpoint_call_and_annotate.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            breakpoint_raw_data_dir,
            pypeliner.managed.OutputFile(breakpoint_results_file),
        ),
    )

    somatic_breakpoints_file = os.path.join(args.out_dir, 'somatic_breakpoints.tsv')

    workflow.transform(
        name='extract_somatic_breakpoint',
        ctx={'mem': 4},
        func=tasks.extract_somatic_breakpoint,
        args=(
            pypeliner.managed.InputFile(breakpoint_results_file),
            pypeliner.managed.OutputFile(somatic_breakpoints_file),
            config,
        )
    )

    copy_number_raw_data_dir = os.path.join(args.out_dir, 'copy_number', 'raw')
    copy_number_results_file = os.path.join(args.out_dir, 'copy_number', 'results.h5')

    workflow.subworkflow(
        name='copy_number_call_and_annotate',
        func=biowrappers.pipelines.copy_number.call_and_annotate_pipeline,
        args=(
            config,
            pypeliner.managed.InputFile(normal_bam_file),
            {'tumour': pypeliner.managed.InputFile(tumour_bam_file)},
            copy_number_raw_data_dir,
            pypeliner.managed.OutputFile(copy_number_results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': pypeliner.managed.InputFile(somatic_breakpoints_file),
        },
    )

    pyp.run(workflow)
Example #15
def delly_pipeline(
    normal_bam_file,
    tumour_bam_files,
    ref_genome_fasta_file,
    delly_excl_chrom,
    out_file,
    raw_data_dir,
):
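    # Symlink the tumour and normal BAMs (and their .bai indices) into the
    # raw data directory under stable names for delly.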
    bams = list()
    for lib_id, bam_filename in tumour_bam_files.items():
        bams += [
            utils.symlink(bam_filename,
                          link_name='{0}.bam'.format(lib_id),
                          link_directory=raw_data_dir)
        ]
        utils.symlink(bam_filename + '.bai',
                      link_name='{0}.bam.bai'.format(lib_id),
                      link_directory=raw_data_dir)

    bams += [
        utils.symlink(normal_bam_file,
                      link_name='Normal.bam',
                      link_directory=raw_data_dir)
    ]
    utils.symlink(normal_bam_file + '.bai',
                  link_name='Normal.bam.bai',
                  link_directory=raw_data_dir)

    sample_type = {'Normal': 'control'}
    for lib_id in tumour_bam_files.keys():
        sample_type[lib_id] = 'tumor'

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('sample_type', 'sample_id'),
        value=sample_type,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sv_type'),
        value=('DEL', 'DUP', 'INV', 'TRA', 'INS'),
    )
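    # Each SV type becomes one instance of the 'sv_type' axis, so delly_call
    # below fans out into five parallel jobs.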

    workflow.transform(
        name='delly_call',
        axes=('sv_type', ),
        ctx={
            'mem': 64,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_call,
        args=(
            mgd.InputInstance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='write_samples_table',
        ctx={'mem': 1},
        func=tasks.write_samples_table,
        args=(
            mgd.TempInputObj('sample_type', 'sample_id'),
            mgd.TempOutputFile('samples.tsv'),
        ),
    )

    workflow.transform(
        name='delly_filter_somatic',
        axes=('sv_type', ),
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_filter,
        args=(
            mgd.InputInstance('sv_type'),
            mgd.TempInputFile('samples.tsv'),
            ref_genome_fasta_file,
            mgd.TempInputFile('out.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='concatenate_vcf',
        func=vcf_tasks.concatenate_bcf,
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=tasks.convert_vcf,
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
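
A hypothetical call site for the pipeline above, attaching it as a subworkflow of a larger run. File names, the exclusion-list value, and the raw data directory are illustrative only.

import pypeliner.managed as mgd
from pypeliner.workflow import Workflow

workflow = Workflow()

workflow.subworkflow(
    name='delly',
    func=delly_pipeline,
    args=(
        mgd.InputFile('normal.bam'),
        {'tumour_a': mgd.InputFile('tumour_a.bam')},  # keyed by library id
        mgd.InputFile('ref_genome.fasta'),
        'delly_exclude.tsv',  # delly exclusion list; value is illustrative
        mgd.OutputFile('delly_somatic.h5'),
        'raw_data/delly',  # must exist before the pipeline builds its symlinks
    ),
)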
Example #16
0
import os

import pypeliner
import pypeliner.managed
from pypeliner.workflow import Workflow

# utils is a repository-local helper module (e.g. biowrappers.components.utils);
# the exact import path depends on the package layout.

def destruct_pipeline(
    normal_bam_file,
    tumour_bam_files,
    config,
    ref_data_dir,
    out_file,
    raw_data_dir,
    normal_sample_id='normal',
):
    # Copy so the caller's tumour_bam_files dict is not mutated.
    bam_files = dict(tumour_bam_files)
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw',
                                           'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw',
                                        'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic',
                                           'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic',
                                                   'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=bam_files.keys(),
    )

    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
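
Note the string form of func in the two transforms above: pypeliner also accepts a fully qualified dotted name and resolves it at execution time, which keeps heavy imports out of the process that builds the workflow graph. A toy sketch of the same pattern, with a hypothetical module path:

import pypeliner.managed
from pypeliner.workflow import Workflow

workflow = Workflow()

workflow.transform(
    name='sum_counts',
    ctx={'mem': 1},
    func='mypackage.tasks.sum_counts',  # hypothetical; imported on the worker
    args=(
        pypeliner.managed.InputFile('counts.tsv'),
        pypeliner.managed.OutputFile('total.txt'),
    ),
)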
Example #17
0
import pypeliner.managed
from pypeliner.workflow import Workflow

# tasks, utils, vcf_tasks, and default_chromosomes are repository-local
# (e.g. biowrappers.components.variant_calling.vardict); exact import paths
# depend on the package layout.

def create_vardict_paired_sample_workflow(normal_bam_file,
                                          tumour_bam_file,
                                          ref_genome_fasta_file,
                                          out_file,
                                          chromosomes=default_chromosomes,
                                          java=False,
                                          min_allele_frequency=0.01,
                                          remove_duplicate_reads=False,
                                          sample_names=None,
                                          split_size=int(1e7)):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=utils.get_bam_regions(normal_bam_file, split_size, chromosomes=chromosomes),
    )
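    # get_bam_regions splits the genome into roughly split_size-sized windows;
    # each window becomes one instance of the 'regions' axis below.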
    workflow.transform(
        name='run_vardict',
        axes=('regions', ),
        ctx={
            'mem': 12,
            'num_retry': 4,
            'mem_retry_increment': 2
        },
        func=tasks.run_paired_sample_vardict,
        args=(
            pypeliner.managed.InputFile(normal_bam_file),
            pypeliner.managed.InputFile(tumour_bam_file),
            pypeliner.managed.InputFile(ref_genome_fasta_file),
            pypeliner.managed.TempInputObj('config', 'regions'),
            pypeliner.managed.TempOutputFile('result.vcf', 'regions'),
        ),
        kwargs={
            'java': java,
            'min_allele_frequency': min_allele_frequency,
            'remove_duplicate_reads': remove_duplicate_reads,
            'sample_names': sample_names,
        },
    )
    workflow.transform(
        name='compress_tmp',
        axes=('regions', ),
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.compress_vcf,
        args=(
            pypeliner.managed.TempInputFile('result.vcf', 'regions'),
            pypeliner.managed.TempOutputFile('result.vcf.gz', 'regions'),
        ),
        kwargs={
            'index_file': pypeliner.managed.TempOutputFile('result.vcf.gz.tbi', 'regions'),
        },
    )
    workflow.transform(
        name='concatenate_vcf',
        ctx={
            'mem': 2,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            pypeliner.managed.TempInputFile('result.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'bcf_index_file': pypeliner.managed.OutputFile(out_file + '.csi'),
            'vcf_index_file': pypeliner.managed.OutputFile(out_file + '.tbi'),
        },
    )
    return workflow
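
A hypothetical invocation, assuming a pypeliner app as sketched earlier. Paths are illustrative, and the sample_names format is an assumption rather than something confirmed by this snippet.

workflow = create_vardict_paired_sample_workflow(
    'normal.bam',
    'tumour.bam',
    'ref_genome.fasta',
    'vardict.vcf.gz',
    min_allele_frequency=0.05,
    sample_names=['NORMAL', 'TUMOUR'],  # format assumed; check tasks.run_paired_sample_vardict
    split_size=int(5e6),  # smaller windows mean more, shorter jobs
)

pyp.run(workflow)  # pyp: a pypeliner.app.Pypeline, as sketched earlier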