Example 1
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):
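    """Split a BAM by read group, realign each read group in parallel, then
    merge the realigned BAMs and mark duplicates into the final output."""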

    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=pypeliner.managed.TempOutputObj('read_group_config', 'read_group_id'),
        args=(
            pypeliner.managed.InputFile(in_file),
        )
    )

    workflow.commandline(
        name='create_read_group_bam',
        axes=('read_group_id',),
        args=(
            'samtools', 'view', '-b',
            '-r', pypeliner.managed.InputInstance('read_group_id'),
            pypeliner.managed.InputFile(in_file),
            '>',
            pypeliner.managed.TempOutputFile('read_group_bam', 'read_group_id'),
        )
    )

    workflow.subworkflow(
        name='realignment_pipeline',
        axes=('read_group_id',),
        func=realignment_pipeline,
        args=(
            config,
            pypeliner.managed.TempInputFile('read_group_bam', 'read_group_id'),
            pypeliner.managed.TempOutputFile('realigned_read_group_bam', 'read_group_id'),
        ),
        kwargs={
            'read_group_info': pypeliner.managed.TempInputObj('read_group_config', 'read_group_id'),
        }
    )

    # Merge across all read groups: no 'read_group_id' axis here, so this task
    # runs once over the full set of realigned per-read-group BAMs.
    workflow.transform(
        name='merge_and_markdups',
        ctx={'mem': 48, 'num_retry': 3, 'mem_retry_increment': 16},
        func=bam_tasks.mark_duplicates,
        args=(
            pypeliner.managed.TempInputFile('realigned_read_group_bam', 'read_group_id'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'tmp_dir': pypeliner.managed.TempSpace('markdup_temp')
        }
    )

    return workflow
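
A minimal driver sketch for running this workflow, following the pypeliner application pattern shown in Example 12 (the empty config and the BAM paths are placeholders):

    import pypeliner.app

    config = {}  # pipeline configuration; shape is defined by the tasks module
    pyp = pypeliner.app.Pypeline([], config)
    pyp.run(realignment_readgroups_pipeline(config, 'input.bam', 'realigned.bam'))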
Example 2
def create_setup_theta_workflow(config, databases, **kwargs):
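    """Download and unpack the mappability archive and the per-chromosome
    FASTA files needed by Theta, into directories derived from the config
    templates."""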
    mappability_dir = os.path.realpath(
        os.path.join(os.path.dirname(config['mappability_template']),
                     os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    utils.make_directory(mappability_dir)
    utils.make_directory(chromosomes_dir)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.TempOutputFile('mappability.tar.gz'),
        ))

    workflow.commandline(
        name='extract_mappability',
        args=(
            'tar',
            '-xzvf',
            pypeliner.managed.TempInputFile('mappability.tar.gz'),
            '-C',
            mappability_dir,
            '>',
            pypeliner.managed.OutputFile(map_extract_log),
        ),
    )

    for chromosome in config['chromosomes']:
        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=biowrappers.components.io.download.create_download_workflow,
            args=(
                config['chromosome_url_template'].format(chromosome),
                pypeliner.managed.TempOutputFile(
                    'chromosome_{}.fa.gz'.format(chromosome)),
            ))

        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip',
                '-c',
                pypeliner.managed.TempInputFile(
                    'chromosome_{}.fa.gz'.format(chromosome)),
                '>',
                pypeliner.managed.OutputFile(
                    config['chromosome_template'].format(chromosome)),
            ),
        )

    return workflow
Example 3
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):
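    """Stage the reference genome next to the index prefix (copy or symlink),
    then build the bowtie genome index and the tophat transcriptome index."""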

    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    if copy_ref_genome:
        workflow.commandline(
            name='copy_genome',
            ctx={'local': True},
            args=(
                'cp',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    else:
        workflow.commandline(
            name='link_genome',
            ctx={'local': True},
            args=(
                'ln',
                '-s',
                mgd.InputFile(ref_genome_fasta_file),
                mgd.OutputFile(local_ref_genome_fasta_path),
            ),
        )

    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=(
            mgd.InputFile(local_ref_genome_fasta_path),
            mgd.OutputFile(ref_genome_index_prefix),
        )
    )

    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=(
            mgd.InputFile(ref_genome_index_prefix),
            mgd.InputFile(transcript_gtf_file),
            mgd.OutputFile(transcriptome_index_prefix),
        )
    )

    return workflow
Example 4
def create_snpeff_annotation_workflow(db,
                                      data_dir,
                                      target_vcf_file,
                                      out_file,
                                      base_docker={},
                                      snpeff_docker={},
                                      classic_mode=True,
                                      split_size=int(1e3),
                                      table_name='snpeff'):
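    """Split the target VCF, run snpEff on each split in parallel, convert the
    annotated splits to CSV, and concatenate them into a single table."""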

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    if base_docker:
        ctx.update(base_docker)

    workflow = Workflow()

    workflow.transform(name='split_vcf',
                       ctx=dict(mem=2, **ctx),
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(target_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split', ),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(db, data_dir, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('snpeff.vcf', 'split')),
        kwargs={
            'classic_mode': classic_mode,
            'docker_config': snpeff_docker
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split', ),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(mgd.TempInputFile('snpeff.vcf', 'split'),
              mgd.TempOutputFile('snpeff.csv.gz',
                                 'split',
                                 extensions=['.yaml']), table_name))

    workflow.transform(name='concatenate_tables',
                       ctx=dict(mem=4, **ctx),
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('snpeff.csv.gz', 'split'),
                             mgd.OutputFile(out_file, extensions=['.yaml'])))

    return workflow
Example 5
def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
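    """Run Battenberg on each tumour sample against the shared normal, then
    merge the per-sample HDF5 results into a single file."""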
    if normal_id is None:
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example 6
def create_samtools_germline_workflow(
        normal_bam_files,
        normal_bai_files,
        ref_genome_fasta_file,
        vcf_file,
        config,
        chromosomes=default_chromosomes,
        base_docker=None,
        samtools_docker=None,
        vcftools_docker=None
):
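    """Call germline variants with samtools per region in parallel, then
    concatenate the per-region VCFs into a single indexed VCF."""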

    ctx = {'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard'],
           'mem_retry_increment': 2,
           'ncpus': 1}
    if base_docker:
        ctx.update(base_docker)

    regions = normal_bam_files.keys()

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        ctx=ctx,
        axes=('regions',),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam', 'regions', fnames=normal_bam_files),
            pypeliner.managed.InputFile('normal.split.bam.bai', 'regions', fnames=normal_bai_files),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )
  
    workflow.transform(
        name='concatenate_variants',
        ctx=ctx,
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    return workflow
Example 7
def create_clonehd_single_workflow(
    normal_seqdata_file,
    tumour_seqdata_file,
    config,
    results_file,
    somatic_breakpoint_file=None,
    **kwargs
):
    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.cna.txt'),
            pypeliner.managed.TempOutputFile('tumour.baf.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_clonehd',
        ctx={'mem': 8},
        func=tasks.run_clonehd,
        args=(
            pypeliner.managed.TempInputFile('normal.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.cna.txt'),
            pypeliner.managed.TempInputFile('tumour.baf.txt'),
            pypeliner.managed.TempOutputFile('tumour.summary.txt'),
            pypeliner.managed.TempOutputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempOutputFile('bam_subclone', 'subclone', axes_origin=[]),
            pypeliner.managed.TempSpace('run_clonehd_temp', cleanup=None),
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='report',
        ctx={'mem': 4},
        func=tasks.report,
        args=(
            pypeliner.managed.TempInputFile('tumour.summary.txt'),
            pypeliner.managed.TempInputFile('cna_subclone', 'subclone'),
            pypeliner.managed.TempInputFile('bam_subclone', 'subclone'),
            pypeliner.managed.OutputFile(results_file),
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
Example 8
def create_setup_remixt_workflow(config, databases, **kwargs):
    workflow = Workflow()

    ref_data_sentinal = os.path.join(kwargs['ref_data_dir'], 'sentinal')

    workflow.transform(
        name='remixt_create_ref_data',
        func=remixt.ref_data.create_ref_data,
        args=(
            config,
            kwargs['ref_data_dir'],
            pypeliner.managed.OutputFile(ref_data_sentinal),
        ),
    )

    workflow.subworkflow(
        name='remixt_create_bwa_mappability',
        func=remixt.mappability.bwa.workflow.create_bwa_mappability_workflow,
        args=(
            config,
            kwargs['ref_data_dir'],
        ),
        kwargs={
            'ref_data_sentinal':
            pypeliner.managed.InputFile(ref_data_sentinal),
        },
    )

    return workflow
Example 9
def create_ascat_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempOutputFile('Tumour_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Tumour_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow
Example 10
def create_samtools_germline_workflow(normal_bam_files,
                                      ref_genome_fasta_file,
                                      vcf_file,
                                      config,
                                      samtools_docker=None,
                                      vcftools_docker=None):
    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem': config["memory"]['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'docker_image': baseimage
    }

    regions = list(normal_bam_files.keys())

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('regions'),
        value=regions,
    )

    workflow.transform(
        name='run_samtools_variant_calling',
        axes=('regions', ),
        func="single_cell.workflows.germline.tasks.run_samtools_variant_calling",
        args=(
            pypeliner.managed.InputFile('normal.split.bam',
                                        'regions',
                                        fnames=normal_bam_files,
                                        extensions=['.bai']),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('variants.vcf.gz', 'regions'),
        ),
        kwargs={
            'region': pypeliner.managed.InputInstance('regions'),
            'samtools_docker': samtools_docker,
            'vcftools_docker': vcftools_docker
        },
    )

    workflow.transform(
        name='concatenate_variants',
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('variants.vcf.gz', 'regions'),
            pypeliner.managed.OutputFile(vcf_file, extensions=['.tbi']),
            pypeliner.managed.TempSpace("merge_variants_germline"),
        ),
        kwargs={'docker_config': vcftools_docker})

    return workflow
Example 11
def create_gc_wig_file(config, genome_file, out_file):
    workflow = Workflow()

    workflow.commandline(
        name='create_gc',
        ctx={'mem': 4},
        args=(
            'gcCounter',
            '-w',
            config['window_size'],
            pypeliner.managed.InputFile(genome_file),
            '>',
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
Example 12
def main(args):
    config = cli.load_pypeliner_config(args)

    pyp = pypeliner.app.Pypeline([], config)

    workflow = Workflow()

    workflow.subworkflow(name='snpeff',
                         func=snpeff.create_snpeff_annotation_workflow,
                         args=(pypeliner.managed.InputFile(
                             args.target_vcf_file),
                               pypeliner.managed.TempOutputFile('snpeff.h5')),
                         kwargs={
                             'data_base': args.data_base,
                             'split_size': args.split_size,
                             'table_name': 'snpeff'
                         })

    workflow.transform(name='convert_to_tsv',
                       func=convert_hdf5_to_tsv,
                       ctx={'mem': 2},
                       args=(pypeliner.managed.TempInputFile('snpeff.h5'),
                             'snpeff',
                             pypeliner.managed.OutputFile(args.out_file)),
                       kwargs={
                           'compress': True,
                           'index': False
                       })

    pyp.run(workflow)
Example 13
def create_mappability_wig_file(config, out_file):
    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=(
            config['mappability_url'],
            pypeliner.managed.OutputFile(out_file + '.bigwig'),
        ))

    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=(
            'mapCounter',
            '-w',
            config['window_size'],
            pypeliner.managed.InputFile(out_file + '.bigwig'),
            '>',
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
Example 14
def create_hla_type_workflow(
        normal_bam_file,
        hla_type_file):
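    """Extract properly paired chr6 reads to FASTQ and run OptiType on them to
    produce an HLA type call."""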

    workflow = Workflow()

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools', 'view', '-bh', '-f', '2', '-F', '4',
            pypeliner.managed.InputFile(normal_bam_file),
            '6',
            '|',
            'samtools', 'collate', '-O', '-', pypeliner.managed.TempSpace('chr6_collate_temp'),
            '|',
            'samtools', 'bam2fq',
            '-1', pypeliner.managed.TempOutputFile('chr6_reads_1.fq'),
            '-2', pypeliner.managed.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(
        name='optitype',
        ctx={'mem': 24},
        func=tasks.run_optitype,
        args=(
            pypeliner.managed.TempInputFile('chr6_reads_1.fq'),
            pypeliner.managed.TempInputFile('chr6_reads_2.fq'),
            pypeliner.managed.OutputFile(hla_type_file),
            pypeliner.managed.TempSpace('optitype_temp'),
        )
    )

    return workflow
Example 15
def create_download_workflow(url, file_name):
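    """Wrap a URL download as a single-task workflow; the URL is stored as a
    managed object so dependent tasks rerun when it changes."""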

    workflow = Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('url'), value=url)

    workflow.transform(name='download',
                       ctx={'local': True},
                       func=tasks.download_from_url,
                       args=(pypeliner.managed.TempInputObj('url'),
                             pypeliner.managed.OutputFile(file_name)))

    return workflow
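
A usage sketch: this workflow is meant to be nested via subworkflow, as Examples 2, 13, and 20 do (the task name and config key below are illustrative):

    workflow.subworkflow(
        name='download_reference',
        func=create_download_workflow,
        args=(
            config['ref_genome_url'],  # hypothetical config key
            pypeliner.managed.TempOutputFile('reference.fa.gz'),
        ),
    )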
Example 16
def create_battenberg_single_workflow(
    normal_seqdata_file,
    tumour_seqdata_file,
    normal_id,
    tumour_id,
    results_file,
    config,
    somatic_breakpoint_file=None,
    **kwargs
):
    workflow = Workflow()

    workflow.transform(
        name='prepare_data',
        ctx={'mem': 20},
        func=tasks.prepare_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile(tumour_seqdata_file),
            normal_id,
            tumour_id,
            pypeliner.managed.TempOutputFile('allele_counts.tar.gz'),
            pypeliner.managed.TempSpace('prepare_battenberg_temp', cleanup=None),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='run_battenberg',
        ctx={'mem': 20},
        func=tasks.run_battenberg,
        args=(
            pypeliner.managed.TempInputFile('allele_counts.tar.gz'),
            normal_id,
            tumour_id,
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('run_battenberg_temp', cleanup=None),
            config
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    return workflow
Example 17
def download_external_files(config):
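    """Map each config entry that has a 'url' to its 'local_path', download
    them all in parallel along the 'files' axis, then unzip each download."""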
    download_keys = [x for x in config if 'url' in config[x]]
    urls = dict(zip(
        download_keys,
        [config[x]['url'] for x in download_keys],
    ))
    downloaded_files = dict(
        zip(
            urls.keys(),
            [config[x]['local_path'] for x in urls.keys()],
        ))

    workflow = Workflow()
    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )
    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files', ),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )
    workflow.transform(
        name='unzip',
        axes=('files', ),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )
    return workflow
Example 18
def create_setup_titan_workflow(config, databases, **kwargs):
    workflow = Workflow()

    workflow.subworkflow(name='gc_wig',
                         func=create_gc_wig_file,
                         args=(
                             config,
                             pypeliner.managed.InputFile(
                                 databases['ref_genome']['local_path']),
                             pypeliner.managed.OutputFile(config['gc_wig']),
                         ))

    workflow.subworkflow(name='mappability_wig',
                         func=create_mappability_wig_file,
                         args=(
                             config,
                             pypeliner.managed.OutputFile(
                                 config['mappability_wig']),
                         ))

    return workflow
Example 19
def create_pvacseq_workflow(
    vcf_file,
    hla_type_file,
    results_file,
    config,
):
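    """Annotate the VCF with VEP (including the Downstream and Wildtype
    plugins that pVACseq requires), then run pVACseq against the HLA types."""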
    workflow = Workflow()

    workflow.commandline(
        name='vep',
        ctx={'mem': 16},
        args=(
            'variant_effect_predictor.pl',
            '--input_file', pypeliner.managed.InputFile(vcf_file),
            '--format', 'vcf',
            '--output_file', pypeliner.managed.TempOutputFile('vep_annotated.vcf'),
            '--vcf', '--symbol', '--terms', 'SO',
            '--plugin', 'Downstream',
            '--plugin', 'Wildtype',
            '--cache', '--offline', '--force_overwrite',
            '--assembly', 'GRCh37',
            '--dir', config['vep_dir'],
            '--dir_plugins', os.path.join(config['vep_dir'], 'Plugins'),
        ),
    )

    workflow.transform(
        name='run_pvacseq',
        func=tasks.run_pvacseq,
        args=(
            pypeliner.managed.TempInputFile('vep_annotated.vcf'),
            pypeliner.managed.InputFile(hla_type_file),
            pypeliner.managed.OutputFile(results_file),
            pypeliner.managed.TempSpace('pvacseq_temp'),
            config,
        ),
    )

    return workflow
Example 20
def create_dbsnp_download_workflow(config, out_file):

    workflow = Workflow()

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(
            config['url'],
            pypeliner.managed.OutputFile(out_file)
        )
    )

    workflow.transform(
        name='index',
        ctx={'mem': 4},
        func=vcf_tasks.index_vcf,
        args=(
            pypeliner.managed.InputFile(out_file),
        )
    )

    return workflow
Example 21
def create_annotation_workflow(
    config,
    in_vcf_file,
    cosmic_status_file,
    dbsnp_status_file,
    mappability_file,
    snpeff_file,
    trinuc_file,
    variant_type='snv',
):
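    """Fan the input VCF out to five independent annotation subworkflows:
    COSMIC status, dbSNP status, mappability, snpEff, and trinucleotide
    context."""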

    annotators = ('cosmic_status', 'dbsnp_status', 'mappability', 'snpeff',
                  'tri_nucleotide_context')

    kwargs = {}

    for a in annotators:
        kwargs[a] = get_kwargs(config[a]['kwargs'],
                               '/{0}/{1}'.format(variant_type, a))

    workflow = Workflow()

    workflow.subworkflow(
        name='cosmic_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['cosmic']['local_path'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(cosmic_status_file)),
        kwargs=config["cosmic_status"]['kwargs'])

    workflow.subworkflow(
        name='dbsnp_status',
        func='biowrappers.components.variant_calling.annotated_db_status.create_vcf_db_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['dbsnp']['local_path'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(dbsnp_status_file)),
        kwargs=config["dbsnp_status"]['kwargs'])

    workflow.subworkflow(
        name='mappability',
        func='biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['mappability']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file, extensions=['.tbi']),
            pypeliner.managed.OutputFile(mappability_file),
        ),
        kwargs=config["mappability"]['kwargs'])

    workflow.subworkflow(
        name='snpeff',
        func='biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(config['databases']['snpeff']['db'],
              config['databases']['snpeff']['data_dir'],
              pypeliner.managed.InputFile(in_vcf_file),
              pypeliner.managed.OutputFile(snpeff_file)),
        kwargs=kwargs['snpeff'])

    workflow.subworkflow(
        name='tri_nucleotide_context',
        func='biowrappers.components.variant_calling.tri_nucleotide_context.create_vcf_tric_nucleotide_annotation_workflow',
        ctx=dict(mem=4, mem_retry_increment=2),
        args=(
            config['databases']['ref_genome']['local_path'],
            pypeliner.managed.InputFile(in_vcf_file),
            pypeliner.managed.OutputFile(trinuc_file),
        ),
        kwargs=config["tri_nucleotide_context"]['kwargs'])

    return workflow
Example 22
def call_and_annotate_pipeline(config,
                               normal_bam_path,
                               tumour_bam_paths,
                               raw_data_dir,
                               results_file,
                               chromosomes=default_chromosomes):

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id', axes_origin=[0]),
        value=list(tumour_bam_paths.keys()),
    )

    variant_files = get_variant_files(chromosomes, config, raw_data_dir)

    normal_bam_file = pypeliner.managed.File(normal_bam_path)

    tumour_bam_files = pypeliner.managed.File('tumour_bams',
                                              'tumour_sample_id',
                                              fnames=tumour_bam_paths)

    ref_genome_fasta_file = pypeliner.managed.File(
        config['databases']['ref_genome']['local_path'])

    #===================================================================================================================
    # Multi sample calling
    #===================================================================================================================
    if 'nuseq_multi_sample' in config:
        workflow.subworkflow(
            name='nuseq_multi_sample',
            axes=(),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(
                normal_bam_file.as_input(), [
                    pypeliner.managed.InputFile(x)
                    for x in tumour_bam_paths.values()
                ], ref_genome_fasta_file.as_input(),
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_output()),
            kwargs=config['nuseq_multi_sample']['kwargs'])

        workflow.transform(
            name='convert_nuseq_multi_sample_vcf_to_hdf5',
            axes=(),
            ctx=default_ctx,
            func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
            args=(
                variant_files['snv']['vcf']['nuseq_multi_sample'].as_input(),
                variant_files['snv']['hdf']['nuseq_multi_sample'].as_output(),
                '/snv/vcf/nuseq_multi_sample/all',
            ),
            kwargs={'score_callback': vcf_score_callbacks['snv']['nuseq']})

    #===================================================================================================================
    # Single sample calling
    #===================================================================================================================
    if 'nuseq' in config:
        workflow.subworkflow(
            name='nuseq',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.nuseq.create_nuseq_classify_workflow',
            args=(normal_bam_file.as_input(), [
                tumour_bam_files.as_input(),
            ], ref_genome_fasta_file.as_input(),
                  variant_files['snv']['vcf']['nuseq'].as_output()),
            kwargs=config['nuseq']['kwargs'])

    if 'mutect' in config:
        workflow.subworkflow(
            name='mutect',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.mutect.create_mutect_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  config['databases']['cosmic']['local_path'],
                  config['databases']['dbsnp']['local_path'],
                  variant_files['snv']['vcf']['mutect'].as_output()),
            kwargs=config['mutect']['kwargs'])

    if 'strelka' in config:
        workflow.subworkflow(
            name='strelka',
            axes=('tumour_sample_id', ),
            func='biowrappers.components.variant_calling.strelka.create_strelka_workflow',
            args=(normal_bam_file.as_input(), tumour_bam_files.as_input(),
                  ref_genome_fasta_file.as_input(),
                  variant_files['indel']['vcf']['strelka'].as_output(),
                  variant_files['snv']['vcf']['strelka'].as_output()),
            kwargs=config['strelka']['kwargs'])

    #===================================================================================================================
    # Convert vcf to hdf5
    #===================================================================================================================
    for var_type in variant_files:
        for prog in variant_files[var_type]['vcf']:
            if prog == 'nuseq_multi_sample':
                continue

            workflow.transform(
                name='convert_{0}_{1}_to_hdf5'.format(prog, var_type),
                axes=('tumour_sample_id', ),
                ctx=default_ctx,
                func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
                args=(variant_files[var_type]['vcf'][prog].as_input(),
                      variant_files[var_type]['hdf'][prog].as_output(),
                      pypeliner.managed.Template(
                          '/{var_type}/vcf/{prog}/{{tumour_sample_id}}'.format(
                              prog=prog, var_type=var_type),
                          'tumour_sample_id')),
                kwargs={'score_callback': vcf_score_callbacks[var_type][prog]})

    #===================================================================================================================
    # Indel annotation
    #===================================================================================================================
    workflow.transform(
        name='merge_indels',
        ctx=big_mem_ctx,
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=([x.as_input() for x in variant_files['indel']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.indel.vcf')))

    workflow.transform(
        name='finalise_indels',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.indel.vcf'),
              pypeliner.managed.TempOutputFile('all.indel.vcf.gz')))

    workflow.subworkflow(
        name='annotate_indels',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.indel.vcf.gz'),
            pypeliner.managed.TempOutputFile('indel_annotations.h5'),
            os.path.join(raw_data_dir, 'indel'),
        ),
        kwargs={'variant_type': 'indel'})

    #===================================================================================================================
    # SNV
    #===================================================================================================================
    workflow.transform(
        name='merge_snvs',
        ctx=big_mem_ctx,
        func="biowrappers.components.io.vcf.tasks.merge_vcfs",
        args=([x.as_input() for x in variant_files['snv']['vcf'].values()],
              pypeliner.managed.TempOutputFile('all.snv.vcf')))

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('all.snv.vcf'),
              pypeliner.managed.TempOutputFile('all.snv.vcf.gz')))

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func=create_annotation_workflow,
        args=(
            config,
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={'variant_type': 'snv'})

    workflow.subworkflow(
        name='normal_snv_counts',
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            normal_bam_file.as_input(),
            pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        ),
        kwargs=get_kwargs(config['snv_counts']['kwargs'],
                          '/snv/counts/normal'))

    workflow.subworkflow(
        name='tumour_snv_counts',
        axes=('tumour_sample_id', ),
        func='biowrappers.components.variant_calling.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(tumour_bam_files.as_input(),
              pypeliner.managed.TempInputFile('all.snv.vcf.gz'),
              pypeliner.managed.OutputFile(
                  os.path.join(raw_data_dir, 'snv', 'counts',
                               '{tumour_sample_id}.h5'), 'tumour_sample_id')),
        kwargs=get_kwargs(
            config['snv_counts']['kwargs'],
            pypeliner.managed.Template('/snv/counts/{tumour_sample_id}',
                                       'tumour_sample_id')))

    #===================================================================================================================
    # Create final output
    #===================================================================================================================
    tables = [
        pypeliner.managed.TempInputFile('indel_annotations.h5'),
        pypeliner.managed.TempInputFile('snv_annotations.h5'),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts', 'normal.h5')),
        pypeliner.managed.InputFile(
            os.path.join(raw_data_dir, 'snv', 'counts',
                         '{tumour_sample_id}.h5'), 'tumour_sample_id'),
    ]

    for var_type in variant_files:
        for prog in variant_files[var_type]['hdf']:
            tables.append(variant_files[var_type]['hdf'][prog].as_input())

    workflow.transform(
        name='build_results_file',
        ctx=default_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(tables, pypeliner.managed.OutputFile(results_file)),
        kwargs={
            'drop_duplicates': True,
        })

    return workflow
Example 23
def call_and_annotate_pipeline(
    config,
    bam_files,
    raw_data_dir,
    results_file,
    normal_id=None,
    somatic_breakpoint_file=None,
    patient_config=None,
):
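    """Extract ReMixT seqdata for every sample, then run whichever copy-number
    callers are present in config (remixt, titan, clonehd, theta) and merge
    their HDF5 results."""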
    sample_ids = list(bam_files.keys())

    tumour_ids = list(bam_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    seq_data_template = os.path.join(raw_data_dir, 'seqdata',
                                     'sample_{sample_id}.h5')

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id', ),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile('seqdata',
                                         'sample_id',
                                         template=seq_data_template),
            config['remixt'].get('extract_seqdata', {}),
            config['remixt']['ref_data_dir'],
        ),
    )

    merge_inputs = {}

    if 'remixt' in config:
        remixt_raw_data = os.path.join(raw_data_dir, 'remixt')
        remixt_results_filename = os.path.join(remixt_raw_data, 'results.h5')
        make_parent_directory(remixt_results_filename)

        remixt_config = config['remixt']['config']
        assert 'sample_specific' not in remixt_config
        if patient_config is not None:
            remixt_config.update(patient_config)

        workflow.subworkflow(
            name='remixt',
            func=biowrappers.components.copy_number_calling.remixt.create_remixt_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                remixt_config,
                pypeliner.managed.OutputFile(remixt_results_filename),
                remixt_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'ref_data_dir': config['remixt']['ref_data_dir'],
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/remixt'] = pypeliner.managed.InputFile(
            remixt_results_filename)

    if 'titan' in config:
        titan_raw_data = os.path.join(raw_data_dir, 'titan')
        titan_results_filename = os.path.join(titan_raw_data, 'results.h5')
        make_parent_directory(titan_results_filename)

        workflow.subworkflow(
            name='titan',
            func=biowrappers.components.copy_number_calling.titan.create_titan_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['titan']['config'],
                pypeliner.managed.OutputFile(titan_results_filename),
                titan_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/titan'] = pypeliner.managed.InputFile(
            titan_results_filename)

    if 'clonehd' in config:
        clonehd_raw_data = os.path.join(raw_data_dir, 'clonehd')
        clonehd_results_filename = os.path.join(clonehd_raw_data, 'results.h5')
        make_parent_directory(clonehd_results_filename)

        workflow.subworkflow(
            name='clonehd',
            func=biowrappers.components.copy_number_calling.clonehd.create_clonehd_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['clonehd']['config'],
                pypeliner.managed.OutputFile(clonehd_results_filename),
                clonehd_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/clonehd'] = pypeliner.managed.InputFile(
            clonehd_results_filename)

    if 'theta' in config:
        theta_raw_data = os.path.join(raw_data_dir, 'theta')
        theta_results_filename = os.path.join(theta_raw_data, 'results.h5')
        make_parent_directory(theta_results_filename)

        workflow.subworkflow(
            name='theta',
            func=biowrappers.components.copy_number_calling.theta.create_theta_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['theta']['config'],
                pypeliner.managed.OutputFile(theta_results_filename),
                theta_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
                'num_clones': config['theta']['kwargs']['num_clones'],
            },
        )

        merge_inputs['/copy_number/theta'] = pypeliner.managed.InputFile(
            theta_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ),
    )

    return workflow
Example 24
def create_theta_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
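    """Segment each tumour sample with BICseq2 against the shared normal, run
    THetA on the segments, and merge the per-sample results into one HDF5
    file."""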
    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results',
                                    'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2',
                                        'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(bicseq2_seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            pypeliner.managed.OutputFile('bicseq2_seg',
                                         'sample_id',
                                         template=bicseq2_seg_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            config,
            pypeliner.managed.TempSpace('bicseq2_work',
                                        'sample_id',
                                        cleanup=None),
        ),
    )

    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.InputFile('bicseq2_seg',
                                        'sample_id',
                                        template=bicseq2_seg_template),
            config,
            pypeliner.managed.TempSpace('theta_work',
                                        'sample_id',
                                        cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_template),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example 25
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            config,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7),
                            use_depth_thresholds=True):
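    """Run Strelka somatic calling per region, apply per-chromosome indel and
    SNV filters, then merge, keep passed calls, and finalise the indel and SNV
    VCFs."""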
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'num_retry': 3,
        'docker_image': config['docker']['single_cell_pipeline']
    }

    strelka_docker = {'docker_image': config['docker']['strelka']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    regions = list(normal_bam_file.keys())
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow(ctx=ctx)

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
              strelka_docker))

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, disk=40),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region', ),
        args=(pypeliner.managed.InputFile("normal.split.bam",
                                          "region",
                                          fnames=normal_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.InputFile("merged_bam",
                                          "region",
                                          fnames=tumour_bam_file,
                                          extensions=['.bai']),
              pypeliner.managed.TempInputObj('known_sizes'),
              ref_genome_fasta_file,
              pypeliner.managed.TempOutputFile('somatic.indels.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.snvs.unfiltered.vcf',
                                               'region'),
              pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
              pypeliner.managed.InputInstance("region"), strelka_docker),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(pypeliner.managed.TempInputFile('somatic.indels.unfiltered.vcf',
                                              'region'),
              pypeliner.managed.TempInputFile('strelka.stats', 'region'),
              pypeliner.managed.TempInputFile(
                  'somatic.indels.unfiltered.vcf.window', 'region'),
              pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf',
                                               'chrom'),
              pypeliner.managed.InputInstance("chrom"),
              pypeliner.managed.TempInputObj('known_sizes'), regions),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom', ),
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.unfiltered.vcf',
                                            'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf',
                                             'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds})

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf',
                                            'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"), vcftools_docker))

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf',
                                              'chrom'),
              pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempSpace("merge_snvs_temp"), vcftools_docker))

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')))

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
              pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')))

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
              pypeliner.managed.OutputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
              pypeliner.managed.OutputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
              vcftools_docker))

    return workflow
Example 26
def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
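    """Run whichever breakpoint callers are present in config (destruct,
    delly, lumpysv) on the normal and tumour BAMs and merge their HDF5
    results."""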
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        value=list(tumour_bam_paths.keys()),
    )

    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data,
                                                 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(
            destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(
            delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(
            lumpysv_results_filename)

    workflow.transform(name='merge_results',
                       ctx={'mem': 8},
                       func=hdf5_tasks.merge_hdf5,
                       args=(
                           merge_inputs,
                           pypeliner.managed.OutputFile(results_file),
                       ))

    return workflow
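
call_and_annotate_pipeline gates each breakpoint caller on the presence of its key in config, so callers are enabled purely by configuration. An illustrative config enabling destruct and delly (all paths are hypothetical; the nested keys mirror exactly what the function reads):

config = {
    'destruct': {
        'config': {},                       # forwarded to destruct.destruct_pipeline
        'ref_data_dir': '/refdata/destruct',
    },
    'delly': {
        'ref_genome_fasta_file': '/refdata/GRCh37.fa',
        'exclude_file': '/refdata/delly_exclude.tsv',
    },
    # add a 'lumpysv' key (even an empty dict) to enable the lumpy caller
}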
Esempio n. 27
0
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            snv_vcf_file,
                            snv_maf_file,
                            indel_vcf_file,
                            indel_maf_file,
                            reference,
                            reference_vep,
                            chromosomes,
                            normal_id,
                            tumour_id,
                            single_node=False,
                            is_exome=False):
    params = config.default_params('variant_calling')

    workflow = Workflow(
        ctx=helpers.get_default_ctx(memory=5, walltime='4:00'))

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ret=mgd.OutputChunks('regions'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    workflow.transform(
        name='count_fasta_bases',
        func="wgs.workflows.strelka.tasks.count_fasta_bases",
        args=(
            reference,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name="get_chrom_sizes",
        func="wgs.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
              chromosomes))

    if single_node:
        workflow.transform(
            name='strelka_one_node',
            func="wgs.workflows.strelka.tasks.strelka_one_node",
            args=(
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempOutputFile('snvs.vcf.gz',
                                   extensions=['.tbi', '.csi']),
                mgd.TempSpace('call_genome_segment_tmp'),
                mgd.InputChunks('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,
            })
    else:
        workflow.transform(
            name='get_chromosome_depths',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.get_chromosome_depth",
            args=(
                mgd.InputInstance('regions'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('chrom_depth.txt', 'regions'),
            ),
        )

        workflow.transform(
            name='merge_chromosome_depths',
            func="wgs.workflows.strelka.tasks.merge_chromosome_depths",
            args=(mgd.TempInputFile('chrom_depth.txt',
                                    'regions',
                                    axes_origin=[]),
                  mgd.TempOutputFile('merged_chrom_depth.txt')))

        workflow.transform(
            name='call_genome_segment',
            axes=('regions', ),
            func="wgs.workflows.strelka.tasks.call_genome_segment",
            args=(
                mgd.TempInputFile('merged_chrom_depth.txt'),
                pypeliner.managed.InputFile(normal_bam_file,
                                            extensions=['.bai']),
                pypeliner.managed.InputFile(tumour_bam_file,
                                            extensions=['.bai']),
                reference,
                mgd.TempOutputFile('indels.vcf', 'regions'),
                mgd.TempOutputFile('snvs.vcf', 'regions'),
                mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                mgd.InputInstance('regions'),
                mgd.TempInputObj('known_sizes'),
            ),
            kwargs={
                'is_exome': is_exome,  # honour the caller's exome flag in the per-region path as well
            })

        workflow.transform(
            name='merge_indels',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('indels.vcf', 'regions'),
                  mgd.TempOutputFile('indels.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("indels_merge")),
        )

        workflow.transform(
            name='merge_snvs',
            func='wgs.workflows.strelka.tasks.concatenate_vcf',
            args=(mgd.TempInputFile('snvs.vcf', 'regions'),
                  mgd.TempOutputFile('snvs.vcf.gz',
                                     extensions=['.tbi', '.csi']),
                  mgd.TempSpace("snvs_merge")),
        )

    workflow.transform(name='bcftools_normalize_snv',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('snvs.vcf.gz'),
                           mgd.TempOutputFile('normalized_snvs.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs.vcf'),
            mgd.TempOutputFile('normalized_snvs_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(name='bcftools_normalize_indel',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('indels.vcf.gz'),
                           mgd.TempOutputFile('normalized_indels.vcf'),
                           reference,
                       ))
    workflow.transform(
        name='finalise_normalize_indel',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized_indels.vcf'),
            mgd.TempOutputFile('normalized_indels_finalize.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_indel',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_indels_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(indel_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.transform(
        name='filter_vcf_snv',
        func='wgs.workflows.strelka.tasks.filter_vcf',
        args=(
            mgd.TempInputFile('normalized_snvs_finalize.vcf.gz',
                              extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf_file, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_snv_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(indel_vcf_file,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(indel_maf_file),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
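
A sketch of embedding create_strelka_workflow in a parent pipeline, following the subworkflow pattern used throughout these examples; the parent workflow object, file names, reference paths, and sample ids are all hypothetical:

workflow.subworkflow(
    name='strelka',
    func=create_strelka_workflow,
    args=(
        mgd.InputFile('normal.bam', extensions=['.bai']),
        mgd.InputFile('tumour.bam', extensions=['.bai']),
        mgd.OutputFile('strelka_snvs.vcf.gz', extensions=['.tbi', '.csi']),
        mgd.OutputFile('strelka_snvs.maf'),
        mgd.OutputFile('strelka_indels.vcf.gz', extensions=['.tbi', '.csi']),
        mgd.OutputFile('strelka_indels.maf'),
        '/refdata/GRCh37.fa',
        '/refdata/vep',
        [str(c) for c in range(1, 23)] + ['X', 'Y'],
        'normal_sample',
        'tumour_sample',
    ),
    kwargs={'single_node': False},
)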
Esempio n. 28
0
def build_indexes(config):
    workflow = Workflow()
    if 'ref_genome_fasta_file' in config:
        workflow.transform(
            name='index_ref_genome',
            func=biowrappers.components.ngs.samtools.tasks.faidx,
            args=(
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.OutputFile(config['ref_genome_fasta_file']['local_path'] +
                               '.fai'),
            ),
        )
    if 'transcriptome_fasta_file' in config:
        workflow.transform(
            name='index_transcriptome',
            func=biowrappers.components.ngs.samtools.tasks.faidx,
            args=(
                mgd.InputFile(
                    config['transcriptome_fasta_file']['local_path']),
                mgd.OutputFile(
                    config['transcriptome_fasta_file']['local_path'] + '.fai'),
            ),
        )
    if 'kallisto' in config:
        workflow.transform(
            name='build_kallisto_index',
            func=biowrappers.components.rna.kallisto.tasks.build_index,
            ctx={
                'mem': 32,
                'num_retry': 3,
                'mem_retry_increment': 8
            },
            args=(
                mgd.OutputFile(config['kallisto']['index']),
                mgd.InputFile(
                    config['transcriptome_fasta_file']['local_path']),
            ),
            kwargs={'kmer_length': config['kallisto']['kmer_length']})
    if 'salmon' in config:
        workflow.transform(
            name='build_salmon_index',
            func=biowrappers.components.rna.salmon.tasks.build_index,
            ctx={
                'mem': 32,
                'num_retry': 3,
                'mem_retry_increment': 8
            },
            args=(
                mgd.OutputFile(
                    os.path.join(config['salmon']['index'], 'index.finished')),
                mgd.InputFile(
                    config['transcriptome_fasta_file']['local_path']),
            ),
            kwargs={
                'kmer_length': config['salmon']['kmer_length'],
                'gencode': config['salmon'].get('gencode', False),
            })
    if 'star' in config:
        workflow.transform(
            name='build_star_index',
            func=biowrappers.components.rna.star.tasks.build_index,
            ctx={
                'mem': 32,
                'num_retry': 3,
                'mem_retry_increment': 8,
                'local': config['star'].get('local', False)
            },
            args=(
                mgd.OutputFile(
                    os.path.join(config['star']['index'], 'index.finished')),
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.InputFile(
                    config['gene_annotation_gtf_file']['local_path']),
            ),
            kwargs={
                'overhang': config['star']['overhang'],
                'num_threads': config['star'].get('num_threads', 1),
            })
    if 'tophat' in config:
        workflow.subworkflow(
            name='build_tophat_index',
            func=biowrappers.components.rna.tophat.workflow.create_tophat_transcriptome_index_workflow,
            args=(
                mgd.InputFile(config['ref_genome_fasta_file']['local_path']),
                mgd.InputFile(
                    config['gene_annotation_gtf_file']['local_path']),
                mgd.OutputFile(config['tophat']['ref_genome_index']),
                mgd.OutputFile(config['tophat']['transcriptome_index']),
            ),
        )
    return workflow
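
build_indexes likewise keys every index off an optional config section. An illustrative config exercising all of its branches (paths, k-mer lengths, and the overhang value are hypothetical):

config = {
    'ref_genome_fasta_file': {'local_path': '/refdata/genome.fa'},
    'transcriptome_fasta_file': {'local_path': '/refdata/transcripts.fa'},
    'gene_annotation_gtf_file': {'local_path': '/refdata/genes.gtf'},
    'kallisto': {'index': '/indexes/kallisto.idx', 'kmer_length': 31},
    'salmon': {'index': '/indexes/salmon', 'kmer_length': 31},
    'star': {'index': '/indexes/star', 'overhang': 99, 'num_threads': 8},
    'tophat': {
        'ref_genome_index': '/indexes/tophat/genome',
        'transcriptome_index': '/indexes/tophat/transcriptome',
    },
}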
Esempio n. 29
0
def destruct_pipeline(
    normal_bam_file,
    tumour_bam_files,
    config,
    ref_data_dir,
    out_file,
    raw_data_dir,
    normal_sample_id='normal',
):
    bam_files = dict(tumour_bam_files)  # copy so the caller's dict is not mutated
    bam_files[normal_sample_id] = normal_bam_file

    utils.make_directory(os.path.join(raw_data_dir, 'raw'))
    breakpoint_file = os.path.join(raw_data_dir, 'raw', 'breakpoint.tsv')
    breakpoint_library_file = os.path.join(raw_data_dir, 'raw',
                                           'breakpoint_library.tsv')
    breakpoint_read_file = os.path.join(raw_data_dir, 'raw',
                                        'breakpoint_read.tsv')

    utils.make_directory(os.path.join(raw_data_dir, 'somatic'))
    somatic_breakpoint_file = os.path.join(raw_data_dir, 'somatic',
                                           'breakpoint.tsv')
    somatic_breakpoint_library_file = os.path.join(raw_data_dir, 'somatic',
                                                   'breakpoint_library.tsv')

    raw_read_data_dir = os.path.join(raw_data_dir, 'read_data')
    utils.make_directory(raw_read_data_dir)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=list(bam_files.keys()),  # list() so the chunks pickle cleanly on Python 3
    )

    workflow.subworkflow(
        name='run_destruct',
        func="destruct.workflow.create_destruct_workflow",
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile(breakpoint_file),
            pypeliner.managed.OutputFile(breakpoint_library_file),
            pypeliner.managed.OutputFile(breakpoint_read_file),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_read_data_dir,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func='biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints',
        args=(
            pypeliner.managed.InputFile(breakpoint_file),
            pypeliner.managed.InputFile(breakpoint_library_file),
            [normal_sample_id],
            pypeliner.managed.OutputFile(somatic_breakpoint_file),
            pypeliner.managed.OutputFile(somatic_breakpoint_library_file),
        ),
    )

    workflow.transform(
        name='write_store',
        func='biowrappers.components.breakpoint_calling.destruct.tasks.write_store',
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile(somatic_breakpoint_library_file),
            pypeliner.managed.OutputFile(out_file),
        ),
    )

    return workflow
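
write_store persists the somatic breakpoint tables to an HDF5 store. Assuming the tables are written in a pandas-compatible layout, the results can be inspected after the run roughly like this (the key names shown are illustrative):

import pandas as pd

with pd.HDFStore('results.h5', 'r') as store:
    print(store.keys())              # e.g. ['/breakpoint', '/breakpoint_library']
    breakpoints = store[store.keys()[0]]
print(breakpoints.head())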
Esempio n. 30
0
def realignment_pipeline(
        config,
        in_file,
        out_file,
        read_group_info=None):

    if read_group_info is None:
        read_group_info = config.get('read_group', {})

    if 'ID' not in read_group_info:
        import hashlib  # builtin hash() is salted per process on Python 3; use a stable digest
        read_group_info['ID'] = int(
            hashlib.md5(str(in_file).encode()).hexdigest(), 16) % int(1e6)

    ref_genome = pypeliner.managed.InputFile(config['ref_genome']['file'])

    read_1 = pypeliner.managed.TempFile('read_1', 'split')

    read_2 = pypeliner.managed.TempFile('read_2', 'split')

    read_1_sai = pypeliner.managed.TempFile('read_1.sai', 'split')

    read_2_sai = pypeliner.managed.TempFile('read_2.sai', 'split')

    read_group_config = pypeliner.managed.TempObj('read_group_config')

    workflow = Workflow()

    if 'read_group' in config:
        workflow.setobj(
            obj=read_group_config.as_output(),
            value=read_group_info,
        )

    else:
        workflow.transform(
            name='get_read_group_config',
            ctx={'local': True},
            func=tasks.get_read_group_config,
            ret=read_group_config.as_output(),
            args=(
                pypeliner.managed.InputFile(in_file),
            )
        )

    workflow.transform(
        name='bam_to_fastq',  # the task emits FASTQ, not FASTA
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.convert_to_fastqs,
        args=(
            pypeliner.managed.InputFile(in_file),
            {
                1: read_1.as_output(),
                2: read_2.as_output(),
            },
            pypeliner.managed.TempSpace('bam_to_fastq'),
        ),
        kwargs={
            'split_size': config['split_size']
        },
    )

    workflow.transform(
        name='aln_read_1',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_1.as_input(),
            ref_genome,
            read_1_sai.as_output(),
        ),
    )

    workflow.transform(
        name='aln_read_2',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_aln,
        args=(
            read_2.as_input(),
            ref_genome,
            read_2_sai.as_output(),
        ),
    )

    workflow.transform(
        name='sampe',
        axes=('split',),
        ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bwa_tasks.run_sampe,
        args=(
            read_1.as_input(),
            read_2.as_input(),
            read_1_sai.as_input(),
            read_2_sai.as_input(),
            ref_genome,
            pypeliner.managed.TempOutputFile('aligned.bam', 'split'),
        ),
        kwargs={
            'read_group_info': read_group_config.as_input()
        },
    )

    workflow.transform(
        name='sort',
        axes=('split',),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.sort,
        args=(
            pypeliner.managed.TempInputFile('aligned.bam', 'split'),
            pypeliner.managed.TempOutputFile('sorted.bam', 'split'),
        ),
    )

    workflow.transform(
        name='write_header_file',
        axes=(),
        ctx={'local': True},
        func=tasks.write_header_file,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.TempOutputFile('header.sam'),
            config['ref_genome']['header']
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        func=bam_tasks.merge,
        args=(
            pypeliner.managed.TempInputFile('sorted.bam', 'split'),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'header_file': pypeliner.managed.TempInputFile('header.sam'),
        },
    )

    return workflow
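
Finally, an illustrative config for realignment_pipeline, reconstructed from the keys the function reads; every value here is hypothetical:

config = {
    'ref_genome': {
        'file': '/refdata/genome.fa',            # bwa-indexed reference for run_aln/run_sampe
        'header': '/refdata/genome_header.sam',  # header template consumed by write_header_file
    },
    'split_size': int(1e7),                      # reads per FASTQ chunk
    # optional: a supplied read group; if absent it is inferred from the BAM
    'read_group': {'ID': 'rg1', 'SM': 'sample1', 'PL': 'ILLUMINA'},
}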