Example #1
0
def create_hla_type_workflow(
        normal_bam_file,
        hla_type_file):
    """Build a workflow that derives an HLA type file from a normal BAM.

    Two jobs are registered on the returned workflow:

    * ``extract_chr6`` -- a shell pipeline
      ``samtools view | samtools collate | samtools bam2fq`` run on region
      ``6`` of ``normal_bam_file``; it writes two temporary FASTQ files.
    * ``optitype`` -- calls ``tasks.run_optitype`` with those FASTQ files,
      the ``hla_type_file`` output and a temporary working space.

    :param normal_bam_file: path of the input BAM file.
    :param hla_type_file: path of the file written by the ``optitype`` job.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed
    pipe = ('|',)

    workflow = Workflow()

    # Stage 1: select reads from region '6' of the input BAM.
    view_stage = (
        'samtools', 'view', '-bh', '-f', '2', '-F', '4',
        managed.InputFile(normal_bam_file),
        '6',
    )
    # Stage 2: collate the stream, using a managed temp space as the prefix.
    collate_stage = (
        'samtools', 'collate', '-O', '-',
        managed.TempSpace('chr6_collate_temp'),
    )
    # Stage 3: split the collated stream into the two FASTQ files.
    fastq_stage = (
        'samtools', 'bam2fq',
        '-1', managed.TempOutputFile('chr6_reads_1.fq'),
        '-2', managed.TempOutputFile('chr6_reads_2.fq'),
        '-',
    )

    workflow.commandline(
        name='extract_chr6',
        args=view_stage + pipe + collate_stage + pipe + fastq_stage,
    )

    optitype_args = (
        managed.TempInputFile('chr6_reads_1.fq'),
        managed.TempInputFile('chr6_reads_2.fq'),
        managed.OutputFile(hla_type_file),
        managed.TempSpace('optitype_temp'),
    )
    workflow.transform(
        name='optitype',
        ctx={'mem': 24},
        func=tasks.run_optitype,
        args=optitype_args,
    )

    return workflow
Example #2
0
def create_mappability_wig_file(config, out_file):
    """Build a workflow that downloads a mappability bigwig and converts it.

    The bigwig is downloaded to ``out_file + '.bigwig'`` by a download
    sub-workflow, then ``mapCounter -w <window_size>`` is run on it with its
    standard output redirected to ``out_file``.

    :param config: mapping providing ``'mappability_url'`` and
        ``'window_size'``.
    :param out_file: path of the final output; also the stem of the
        intermediate ``.bigwig`` file.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed

    workflow = Workflow()

    download_args = (
        config['mappability_url'],
        managed.OutputFile(out_file + '.bigwig'),
    )
    workflow.subworkflow(
        name='download_mappability_bigwig',
        func=biowrappers.components.io.download.create_download_workflow,
        args=download_args,
    )

    convert_command = (
        'mapCounter',
        '-w',
        config['window_size'],
        managed.InputFile(out_file + '.bigwig'),
        '>',
        managed.OutputFile(out_file),
    )
    workflow.commandline(
        name='convert_mappability_to_wig',
        ctx={'mem': 4},
        args=convert_command,
    )

    return workflow
Example #3
0
def create_setup_theta_workflow(config, databases, **kwargs):
    """Build a workflow that downloads and unpacks the THetA reference files.

    The mappability archive is downloaded and extracted with ``tar`` into
    the parent of the directory holding ``config['mappability_template']``;
    the ``tar`` listing is redirected to ``mappability_extract.log`` in that
    same directory. For every entry of ``config['chromosomes']`` a gzipped
    FASTA is downloaded and decompressed with ``gunzip -c`` into the path
    given by ``config['chromosome_template']``.

    The mappability and chromosome directories are created immediately,
    when this function is called, not when the workflow runs.

    :param config: mapping providing ``'mappability_template'``,
        ``'mappability_url'``, ``'chromosome_template'``,
        ``'chromosome_url_template'`` and ``'chromosomes'``.
    :param databases: accepted but not used by this function.
    :param kwargs: accepted but not used by this function.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed
    create_download = biowrappers.components.io.download.create_download_workflow

    # One level above the directory that contains the mappability template.
    template_dir = os.path.dirname(config['mappability_template'])
    mappability_dir = os.path.realpath(os.path.join(template_dir, os.pardir))
    map_extract_log = os.path.join(mappability_dir, 'mappability_extract.log')
    chromosomes_dir = os.path.dirname(config['chromosome_template'])

    for directory in (mappability_dir, chromosomes_dir):
        utils.make_directory(directory)

    workflow = Workflow()

    workflow.subworkflow(
        name='download_mappability',
        func=create_download,
        args=(
            config['mappability_url'],
            managed.TempOutputFile('mappability.tar.gz'),
        ),
    )

    untar_command = (
        'tar',
        '-xzvf',
        managed.TempInputFile('mappability.tar.gz'),
        '-C',
        mappability_dir,
        '>',
        managed.OutputFile(map_extract_log),
    )
    workflow.commandline(name='extract_mappability', args=untar_command)

    for chromosome in config['chromosomes']:
        # Temp file name shared by the download job and the extract job.
        archive_name = 'chromosome_{}.fa.gz'.format(chromosome)

        workflow.subworkflow(
            name='download_chromosome_{}'.format(chromosome),
            func=create_download,
            args=(
                config['chromosome_url_template'].format(chromosome),
                managed.TempOutputFile(archive_name),
            ),
        )

        fasta_path = config['chromosome_template'].format(chromosome)
        workflow.commandline(
            name='extract_chromosome_{}'.format(chromosome),
            args=(
                'gunzip',
                '-c',
                managed.TempInputFile(archive_name),
                '>',
                managed.OutputFile(fasta_path),
            ),
        )

    return workflow
Example #4
0
def realignment_readgroups_pipeline(
        config,
        in_file,
        out_file):
    """Build a workflow that realigns a BAM file one read group at a time.

    Jobs registered, in order:

    * ``get_read_group_configs`` -- ``tasks.get_read_group_configs`` on
      ``in_file``; its return value is stored as the ``read_group_config``
      object split on the ``read_group_id`` axis.
    * ``create_read_group_bam`` -- ``samtools view -b -r <read_group_id>``
      per read group, redirected to a temporary BAM.
    * ``realignment_pipeline`` -- the ``realignment_pipeline`` sub-workflow
      per read group, given the matching config as ``read_group_info``.
    * ``merge_and_markdups`` -- ``bam_tasks.mark_duplicates`` writing
      ``out_file``.

    :param config: passed through unchanged to ``realignment_pipeline``.
    :param in_file: path of the input BAM file.
    :param out_file: path of the final output BAM file.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed
    axis = 'read_group_id'

    workflow = Workflow()

    workflow.transform(
        name='get_read_group_configs',
        func=tasks.get_read_group_configs,
        ret=managed.TempOutputObj('read_group_config', axis),
        args=(managed.InputFile(in_file),),
    )

    split_command = (
        'samtools', 'view', '-b',
        '-r', managed.InputInstance(axis),
        managed.InputFile(in_file),
        '>',
        managed.TempOutputFile('read_group_bam', axis),
    )
    workflow.commandline(
        name='create_read_group_bam',
        axes=(axis,),
        args=split_command,
    )

    realign_args = (
        config,
        managed.TempInputFile('read_group_bam', axis),
        managed.TempOutputFile('realigned_read_group_bam', axis),
    )
    realign_kwargs = {
        'read_group_info': managed.TempInputObj('read_group_config', axis),
    }
    workflow.subworkflow(
        name='realignment_pipeline',
        axes=(axis,),
        func=realignment_pipeline,
        args=realign_args,
        kwargs=realign_kwargs,
    )

    # NOTE(review): this job is declared on the read_group_id axis yet its
    # output is the single, un-axed out_file. A merge step would normally
    # have no axes of its own -- confirm this is intended before changing.
    markdup_args = (
        managed.TempInputFile('realigned_read_group_bam', axis),
        managed.OutputFile(out_file),
    )
    markdup_kwargs = {
        'tmp_dir': managed.TempSpace('markdup_temp', axis),
    }
    workflow.transform(
        name='merge_and_markdups',
        axes=(axis,),
        ctx={'mem': 48, 'num_retry': 3, 'mem_retry_increment': 16},
        func=bam_tasks.mark_duplicates,
        args=markdup_args,
        kwargs=markdup_kwargs,
    )

    return workflow
Example #5
0
def create_tophat_transcriptome_index_workflow(
        ref_genome_fasta_file,
        transcript_gtf_file,
        ref_genome_index_prefix,
        transcriptome_index_prefix,
        copy_ref_genome=False):
    """Build a workflow that creates genome and transcriptome indexes.

    The reference FASTA is first made available at
    ``ref_genome_index_prefix + '.fa'``, either by ``cp`` (job
    ``copy_genome``) or by ``ln -s`` (job ``link_genome``). Then
    ``tasks.build_genome_index`` and ``tasks.build_transcriptome_index`` are
    run in that dependency order.

    :param ref_genome_fasta_file: path of the source reference FASTA.
    :param transcript_gtf_file: path of the transcript GTF file.
    :param ref_genome_index_prefix: output of the genome index job and stem
        of the local FASTA path.
    :param transcriptome_index_prefix: output of the transcriptome index job.
    :param copy_ref_genome: when true the FASTA is copied, otherwise it is
        symlinked.
    :returns: the populated ``Workflow``.
    """
    workflow = Workflow()

    local_ref_genome_fasta_path = ref_genome_index_prefix + '.fa'

    # The two staging variants differ only in job name and command prefix.
    if copy_ref_genome:
        stage_name, stage_command = 'copy_genome', ('cp',)
    else:
        stage_name, stage_command = 'link_genome', ('ln', '-s')

    workflow.commandline(
        name=stage_name,
        ctx={'local': True},
        args=stage_command + (
            mgd.InputFile(ref_genome_fasta_file),
            mgd.OutputFile(local_ref_genome_fasta_path),
        ),
    )

    genome_index_args = (
        mgd.InputFile(local_ref_genome_fasta_path),
        mgd.OutputFile(ref_genome_index_prefix),
    )
    workflow.transform(
        name='build_bowtie_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_genome_index,
        args=genome_index_args,
    )

    transcriptome_index_args = (
        mgd.InputFile(ref_genome_index_prefix),
        mgd.InputFile(transcript_gtf_file),
        mgd.OutputFile(transcriptome_index_prefix),
    )
    workflow.transform(
        name='build_tophat_index',
        ctx={'mem': 8, 'num_retry': 3, 'mem_retry_increment': 8},
        func=tasks.build_transcriptome_index,
        args=transcriptome_index_args,
    )

    return workflow
Example #6
0
def create_gc_wig_file(config, genome_file, out_file):
    """Build a one-job workflow that runs ``gcCounter`` on a genome file.

    The command is ``gcCounter -w <window_size> <genome_file>`` with its
    standard output redirected to ``out_file``.

    :param config: mapping providing ``'window_size'``.
    :param genome_file: path of the input genome file.
    :param out_file: path that receives the command's standard output.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed

    gc_command = (
        'gcCounter',
        '-w',
        config['window_size'],
        managed.InputFile(genome_file),
        '>',
        managed.OutputFile(out_file),
    )

    workflow = Workflow()
    workflow.commandline(name='create_gc', ctx={'mem': 4}, args=gc_command)
    return workflow
Example #7
0
def create_pvacseq_workflow(
    vcf_file,
    hla_type_file,
    results_file,
    config,
):
    """Build a workflow that annotates a VCF and then runs pVACseq on it.

    Jobs registered:

    * ``vep`` -- runs ``variant_effect_predictor.pl`` on ``vcf_file`` and
      writes a temporary annotated VCF. The cache directory is
      ``config['vep_dir']`` and plugins are read from its ``Plugins``
      sub-directory.
    * ``run_pvacseq`` -- calls ``tasks.run_pvacseq`` with the annotated VCF,
      ``hla_type_file``, the ``results_file`` output, a temporary working
      space and ``config``.

    :param vcf_file: path of the input VCF file.
    :param hla_type_file: path of the HLA type input file.
    :param results_file: path of the output written by ``run_pvacseq``.
    :param config: mapping providing ``'vep_dir'``; also passed whole to
        ``tasks.run_pvacseq``.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed

    workflow = Workflow()

    io_options = (
        '--input_file', managed.InputFile(vcf_file),
        '--format', 'vcf',
        '--output_file', managed.TempOutputFile('vep_annotated.vcf'),
    )
    annotation_options = (
        '--vcf', '--symbol', '--terms', 'SO',
        '--plugin', 'Downstream',
        '--plugin', 'Wildtype',
    )
    cache_options = (
        '--cache', '--offline', '--force_overwrite',
        '--assembly', 'GRCh37',
        '--dir', config['vep_dir'],
        '--dir_plugins', os.path.join(config['vep_dir'], 'Plugins'),
    )

    workflow.commandline(
        name='vep',
        ctx={'mem': 16},
        args=(
            ('variant_effect_predictor.pl',)
            + io_options
            + annotation_options
            + cache_options
        ),
    )

    pvacseq_args = (
        managed.TempInputFile('vep_annotated.vcf'),
        managed.InputFile(hla_type_file),
        managed.OutputFile(results_file),
        managed.TempSpace('pvacseq_temp'),
        config,
    )
    workflow.transform(
        name='run_pvacseq',
        func=tasks.run_pvacseq,
        args=pvacseq_args,
    )

    return workflow
Example #8
0
def create_titan_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    """Build the Titan workflow for one normal sample and its tumour samples.

    Every key of ``seqdata_files`` other than ``normal_id`` is treated as a
    tumour sample and becomes a chunk of the ``sample_id`` axis. Jobs
    registered, in order: ``prepare_normal_data``, ``prepare_tumour_data``,
    ``create_intialization_parameters``, ``run_titan``, ``select_solution``,
    ``plot_chromosome`` and ``merge_results``.

    Per-sample results go to ``<raw_data_dir>/results/sample_<id>.h5`` and
    the per-sample tables and plots to ``<raw_data_dir>/output``. The parent
    directory of the results files is created immediately, when this
    function is called. ``merge_results`` combines the per-sample results
    into ``out_file``.

    :param seqdata_files: mapping of sample id to seqdata file path; must
        contain ``normal_id``.
    :param config: mapping passed to the task functions; its optional
        ``'chromosomes'`` entry selects the chromosomes to plot and falls
        back to the module-level ``default_chromosomes``.
    :param out_file: path of the merged output file.
    :param raw_data_dir: directory under which ``results`` and ``output``
        files are written.
    :param somatic_breakpoint_file: optional path forwarded to
        ``tasks.select_solution`` as ``breakpoints_filename``; ``None`` is
        forwarded unchanged.
    :param normal_id: key of the normal sample in ``seqdata_files``.
        Required despite its ``None`` default.
    :param kwargs: accepted but not used by this function.
    :returns: the populated ``Workflow``.
    :raises ValueError: if ``normal_id`` is ``None``.
    :raises KeyError: if ``normal_id`` is not a key of ``seqdata_files``.
    """
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    managed = pypeliner.managed

    normal_seqdata_file = seqdata_files[normal_id]
    # Work on a copy so the caller's mapping keeps its normal entry.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    # Per-sample path templates; the first and last are written by
    # select_solution and read back by plot_chromosome.
    cn_loci_template = os.path.join(raw_data_dir, 'output',
                                    '{sample_id}_cn_loci.tsv')
    cn_segments_template = os.path.join(raw_data_dir, 'output',
                                        '{sample_id}_cn_segments.tsv')
    cn_igv_template = os.path.join(raw_data_dir, 'output',
                                   '{sample_id}_cn_igv.tsv')
    params_template = os.path.join(raw_data_dir, 'output',
                                   '{sample_id}_params.tsv')
    chromosome_plot_template = os.path.join(
        raw_data_dir, 'output', '{sample_id}_chr_{chromosome}.png')

    workflow = Workflow()

    # A Python 3 ``dict.keys()`` view cannot be pickled, so hand the
    # workflow a concrete list of sample ids. On Python 2 this is just a
    # copy of the list that ``keys()`` already returned.
    workflow.setobj(
        obj=managed.OutputChunks('sample_id'),
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            managed.InputFile(normal_seqdata_file),
            managed.TempOutputFile('normal.wig'),
            managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            managed.InputFile('tumour_seqdata',
                              'sample_id',
                              fnames=tumour_seqdata_files),
            managed.TempInputFile('het_positions.tsv'),
            managed.TempOutputFile('tumour.wig', 'sample_id'),
            managed.TempOutputFile('tumour_alleles.tsv', 'sample_id'),
            config,
        ),
    )

    # The returned object is split on a second axis, init_param_id, which
    # run_titan below is then run over.
    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.create_intialization_parameters,
        ret=managed.TempOutputObj('init_params', 'sample_id',
                                  'init_param_id'),
        args=(config, ),
    )

    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.run_titan,
        args=(
            managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            managed.TempInputFile('normal.wig'),
            managed.TempInputFile('tumour.wig', 'sample_id'),
            managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            managed.TempOutputFile('cn.tsv', 'sample_id', 'init_param_id'),
            managed.TempOutputFile('params.tsv', 'sample_id',
                                   'init_param_id'),
            config,
        ),
    )

    # Only wrap the breakpoint file as a managed input when one was given;
    # otherwise None is forwarded to the task as-is.
    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = managed.InputFile(somatic_breakpoint_file)

    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.select_solution,
        args=(
            managed.TempInputObj('init_params', 'sample_id', 'init_param_id'),
            managed.TempInputFile('cn.tsv', 'sample_id', 'init_param_id'),
            managed.TempInputFile('params.tsv', 'sample_id', 'init_param_id'),
            managed.OutputFile('results',
                               'sample_id',
                               template=results_files),
            managed.OutputFile(cn_loci_template, 'sample_id'),
            managed.OutputFile(cn_segments_template, 'sample_id'),
            managed.OutputFile(cn_igv_template, 'sample_id'),
            managed.OutputFile(params_template, 'sample_id'),
            config,
            managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    # Define the chromosome axis nested under every sample.
    workflow.setobj(obj=managed.OutputChunks('sample_id', 'chromosome'),
                    value=config.get('chromosomes', default_chromosomes),
                    axes=('sample_id', ))

    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            'plot_titan_chromosome.R',
            managed.Instance('chromosome'),
            managed.InputFile(cn_loci_template, 'sample_id'),
            managed.InputFile(params_template, 'sample_id'),
            managed.OutputFile(chromosome_plot_template, 'sample_id',
                               'chromosome'),
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            managed.InputFile('results',
                              'sample_id',
                              template=results_files),
            managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow
Example #9
0
def create_ref_genome_download_and_index_workflow(config, out_file):
    """Build a workflow that downloads a reference genome and indexes it.

    When ``config['url']`` ends with ``'gz'`` the download goes to a
    temporary file that is then decompressed with ``gzip -cd`` into
    ``out_file``; otherwise the download is written to ``out_file``
    directly. Three further jobs run ``samtools dict``, ``samtools faidx``
    and ``bwa index`` on ``out_file``, each with its standard output
    redirected to ``out_file + '.<job name>.log'``.

    :param config: mapping providing ``'url'``.
    :param out_file: path of the reference FASTA to produce.
    :returns: the populated ``Workflow``.
    """
    managed = pypeliner.managed

    workflow = Workflow()

    is_gzipped = config['url'].endswith('gz')

    # A compressed download lands in a temp file first; a plain one is
    # already the final output.
    if is_gzipped:
        download_target = managed.TempOutputFile('ref.fasta.gz')
    else:
        download_target = managed.OutputFile(out_file)

    workflow.subworkflow(
        name='download',
        func=download.create_download_workflow,
        args=(config['url'], download_target),
    )

    if is_gzipped:
        workflow.commandline(
            name='gunzip',
            args=(
                'gzip', '-cd',
                managed.TempInputFile('ref.fasta.gz'),
                '>',
                managed.OutputFile(out_file),
            ),
        )

    # Each entry is (job name, command prefix); the log suffix is derived
    # from the job name.
    index_jobs = (
        ('build_dict', ('samtools', 'dict')),
        ('build_fai', ('samtools', 'faidx')),
        ('build_bwa_index', ('bwa', 'index')),
    )
    for job_name, command in index_jobs:
        log_suffix = '.{}.log'.format(job_name)
        workflow.commandline(
            name=job_name,
            ctx={'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2},
            args=command + (
                managed.InputFile(out_file),
                '>',
                managed.OutputFile(out_file + log_suffix),
            ),
        )

    return workflow
Example #10
0
def create_setup_reference_dbs_workflow(config):
    """Build a workflow that fetches whichever reference databases are configured.

    A job or sub-workflow is added for each of these keys present in
    ``config``: ``'cosmic'``, ``'dbsnp'``, ``'mappability'``,
    ``'ref_genome'`` (only when it also has a ``'url'`` entry),
    ``'snpeff'`` and ``'chrom_info'``. Sections that are absent are skipped.

    :param config: mapping of section name to that section's settings.
    :returns: the populated ``Workflow``, empty if no section is configured.
    """
    managed = pypeliner.managed

    workflow = Workflow()

    def add_plain_download(section):
        # Download config[section]['url'] straight to its 'local_path'.
        workflow.subworkflow(
            name=section,
            func=download.create_download_workflow,
            args=(
                config[section]['url'],
                managed.OutputFile(config[section]['local_path']),
            ),
        )

    if 'cosmic' in config:
        cosmic_args = (
            config['cosmic'],
            managed.OutputFile(config['cosmic']['local_path']),
            managed.TempSpace('cosmic_work', cleanup=None),
        )
        workflow.transform(
            name='cosmic',
            func=tasks.download_cosmic,
            args=cosmic_args,
        )

    if 'dbsnp' in config:
        dbsnp_args = (
            config['dbsnp'],
            managed.OutputFile(config['dbsnp']['local_path']),
        )
        workflow.subworkflow(
            name='dbsnp',
            func=create_dbsnp_download_workflow,
            args=dbsnp_args,
        )

    if 'mappability' in config:
        add_plain_download('mappability')

    # The reference genome is only fetched when a URL is configured for it.
    if 'ref_genome' in config and 'url' in config['ref_genome']:
        ref_genome_args = (
            config['ref_genome'],
            managed.OutputFile(config['ref_genome']['local_path']),
        )
        workflow.subworkflow(
            name='ref_genome',
            func=create_ref_genome_download_and_index_workflow,
            args=ref_genome_args,
        )

    if 'snpeff' in config:
        workflow.commandline(
            name='snpeff',
            args=('snpEff', 'download', config['snpeff']['db']),
        )

    if 'chrom_info' in config:
        add_plain_download('chrom_info')

    return workflow