Example #1
0
def _create_download_decompress_concat_workflow(urls,
                                                out_file,
                                                local_download=False):
    """ Build a workflow that fetches several files and concatenates them.

    Each URL is downloaded and decompressed by a sub-workflow into a
    temporary file; the temporaries are then concatenated with `cat` into
    `out_file`.

    :param urls: iterable of URLs to download.
    :param out_file: path for the concatenated output file.
    :param local_download: forwarded to each download sub-workflow.
    """
    workflow = pypeliner.workflow.Workflow()

    local_files = []

    for idx, url in enumerate(urls):
        local_file = mgd.TempFile('file_{}'.format(idx))

        local_files.append(local_file)

        # Stash the URL as a managed object so the sub-workflow can read it.
        workflow.setobj(mgd.TempOutputObj('url_{}'.format(idx)), value=url)

        workflow.subworkflow(
            name='download_file_{}'.format(idx),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(idx)),
                local_file.as_output(),
            ),
            kwargs={'local_download': local_download},
        )

    # Shell-style concatenation of all downloaded files into the output.
    concat_args = ['cat']

    concat_args.extend(f.as_input() for f in local_files)

    concat_args.extend(['>', mgd.OutputFile(out_file)])

    workflow.commandline(name='concat', args=concat_args)

    return workflow
Example #2
0
def create_db_workflow(in_file,
                       ref_proteome_fasta_file,
                       out_file,
                       genome_version='GRCh37',
                       pyensembl_cache_dir=None):
    """ Build a workflow producing a variant protein sequence database.

    Cleans the reference proteome IDs, tabulates variants from `in_file`,
    builds a variant FASTA and concatenates it with the cleaned reference
    into `out_file`.

    :param in_file: input variant file.
    :param ref_proteome_fasta_file: reference proteome FASTA.
    :param out_file: path for the combined FASTA database.
    :param genome_version: genome build passed to the variant table task.
    :param pyensembl_cache_dir: optional pyensembl cache directory.
    """
    # The variant annotation task needs varcode available in the sandbox.
    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(
        name='clean_ref_fasta',
        func=tasks.clean_ref_proteome_ids,
        args=(
            mgd.InputFile(ref_proteome_fasta_file),
            mgd.TempOutputFile('ref.fasta'),
        ),
    )

    workflow.transform(
        name='build_variant_table',
        func=tasks.build_variant_table,
        args=(
            mgd.InputFile(in_file),
            mgd.TempOutputFile('variant_table.tsv.gz'),
        ),
        kwargs={
            'genome_version': genome_version,
            'pyensembl_cache_dir': pyensembl_cache_dir,
        },
    )

    workflow.transform(
        name='build_variant_fasta',
        func=tasks.build_variant_fasta,
        args=(
            mgd.TempInputFile('variant_table.tsv.gz'),
            mgd.TempOutputFile('var.fasta'),
        ),
    )

    # Final database = cleaned reference followed by the variant sequences.
    workflow.commandline(
        name='build_db',
        args=(
            'cat',
            mgd.TempInputFile('ref.fasta'),
            mgd.TempInputFile('var.fasta'),
            '>',
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #3
0
def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    """ Build a workflow that HLA types a sample with OptiType.

    Extracts read pairs mapped to chromosome 6 from the BAM, converts them
    to paired FASTQ files and runs OptiType on the result.

    :param bam_file: input BAM file.
    :param hla_type_file: path for the OptiType output.
    :param is_rna: whether the input is RNA-seq data (forwarded to OptiType task).
    :param threads: number of threads for the OptiType step.
    """
    # Reference naming differs between builds ('chr6' vs plain '6').
    chrom_str = 'chr6' if check_chr_prefix(bam_file) else '6'

    sandbox = soil.utils.workflow.get_sandbox(
        ['optitype', 'razers3', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # view -f 2 keeps properly paired reads and -F 4 drops unmapped ones;
    # collate groups mates together so bam2fq can split them into R1/R2.
    extract_cmd = (
        'samtools', 'view', '-bh',
        '-f', '2',
        '-F', '4',
        mgd.InputFile(bam_file),
        chrom_str,
        '|',
        'samtools', 'collate', '-O', '-',
        mgd.TempSpace('chr6_collate_temp'),
        '|',
        'samtools', 'bam2fq',
        '-1', mgd.TempOutputFile('chr6_reads_1.fq'),
        '-2', mgd.TempOutputFile('chr6_reads_2.fq'),
        '-',
    )

    workflow.commandline(name='extract_chr6', args=extract_cmd)

    optitype_ctx = {
        'mem': 24,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': threads,
    }

    workflow.transform(
        name='optitype',
        ctx=optitype_ctx,
        func=tasks.run_optitype,
        args=(
            mgd.TempInputFile('chr6_reads_1.fq'),
            mgd.TempInputFile('chr6_reads_2.fq'),
            mgd.OutputFile(hla_type_file),
            mgd.TempSpace('optitype_temp'),
        ),
        kwargs={
            'is_rna': is_rna,
            'threads': threads,
        },
    )

    return workflow
Example #4
0
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):
    """ Build a workflow aligning paired FASTQs with STAR.

    Aligns with STAR, sorts the resulting BAM with sambamba and indexes the
    sorted BAM with samtools.

    :param fastq_file_1: path to read 1 FASTQ.
    :param fastq_file_2: path to read 2 FASTQ.
    :param ref_genome_dir: STAR reference genome directory.
    :param out_bam_file: path for the sorted output BAM; a '.bai' index is
        written alongside.
    :param add_xs_tag: forwarded to the align task.
    :param align_threads: threads for the STAR alignment step.
    :param read_group_info: optional read group info forwarded to the align task.
    :param sort_threads: threads for the sort step.
    """
    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    align_ctx = {
        'mem': 32,
        'mem_retry_increment': 16,
        'num_retry': 3,
        'threads': align_threads,
    }

    workflow.transform(
        name='star_align',
        ctx=align_ctx,
        func=tasks.align,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            ref_genome_dir,
            mgd.TempOutputFile('aligned.bam'),
            mgd.TempSpace('align_tmp'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'read_group_info': read_group_info,
            'threads': align_threads,
        },
    )

    sort_ctx = {
        'mem': 32,
        'mem_retry_increment': 16,
        'num_retry': 3,
        'threads': sort_threads,
    }

    workflow.transform(
        name='sort',
        ctx=sort_ctx,
        func=soil.wrappers.sambamba.tasks.sort,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('sort_tmp'),
        ),
        kwargs={'threads': sort_threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools',
            'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow
Example #5
0
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):
    """ Build a workflow calling SNPs with samtools mpileup and VarScan.

    The genome is split into regions of roughly `split_size` bases which are
    piled up and called in parallel; per-region VCFs are compressed and
    concatenated into `out_file`.

    :param bam_file: input BAM file.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the merged output VCF.
    :param chromosomes: optional chromosome subset for region generation.
    :param split_size: approximate region size in bases.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    # One 'regions' axis chunk per genome region.
    regions = soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=regions,
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools', 'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        ),
    )

    # VarScan calling needs more memory than the default context provides.
    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #6
0
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):
    """ Build a workflow preparing an EAGLE reference panel from remote VCFs.

    Downloads one VCF per autosome from `vcf_url_template`, renames
    chromosomes according to the packaged GRCh37 chromosome map, then
    concatenates the per-chromosome files into a single indexed BCF.

    :param vcf_url_template: URL template with a `{chrom}` placeholder.
    :param out_file: path for the merged BCF; a '.csi' index is written alongside.
    :param local_download: run the download tasks locally when True.
    """
    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    # Restrict to the autosomes (NCBI names '1' through '22').
    autosome_names = [str(i) for i in range(1, 23)]

    chrom_map = chrom_map[chrom_map['ncbi'].isin(autosome_names)]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda chrom: vcf_url_template.format(chrom=chrom))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # One 'chrom' axis chunk per autosome URL.
    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(
        name='download_vcf_files',
        axes=('chrom', ),
        ctx={'local': local_download},
        func=soil.ref_data.tasks.download,
        args=(
            mgd.TempInputObj('vcf_url', 'chrom'),
            mgd.TempOutputFile('raw.vcf.gz', 'chrom'),
        ),
    )

    workflow.transform(
        name='write_chrom_map',
        func=tasks.write_chrom_map_file,
        args=(
            mgd.InputFile(chrom_map_file),
            mgd.TempOutputFile('chrom_map.tsv'),
        ),
    )

    workflow.transform(
        name='rename_chroms',
        axes=('chrom', ),
        func=soil.wrappers.bcftools.tasks.rename_chroms,
        args=(
            mgd.TempInputFile('chrom_map.tsv'),
            mgd.TempInputFile('raw.vcf.gz', 'chrom'),
            mgd.TempOutputFile('renamed.bcf', 'chrom'),
        ),
    )

    workflow.transform(
        name='concat_vcfs',
        func=soil.wrappers.bcftools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('renamed.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
        kwargs={'bcf_output': True},
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index',
            mgd.InputFile(out_file),
            '-o', mgd.OutputFile(out_file + '.csi'),
        ),
    )

    return workflow
Example #7
0
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_fasta_file,
                          out_bam_file,
                          threads=1):
    """ Build a workflow aligning paired FASTQs with BWA.

    Runs the BWA align sub-workflow, marks duplicates with sambamba and
    indexes the final BAM with samtools.

    :param fastq_file_1: path to read 1 FASTQ.
    :param fastq_file_2: path to read 2 FASTQ.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_bam_file: path for the duplicate-marked output BAM; a '.bai'
        index is written alongside.
    :param threads: threads used for alignment, sorting and duplicate marking.
    """
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=(
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('aligned.bam'),
        ),
        kwargs={
            'align_threads': threads,
            'sort_threads': threads,
        },
    )

    workflow.transform(
        name='mark_dups',
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('aligned.bam'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('mark_dups_tmp'),
        ),
        kwargs={'threads': threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow
Example #8
0
def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    """ Build a basic MiXCR immune repertoire workflow.

    Runs MiXCR align, assemble and exportClones on paired FASTQs and
    gzips the exported clone table into `out_file`.

    :param fastq_file_1: path to read 1 FASTQ.
    :param fastq_file_2: path to read 2 FASTQ.
    :param out_file: path for the gzipped clone table.
    :param threads: threads for the alignment step.
    """
    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    align_ctx = {
        'mem': 32,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': threads,
    }

    workflow.commandline(
        name='align',
        ctx=align_ctx,
        args=(
            'mixcr', 'align', '-f', '-t', threads,
            mgd.InputFile(fastq_file_1),
            mgd.InputFile(fastq_file_2),
            mgd.TempOutputFile('alignments.vdjca'),
        ),
    )

    assemble_ctx = {
        'mem': 16,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': threads,
    }

    # NOTE(review): assembly thread count is pinned to 1 here even though the
    # context requests `threads` — confirm whether this is intentional.
    workflow.commandline(
        name='assemble',
        ctx=assemble_ctx,
        args=(
            'mixcr', 'assemble', '-f', '-t', 1,
            mgd.TempInputFile('alignments.vdjca'),
            mgd.TempOutputFile('clones.clns'),
        ),
    )

    export_ctx = {
        'mem': 16,
        'mem_retry_increment': 8,
        'num_retry': 3,
    }

    workflow.commandline(
        name='export',
        ctx=export_ctx,
        args=(
            'mixcr', 'exportClones', '-f',
            mgd.TempInputFile('clones.clns'),
            mgd.TempOutputFile('results.tsv'),
        ),
    )

    workflow.commandline(
        name='compress',
        args=(
            'gzip', '-c',
            mgd.TempInputFile('results.tsv'),
            '>', mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #9
0
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes='default',
                            is_exome=False,
                            split_size=int(1e7)):
    """ Build a Strelka somatic variant calling workflow.

    Calls somatic variants on a tumour/normal pair over parallel genome
    regions, merges the per-region indel and SNV calls, filters to PASS
    records with bcftools and indexes the final compressed VCF.

    :param normal_bam_file: path to the normal sample BAM.
    :param tumour_bam_file: path to the tumour sample BAM.
    :param ref_genome_fasta_file: path to the reference genome FASTA.
    :param out_file: path for the filtered output VCF; a '.tbi' index is
        written alongside.
    :param chromosomes: chromosome selection passed to the region and
        chromosome helpers; defaults to 'default'.
    :param is_exome: forwarded to the Strelka genome segment caller.
    :param split_size: approximate region size (bases) for parallelisation.
    """

    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'strelka'])

    workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx,
                                           default_sandbox=sandbox)

    # One 'regions' axis chunk per genome region for parallel calling.
    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    # One 'chrom_axis' chunk per chromosome for depth estimation.
    workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'),
                    value=get_chromosomes(normal_bam_file,
                                          chromosomes=chromosomes))

    workflow.transform(
        name='count_fasta_bases',
        func=soil.wrappers.strelka.tasks.count_fasta_bases,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    # Runs locally (ctx={'local': True}) and outside the sandbox.
    workflow.transform(
        name='get_genome_size',
        ctx={'local': True},
        func=get_known_genome_size,
        ret=mgd.TempOutputObj('genome_size'),
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
        sandbox=None,
    )

    workflow.transform(
        name='get_chromosome_depths',
        axes=('chrom_axis', ),
        func=soil.wrappers.strelka.tasks.get_chromosome_depth,
        args=(
            mgd.TempInputObj('chrom_names', 'chrom_axis'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'),
        ),
    )

    workflow.transform(
        name='merge_chromosome_depths',
        func=soil.wrappers.strelka.tasks.merge_chromosome_depth,
        args=(
            mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'),
            mgd.TempOutputFile('chrom_depth_merged.txt'),
        ),
        sandbox=None,
    )

    # Per-region Strelka calling; emits separate indel and SNV VCFs.
    workflow.transform(name='call_genome_segment',
                       axes=('regions', ),
                       func=soil.wrappers.strelka.tasks.call_genome_segment,
                       args=(
                           mgd.TempInputFile('chrom_depth_merged.txt'),
                           mgd.InputFile(normal_bam_file),
                           mgd.InputFile(tumour_bam_file),
                           mgd.InputFile(ref_genome_fasta_file),
                           mgd.TempOutputFile('indels.vcf', 'regions'),
                           mgd.TempOutputFile('snvs.vcf', 'regions'),
                           mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                           mgd.TempInputObj('config', 'regions'),
                           mgd.TempInputObj('genome_size'),
                       ),
                       kwargs={
                           'is_exome': is_exome,
                       })

    workflow.transform(
        name='merge_indels',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz'),
        ),
    )

    # Combine indels and SNVs; allow_overlap since both cover the same regions.
    workflow.transform(
        name='merge_all',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('indels.vcf.gz'),
                mgd.TempInputFile('snvs.vcf.gz')
            ],
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'allow_overlap': True,
        },
    )

    # Keep only records with FILTER of '.' or PASS, writing compressed output.
    workflow.commandline(name='filter_vcf',
                         ctx=low_mem_ctx,
                         args=(
                             'bcftools',
                             'view',
                             '-O',
                             'z',
                             '-f',
                             '.,PASS',
                             '-o',
                             mgd.OutputFile(out_file),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    workflow.transform(name='index_vcf',
                       ctx=low_mem_ctx,
                       func=soil.wrappers.samtools.tasks.index_vcf,
                       args=(
                           mgd.InputFile(out_file),
                           mgd.OutputFile(out_file + '.tbi'),
                       ))

    return workflow
Example #10
0
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Phase a target variant file against a reference panel using EAGLE.

    Splits both the reference panel and the target by chromosome, runs
    EAGLE per chromosome, then concatenates the phased chunks and indexes
    the result.

    :param genetic_map_file: EAGLE genetic map file.
    :param ref_file: reference panel variant file.
    :param target_file: target variant file to phase.
    :param out_file: path for the phased output; a '.tbi' index is written
        alongside.
    """
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(
        default_ctx=default_ctx, default_sandbox=sandbox)

    # One 'chrom' axis chunk per chromosome present in the target file.
    workflow.setobj(
        obj=mgd.TempOutputObj('chrom', 'chrom'),
        value=get_chromosomes(target_file),
    )

    # Reference and target are split by chromosome with the same task.
    split_jobs = (
        ('split_ref', ref_file, 'ref.bcf'),
        ('split_target', target_file, 'target.bcf'),
    )

    for job_name, variant_file, chunk_name in split_jobs:
        workflow.transform(
            name=job_name,
            axes=('chrom',),
            func=tasks.get_chrom_variant_file,
            args=(
                mgd.TempInputObj('chrom', 'chrom'),
                mgd.InputFile(variant_file),
                mgd.TempOutputFile(chunk_name, 'chrom'),
            ),
        )

    workflow.transform(
        name='run_eagle',
        axes=('chrom',),
        func=tasks.run_eagle,
        args=(
            mgd.InputFile(genetic_map_file),
            mgd.TempInputFile('ref.bcf', 'chrom'),
            mgd.TempInputFile('target.bcf', 'chrom'),
            mgd.TempOutputFile('phased.bcf', 'chrom'),
            mgd.TempSpace('eagle_tmp', 'chrom'),
        ),
    )

    workflow.transform(
        name='concat_results',
        func=tasks.concat_results,
        args=(
            mgd.TempInputFile('phased.bcf', 'chrom'),
            mgd.OutputFile(out_file),
        ),
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools', 'index', '-t',
            '-o', mgd.OutputFile(out_file + '.tbi'),
            mgd.InputFile(out_file),
        ),
    )

    return workflow
Example #11
0
def create_multiple_lane_align_workflow(fastq_files_1,
                                        fastq_files_2,
                                        ref_genome_dir,
                                        out_bam_file,
                                        add_xs_tag=False,
                                        align_threads=1,
                                        merge_threads=1,
                                        read_group_info=None,
                                        sort_threads=1):
    """ Align multiple lanes of paired FASTQs and merge into one BAM.

    Each lane is aligned by the single-lane align workflow; the per-lane
    BAMs are then duplicate-marked/merged with sambamba and indexed with
    samtools.

    :param fastq_files_1: mapping of lane key to read 1 FASTQ path.
    :param fastq_files_2: mapping of lane key to read 2 FASTQ path.
    :param ref_genome_dir: reference genome directory for alignment.
    :param out_bam_file: path for the merged output BAM; a '.bai' index is
        written alongside.
    :param add_xs_tag: forwarded to each lane's align workflow.
    :param align_threads: threads per lane alignment.
    :param merge_threads: threads for the merge/markdups step.
    :param read_group_info: optional mapping of lane key to read group info;
        defaults to no read group per lane.
    :param sort_threads: threads per lane sort.
    """
    if read_group_info is None:
        # Default: no read group information for any lane.
        read_group_info = {key: None for key in fastq_files_1}

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # One 'lane' axis chunk per lane key.
    workflow.setobj(
        obj=mgd.TempOutputObj('read_group_info', 'lane'),
        value=read_group_info,
    )

    workflow.subworkflow(
        name='align',
        axes=('lane', ),
        func=create_align_workflow,
        args=(
            mgd.InputFile('R1.fq.gz', 'lane', fnames=fastq_files_1),
            mgd.InputFile('R2.fq.gz', 'lane', fnames=fastq_files_2),
            ref_genome_dir,
            mgd.TempOutputFile('lane.bam', 'lane'),
        ),
        kwargs={
            'add_xs_tag': add_xs_tag,
            'align_threads': align_threads,
            'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
            'sort_threads': sort_threads,
        },
    )

    merge_ctx = {
        'mem': 24,
        'mem_retry_increment': 8,
        'num_retry': 3,
        'threads': merge_threads,
    }

    # axes=() collapses the 'lane' axis: all lane BAMs feed one merge job.
    workflow.transform(
        name='markdups_and_merge',
        axes=(),
        ctx=merge_ctx,
        func=soil.wrappers.sambamba.tasks.markdups,
        args=(
            mgd.TempInputFile('lane.bam', 'lane'),
            mgd.OutputFile(out_bam_file),
            mgd.TempSpace('markdup_tmp'),
        ),
        kwargs={'threads': merge_threads},
    )

    workflow.commandline(
        name='index',
        args=(
            'samtools', 'index',
            mgd.InputFile(out_bam_file),
            mgd.OutputFile(out_bam_file + '.bai'),
        ),
    )

    return workflow
Example #12
0
def create_vardict_paired_workflow(normal_bam_file,
                                   tumour_bam_file,
                                   ref_genome_fasta_file,
                                   out_file,
                                   chromosomes=None,
                                   split_size=int(5e6)):
    """ Build a VarDict paired (tumour/normal) variant calling workflow.

    Runs VarDict per genome region, applies the somatic test, writes and
    compresses per-region VCFs, concatenates them, keeps PASS records and
    finally retains only calls flagged StrongSomatic.

    :param normal_bam_file: path to the normal sample BAM.
    :param tumour_bam_file: path to the tumour sample BAM.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the final somatic VCF (compressed).
    :param chromosomes: optional chromosome subset for region generation.
    :param split_size: approximate region size in bases.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'vardict', 'vardict-java'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx,
                                           default_sandbox=sandbox)

    # One 'regions' axis chunk per genome region.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(
            normal_bam_file, split_size, chromosomes=chromosomes),
    )

    # VarDict itself needs more memory than the low-memory default context.
    workflow.transform(
        name='run_vardict',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_vardict_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('call.tsv', 'regions'),
        ),
    )

    workflow.transform(
        name='test_somatic',
        axes=('regions', ),
        func=tasks.run_test_somatic,
        args=(
            mgd.TempInputFile('call.tsv', 'regions'),
            mgd.TempOutputFile('somatic.tsv', 'regions'),
        ),
    )

    workflow.transform(
        name='write_vcf',
        axes=('regions', ),
        func=tasks.run_build_paired_vcf,
        args=(
            mgd.TempInputFile('somatic.tsv', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
    )

    workflow.commandline(
        name='compress_vcf',
        axes=('regions', ),
        args=(
            'bcftools', 'view', '-O', 'z',
            '-o', mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputFile('region.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
    )

    # Keep only records with FILTER of '.' or PASS.
    workflow.commandline(
        name='filter_vcf',
        args=(
            'bcftools', 'view', '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.TempOutputFile('filtered.vcf.gz'),
            mgd.TempInputFile('merged.vcf.gz'),
        ),
    )

    # Retain only calls VarDict flagged as StrongSomatic.
    workflow.commandline(
        name='filter_somatics',
        args=(
            'bcftools', 'filter',
            '-i', 'INFO/STATUS[0]="StrongSomatic"',
            '-O', 'z',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('filtered.vcf.gz'),
        ),
    )

    return workflow
Example #13
0
def create_mutect_paired_workflow(normal_bam_file,
                                  tumour_bam_file,
                                  ref_genome_fasta_file,
                                  out_file,
                                  chromosomes=None,
                                  normal_name='normal',
                                  split_size=int(1e7),
                                  tumour_name='tumour'):
    """ Build a MuTect paired (tumour/normal) variant calling workflow.

    Runs MuTect per genome region, applies the MuTect filter, concatenates
    the flagged per-region VCFs and keeps PASS records in the final output.

    :param normal_bam_file: path to the normal sample BAM.
    :param tumour_bam_file: path to the tumour sample BAM.
    :param ref_genome_fasta_file: reference genome FASTA.
    :param out_file: path for the final filtered VCF (compressed).
    :param chromosomes: optional chromosome subset for region generation.
    :param normal_name: fallback normal sample name if not found in the BAM.
    :param split_size: approximate region size in bases.
    :param tumour_name: fallback tumour sample name if not found in the BAM.
    """
    # Resolve sample names from the BAMs, falling back to the given defaults.
    normal_name = get_sample(normal_bam_file, normal_name)

    tumour_name = get_sample(tumour_bam_file, tumour_name)

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'gatk', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx,
                                           default_sandbox=sandbox)

    # One 'regions' axis chunk per genome region.
    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(
            normal_bam_file, split_size, chromosomes=chromosomes),
    )

    # MuTect needs more memory than the low-memory default context.
    workflow.transform(
        name='run_mutect',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_mutect_paired,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        ),
        kwargs={
            'normal_name': normal_name,
            'tumour_name': tumour_name,
        },
    )

    workflow.transform(
        name='run_mutect_filter',
        axes=('regions', ),
        ctx=med_mem_ctx,
        func=tasks.run_filter_mutect,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('flagged.vcf', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('flagged.vcf', 'regions'),
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
    )

    # Keep only records with FILTER of '.' or PASS.
    workflow.commandline(
        name='filter_vcf',
        ctx=low_mem_ctx,
        args=(
            'bcftools', 'view', '-O', 'z',
            '-f', '.,PASS',
            '-o', mgd.OutputFile(out_file),
            mgd.TempInputFile('merged.vcf.gz'),
        ),
    )

    return workflow
Example #14
0
def create_index_ref_data_workflow(out_dir, cosmic=False, threads=1):
    """ Create index files for references.

    Builds BWA, STAR, samtools, picard and kallisto indexes for the
    reference data rooted at `out_dir`, plus a BWA mappability track and
    tabix indexes for the dbSNP (and optionally COSMIC) VCFs.

    This workflow is extremely compute and memory heavy. It should be run on a cluster with large memory nodes
    available.

    :param out_dir: root directory of the soil reference data layout.
    :param cosmic: also index the COSMIC VCF when True.
    :param threads: threads for the mappability and STAR indexing steps.
    """
    ref_data_paths = soil.ref_data.paths.SoilRefDataPaths(out_dir)

    sandbox = soil.utils.workflow.get_sandbox(
        ['bwa', 'bcftools', 'kallisto', 'picard', 'samtools', 'star'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    # Hard-link the genome FASTA so BWA index files live beside their own copy.
    workflow.commandline(
        name='link_bwa_ref',
        args=('ln', mgd.InputFile(ref_data_paths.genome_fasta_file),
              mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file)))

    # The '.bwa_index.done' sentinel file tracks completion of the index build.
    workflow.transform(
        name='bwa_index_ref_genome',
        ctx={
            'mem': 8,
            'mem_retry_increment': 8,
            'num_retry': 3
        },
        func=soil.wrappers.bwa.tasks.index,
        args=(mgd.InputFile(ref_data_paths.bwa_genome_fasta_file),
              mgd.OutputFile(ref_data_paths.bwa_genome_fasta_file +
                             '.bwa_index.done')))

    workflow.subworkflow(
        name='build_bwa_mappability_file',
        func=tasks.mappability_wrapper,
        args=(mgd.InputFile(ref_data_paths.bwa_genome_fasta_file +
                            '.bwa_index.done'),
              mgd.OutputFile(ref_data_paths.genome_bwa_mappability_wig_file)),
        kwargs={
            'k': 100,
            'max_map_qual': 60,
            'threads': threads
        })

    # Hard-link the genome FASTA again for the STAR index location.
    workflow.commandline(
        name='link_star_ref',
        args=('ln', mgd.InputFile(ref_data_paths.genome_fasta_file),
              mgd.OutputFile(ref_data_paths.star_genome_fasta_file)))

    workflow.transform(
        name='star_index_ref_genome',
        ctx={
            'mem': 32,
            'mem_retry_increment': 16,
            'num_retry': 3,
            'threads': threads
        },
        func=soil.wrappers.star.tasks.index,
        args=(mgd.InputFile(ref_data_paths.star_genome_fasta_file),
              mgd.InputFile(ref_data_paths.gene_annotations_gtf_file),
              mgd.OutputFile(ref_data_paths.star_genome_fasta_file +
                             '.star_index.done')),
        kwargs={'threads': threads})

    workflow.transform(name='samtools_index_ref_genome',
                       func=soil.wrappers.samtools.tasks.index_fasta,
                       args=(mgd.InputFile(ref_data_paths.genome_fasta_file),
                             mgd.OutputFile(ref_data_paths.genome_fasta_file +
                                            '.fai')))

    # Picard sequence dictionary; R=/O= are picard-style key=value arguments.
    workflow.commandline(
        name='build_ref_genom_dict',
        args=('picard', 'CreateSequenceDictionary', 'R={}'.format(
            mgd.InputFile(ref_data_paths.genome_fasta_file)), 'O={}'.format(
                mgd.OutputFile(
                    os.path.splitext(ref_data_paths.genome_fasta_file)[0] +
                    '.dict'))))

    workflow.transform(
        name='kallisto_index',
        ctx={
            'mem': 4,
            'mem_retry_increment': 4,
            'num_retry': 3
        },
        func=soil.wrappers.kallisto.tasks.build_index,
        args=(mgd.InputFile(ref_data_paths.transcriptome_fasta_file),
              mgd.OutputFile(ref_data_paths.kallisto_index_file)),
        kwargs={'kmer_length': 31})

    # COSMIC indexing is optional since the VCF requires a licensed download.
    if cosmic:
        workflow.transform(
            name='index_cosmic',
            func=soil.wrappers.samtools.tasks.index_vcf,
            args=(mgd.InputFile(ref_data_paths.cosmic_vcf_file),
                  mgd.OutputFile(ref_data_paths.cosmic_vcf_file + '.tbi')))

    workflow.transform(name='index_dbsnp',
                       func=soil.wrappers.samtools.tasks.index_vcf,
                       args=(mgd.InputFile(ref_data_paths.dbsnp_vcf_file),
                             mgd.OutputFile(ref_data_paths.dbsnp_vcf_file +
                                            '.tbi')))

    return workflow
Example #15
0
def create_allele_counts_workflow(normal_bam_file,
                                  tumour_bam_file,
                                  dbsnp_vcf_file,
                                  ref_genome_fasta_file,
                                  allele_counts_file,
                                  chromosomes='autosomes'):
    """Build a workflow producing tumour allele counts at heterozygous SNPs.

    SNPs are called on the normal sample with Platypus, annotated against
    dbSNP and by variant type with SnpSift, and filtered to known ('rs')
    heterozygous SNPs.  The tumour BAM is then queried for allele counts at
    those sites in parallel chunks, and the chunk tables are merged into a
    single output file.

    :param normal_bam_file: path of the normal sample BAM file.
    :param tumour_bam_file: path of the tumour sample BAM file.
    :param dbsnp_vcf_file: path of a dbSNP VCF file used for annotation.
    :param ref_genome_fasta_file: path of the reference genome FASTA file.
    :param allele_counts_file: path where the merged counts table is written.
    :param chromosomes: chromosome selection forwarded to
        ``load_bam_chromosome_lengths``; defaults to ``'autosomes'``.
    :return: a ``pypeliner`` Workflow object.
    """

    def snpsift_ctx():
        # Fresh resource context per task so nothing is shared/mutated.
        return {'mem': 6, 'mem_retry_increment': 4, 'num_retry': 3}

    # Resolve the chromosome selection into concrete names from the BAM header.
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, chromosomes)

    sandbox = soil.utils.workflow.get_sandbox(['snpsift'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='call_snps',
        func=soil.wrappers.platypus.workflows.create_single_sample_workflow,
        args=(mgd.InputFile(normal_bam_file),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempOutputFile('normal.vcf.gz')),
        kwargs={'chromosomes': chromosomes, 'split_size': int(1e7)})

    workflow.commandline(
        name='annotate_dbsnp_status',
        ctx=snpsift_ctx(),
        args=('SnpSift', 'annotate', mgd.InputFile(dbsnp_vcf_file),
              mgd.TempInputFile('normal.vcf.gz'), '>',
              mgd.TempOutputFile('normal.dbsnp.vcf')))

    workflow.commandline(
        name='annotate_variant_type',
        ctx=snpsift_ctx(),
        args=('SnpSift', 'varType', mgd.TempInputFile('normal.dbsnp.vcf'),
              '>', mgd.TempOutputFile('normal.dbsnp.vartype.vcf')))

    workflow.commandline(
        name='filter_het_snps',
        ctx=snpsift_ctx(),
        args=('SnpSift', 'filter',
              "isHet(GEN[0]) & ((exists ID) & ( ID =~ 'rs' )) & (exists SNP)",
              mgd.TempInputFile('normal.dbsnp.vartype.vcf'), '>',
              mgd.TempOutputFile('het.snps.vcf')))

    workflow.transform(
        name='split_vcf',
        ctx=snpsift_ctx(),
        func=tasks.split_vcf,
        args=(mgd.TempInputFile('het.snps.vcf'),
              mgd.TempOutputFile('split.vcf', 'split'),
              mgd.TempSpace('split_tmp')),
        kwargs={'split_size': int(1e4)})

    workflow.transform(
        name='get_allele_counts',
        axes=('split',),
        func=tasks.get_snv_allele_counts_for_vcf_targets,
        args=(mgd.InputFile(tumour_bam_file),
              mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('split.tsv', 'split')))

    workflow.transform(
        name='merge_counts',
        func=tasks.merge_counts,
        args=(mgd.TempInputFile('split.tsv', 'split'),
              mgd.OutputFile(allele_counts_file)))

    return workflow
Beispiel #16
0
def create_rnaseq_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):
    """Build a MiXCR clonotype workflow for paired-end RNA-seq reads.

    Runs the MiXCR RNA-seq chain — align, two rounds of partial-assembly
    rescue, alignment extension, clone assembly and clone export — and
    gzips the exported clone table into ``out_file``.

    :param fastq_file_1: path of the first-mate FASTQ file.
    :param fastq_file_2: path of the second-mate FASTQ file.
    :param out_file: path where the gzipped clone table is written.
    :param threads: number of threads for the align and assemble steps.
    :return: a ``pypeliner`` Workflow object.
    """

    def mixcr_ctx(**overrides):
        # Base resource context for MiXCR steps; overrides per step.
        ctx = {'mem': 16, 'mem_retry_increment': 8, 'num_retry': 3}
        ctx.update(overrides)
        return ctx

    sandbox = soil.utils.workflow.get_sandbox(['mixcr'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='align',
        ctx=mixcr_ctx(mem=32, threads=threads),
        args=('mixcr', 'align', '-p', 'rna-seq', '-s', 'hsa',
              '-OallowPartialAlignments=true', '-f', '-t', threads,
              mgd.InputFile(fastq_file_1), mgd.InputFile(fastq_file_2),
              mgd.TempOutputFile('alignments.vdjca')))

    workflow.commandline(
        name='assemblePartial_1',
        ctx=mixcr_ctx(),
        args=('mixcr', 'assemblePartial', '-f',
              mgd.TempInputFile('alignments.vdjca'),
              mgd.TempOutputFile('alignments_rescued_1.vdjca')))

    workflow.commandline(
        name='assemblePartial_2',
        ctx=mixcr_ctx(),
        args=('mixcr', 'assemblePartial', '-f',
              mgd.TempInputFile('alignments_rescued_1.vdjca'),
              mgd.TempOutputFile('alignments_rescued_2.vdjca')))

    workflow.commandline(
        name='extendAlignments',
        ctx=mixcr_ctx(),
        args=('mixcr', 'extendAlignments', '-f',
              mgd.TempInputFile('alignments_rescued_2.vdjca'),
              mgd.TempOutputFile('alignments_rescued_2_extended.vdjca')))

    workflow.commandline(
        name='assemble',
        ctx=mixcr_ctx(threads=threads),
        args=('mixcr', 'assemble', '-f', '-t', threads,
              mgd.TempInputFile('alignments_rescued_2_extended.vdjca'),
              mgd.TempOutputFile('clones.clns')))

    workflow.commandline(
        name='export',
        ctx=mixcr_ctx(),
        args=('mixcr', 'exportClones', '-f',
              mgd.TempInputFile('clones.clns'),
              mgd.TempOutputFile('results.tsv')))

    workflow.commandline(
        name='compress',
        args=('gzip', '-c', mgd.TempInputFile('results.tsv'), '>',
              mgd.OutputFile(out_file)))

    return workflow
Beispiel #17
0
def create_titan_workflow(normal_bam_file,
                          tumour_bam_file,
                          dbsnp_vcf_file,
                          mappability_file,
                          ref_genome_fasta_file,
                          out_file,
                          exome_bed_file=None,
                          sample='Tumour',
                          threads=1):
    """Build a TITAN copy-number analysis workflow.

    Allele counts at heterozygous germline SNPs come from the allele-count
    subworkflow; read-depth, GC and mappability WIG tracks are built with
    the HMMcopy counter tools and merged into a coverage file; TITAN then
    runs once per initialisation parameter set, and the per-run archives
    plus a run-stats table are combined into the final output.

    :param normal_bam_file: path of the normal sample BAM file.
    :param tumour_bam_file: path of the tumour sample BAM file.
    :param dbsnp_vcf_file: path of a dbSNP VCF file.
    :param mappability_file: path of a mappability track file.
    :param ref_genome_fasta_file: path of the reference genome FASTA file.
    :param out_file: path where the final results file is written.
    :param exome_bed_file: optional BED file of capture targets; when set it
        is passed to the coverage step and TITAN runs with ``is_exome=True``.
    :param sample: sample name passed to the TITAN run.
    :param threads: number of threads for the TITAN runs.
    :return: a ``pypeliner`` Workflow object.
    """
    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])
    sandbox.channels.append('conda-forge')
    sandbox.packages.extend(['pandas', 'rpy2'])

    # Restrict the WIG tracks to autosomes found in the normal BAM header.
    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(
        name='get_allele_counts',
        func=create_allele_counts_workflow,
        args=(mgd.InputFile(normal_bam_file),
              mgd.InputFile(tumour_bam_file),
              mgd.InputFile(dbsnp_vcf_file),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempOutputFile('allele_counts.tsv')),
        kwargs={'chromosomes': 'autosomes'})

    # HMMcopy track jobs: (task name, counter command, input path, wig name).
    wig_jobs = (
        ('build_normal_wig', 'readCounter', normal_bam_file, 'normal.wig'),
        ('build_tumour_wig', 'readCounter', tumour_bam_file, 'tumour.wig'),
        ('build_gc_wig', 'gcCounter', ref_genome_fasta_file, 'gc.wig'),
        ('build_mappability_wig', 'mapCounter', mappability_file,
         'mappability.wig'),
    )

    for job_name, counter_cmd, in_path, wig_name in wig_jobs:
        workflow.commandline(
            name=job_name,
            args=(counter_cmd, '-c', ','.join(chromosomes),
                  mgd.InputFile(in_path), '>',
                  mgd.TempOutputFile(wig_name)))

    workflow.transform(
        name='build_coverage_file',
        func=tasks.build_coverage_file,
        args=(mgd.TempInputFile('normal.wig'),
              mgd.TempInputFile('tumour.wig'),
              mgd.TempInputFile('gc.wig'),
              mgd.TempInputFile('mappability.wig'),
              mgd.TempOutputFile('coverage.wig')),
        kwargs={'target_file': exome_bed_file})

    workflow.transform(
        name='run_titan',
        axes=('param_idx',),
        ctx={'mem': 8, 'mem_retry_increment': 4, 'num_retry': 3,
             'threads': threads},
        func=tasks.run_titan,
        args=(mgd.TempInputFile('coverage.wig'),
              mgd.TempInputFile('allele_counts.tsv'),
              mgd.TempInputObj('init_params', 'param_idx'),
              mgd.TempOutputFile('run.tar.gz', 'param_idx'),
              mgd.TempSpace('titan_tmp', 'param_idx')),
        kwargs={'is_exome': (exome_bed_file is not None),
                'sample': sample,
                'threads': threads})

    workflow.transform(
        name='build_run_stats_file',
        func=tasks.build_run_stats_file,
        args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
              mgd.TempInputObj('init_params', 'param_idx'),
              mgd.TempOutputFile('stats.tsv')))

    workflow.transform(
        name='build_output',
        func=tasks.build_final_results_file,
        args=(mgd.TempInputFile('coverage.wig'),
              mgd.TempInputFile('allele_counts.tsv'),
              mgd.TempInputFile('run.tar.gz', 'param_idx'),
              mgd.TempInputFile('stats.tsv'),
              mgd.OutputFile(out_file),
              mgd.TempSpace('build_results')))

    return workflow
Beispiel #18
0
def create_mappability_workflow(
        ref_genome_fasta_file,
        out_file,
        k=100,
        max_map_qual=None,
        split_size=int(1e7),
        threads=1):
    """Build a workflow producing a genome mappability track as BigWig.

    The reference is split per chromosome, k-mer reads are generated from
    each chromosome in chunks, aligned back to the reference with BWA-MEM,
    and per-chunk mappability values are computed, segmented and averaged
    per chromosome.  The means are written to BED and converted to BigWig
    with ``bedGraphToBigWig``.

    :param ref_genome_fasta_file: path of the reference genome FASTA file.
    :param out_file: path where the BigWig track is written.
    :param k: k-mer (read) length used to probe mappability.
    :param max_map_qual: optional mapping-quality cap forwarded to the
        mappability computation.
    :param split_size: number of k-mer reads per alignment chunk.
    :param threads: number of threads for the BWA alignment step.
    :return: a ``pypeliner`` Workflow object.
    """

    def small_ctx():
        # Fresh context for the light per-chunk transforms.
        return {'mem': 4, 'mem_retry_increment': 2, 'num_retry': 3}

    def big_ctx(**overrides):
        # Fresh context for the heavier aggregation/alignment steps.
        ctx = {'mem': 8, 'mem_retry_increment': 8, 'num_retry': 3}
        ctx.update(overrides)
        return ctx

    sandbox = soil.utils.workflow.get_sandbox(
        ['bwa', 'samtools', 'ucsc-bedgraphtobigwig'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='split_fasta_by_chrom',
                       func=tasks.split_fasta_by_chrom,
                       args=(mgd.InputFile(ref_genome_fasta_file),
                             mgd.TempOutputFile('chrom.fasta', 'chrom')))

    workflow.transform(name='create_kmer_reads',
                       axes=('chrom',),
                       ctx=small_ctx(),
                       func=tasks.create_kmer_reads,
                       args=(mgd.TempInputFile('chrom.fasta', 'chrom'),
                             mgd.TempOutputFile('reads.fa', 'chrom',
                                                'kmer_group')),
                       kwargs={'k': k, 'split_size': split_size})

    workflow.transform(name='align_kmers',
                       axes=('chrom', 'kmer_group'),
                       ctx=big_ctx(threads=threads),
                       func=tasks.bwa_mem_align,
                       args=(mgd.TempInputFile('reads.fa', 'chrom',
                                               'kmer_group'),
                             mgd.InputFile(ref_genome_fasta_file),
                             mgd.TempOutputFile('aligned.bam', 'chrom',
                                                'kmer_group')),
                       kwargs={'threads': threads})

    workflow.transform(name='compute_mappability',
                       axes=('chrom', 'kmer_group'),
                       ctx=small_ctx(),
                       func=tasks.compute_mappability,
                       args=(mgd.TempInputFile('aligned.bam', 'chrom',
                                               'kmer_group'),
                             mgd.TempOutputFile('mappability.tsv', 'chrom',
                                                'kmer_group')),
                       kwargs={'max_map_qual': max_map_qual})

    workflow.transform(name='compute_mappability_segs',
                       axes=('chrom', 'kmer_group'),
                       ctx=small_ctx(),
                       func=tasks.compute_mappability_segs,
                       args=(mgd.TempInputFile('mappability.tsv', 'chrom',
                                               'kmer_group'),
                             mgd.TempOutputFile('mappability_segs.tsv',
                                                'chrom', 'kmer_group')))

    workflow.transform(name='compute_chrom_mean_mappability',
                       axes=('chrom',),
                       ctx=big_ctx(),
                       func=tasks.compute_chrom_mean_mappability,
                       args=(mgd.TempInputFile('mappability_segs.tsv',
                                               'chrom', 'kmer_group'),
                             mgd.TempOutputFile('mean_mappability.tsv',
                                                'chrom')))

    workflow.transform(name='write_bed',
                       ctx=big_ctx(),
                       func=tasks.write_bed,
                       args=(mgd.TempInputFile('mean_mappability.tsv',
                                               'chrom'),
                             mgd.TempOutputFile('mean_mappability.bed')))

    workflow.transform(name='write_chrom_sizes',
                       func=tasks.write_chrom_sizes,
                       args=(mgd.InputFile(ref_genome_fasta_file),
                             mgd.TempOutputFile('chrom_sizes.txt')))

    workflow.commandline(name='write_big_wig',
                         args=('bedGraphToBigWig',
                               mgd.TempInputFile('mean_mappability.bed'),
                               mgd.TempInputFile('chrom_sizes.txt'),
                               mgd.OutputFile(out_file)))

    return workflow