Example 1
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):

    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(chrom_map_file),
                             mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index', mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
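
A hypothetical invocation of this workflow is sketched below. The URL template and output path are placeholders, not an endorsed data source; the one fixed point is that the template must contain a {chrom} placeholder, which the format(chrom=x) call above fills per chromosome.

# Hypothetical usage sketch; the host and file names are placeholders.
workflow = create_eagle_ref_data_workflow(
    'http://example.org/ALL.chr{chrom}.phase3.genotypes.vcf.gz',
    'eagle_ref.bcf',
    local_download=True,
)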
Example 2
def create_variant_counting_workflow(
    vcfs,
    tumour_cell_bams,
    results_h5,
    config,
):
    """ Count variant reads for multiple sets of variants across cells.
    """

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(name='merge_snvs',
                       func='biowrappers.components.io.vcf.tasks.merge_vcfs',
                       args=([mgd.InputFile(vcf) for vcf in vcfs],
                             mgd.TempOutputFile('all.snv.vcf')))

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi'])),
                       kwargs={
                           'docker_config':
                           helpers.get_container_ctx(config['containers'],
                                                     'vcftools')
                       })

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        },
    )

    return workflow
Example 3
def create_consensus_workflow(destruct_breakpoints, lumpy_vcf, output,
                              chromosomes):

    params = config.default_params('breakpoint_calling')
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func=
        'wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params["parse_lumpy"],
        ),
        kwargs={'chromosomes': chromosomes})

    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func=
        'wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params["parse_destruct"],
        ),
        kwargs={'chromosomes': chromosomes})

    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(mgd.TempInputFile('destruct.csv'),
              mgd.TempInputFile('lumpy.csv'), mgd.OutputFile(output),
              params['consensus']),
    )

    return workflow
Example 4
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('interval'),
                    value=[str(x) for x in range(1, 23)] + ['X'])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            )
        )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
            )
        )

    return workflow
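
The tasks.merge_vcfs function referenced above is not shown. The sketch below gives one plausible shape for it, assuming pypeliner resolves the 'interval'-axis input (declared with axes_origin=[]) to a dict mapping each chunk to a filename; the naive header handling and unused temp space are illustrative, not the real task.

import os

def merge_vcfs(vcf_files, out_file, tmp_dir):
    # vcf_files: dict of interval -> path, as pypeliner presents an axis
    # input to a non-axed job. Writes the header once, then all records.
    # Note: sorted() gives lexicographic chunk order, acceptable for a sketch.
    os.makedirs(tmp_dir, exist_ok=True)  # real task might sort/index here
    with open(out_file, 'w') as out:
        wrote_header = False
        for key in sorted(vcf_files):
            with open(vcf_files[key]) as in_file:
                for line in in_file:
                    if line.startswith('#') and wrote_header:
                        continue
                    out.write(line)
            wrote_header = True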
Example 5
def _create_download_decompress_workflow(url,
                                         local_path,
                                         local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(name='decompress',
                       func=tasks.decompress,
                       args=(
                           mgd.TempInputFile('download'),
                           mgd.OutputFile(local_path),
                       ))

    return workflow
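
The tasks.download and tasks.decompress helpers used here are not shown; below is one plausible, standard-library-only shape for them. The gzip assumption is mine, not the actual module's behaviour.

import gzip
import shutil
import urllib.request

def download(url, local_path):
    # Stream the remote file straight to the managed output path.
    with urllib.request.urlopen(url) as response, open(local_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

def decompress(in_path, out_path):
    # Assumes gzip input; the real task may detect the compression format.
    with gzip.open(in_path, 'rb') as in_file, open(out_path, 'wb') as out_file:
        shutil.copyfileobj(in_file, out_file)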
Example 6
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(name='read',
                       func=read_stuff,
                       ret=mgd.TempOutputObj('input_data'),
                       args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(name='write',
                       func=write_stuff,
                       args=(mgd.TempInputObj('output_data'),
                             mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(name='sub_workflow_2',
                         func=create_workflow_2,
                         args=(mgd.TempInputFile('output_file'),
                               mgd.OutputFile(output_filename)))

    return workflow
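
All of these factory functions only build a workflow object; nothing executes until the workflow is handed to a scheduler. A minimal driver is sketched below, assuming the standard pypeliner.app.Pypeline entry point and a local run; config keys beyond 'tmpdir' are optional.

import pypeliner
import pypeliner.app

# Minimal run sketch: 'tmpdir' holds pypeliner's job state and temp files.
# Task modules can also be registered via Pypeline(modules=(...), ...).
config = {'tmpdir': './pipeline_tmp', 'maxjobs': 4}
pyp = pypeliner.app.Pypeline(config=config)

workflow = create_workflow_1('input.txt', 'output.txt')
pyp.run(workflow)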
Example 7
def create_vcf2maf_workflow(vcf_file,
                            maf_file,
                            reference,
                            tumour_id=None,
                            normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='vcf2maf',
                       func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
                       args=(mgd.InputFile(vcf_file),
                             mgd.TempOutputFile('maf_file.maf'),
                             mgd.TempSpace('vcf2maf_temp'), reference),
                       kwargs={
                           'tumour_id': tumour_id,
                           'normal_id': normal_id
                       })

    workflow.transform(name='update_ids',
                       func='wgs.workflows.vcf2maf.tasks.update_ids',
                       args=(
                           mgd.TempInputFile('maf_file.maf'),
                           tumour_id,
                           normal_id,
                           mgd.OutputFile(maf_file),
                       ))

    return workflow
Example 8
def create_lumpy_workflow(config, normal_bam, tumour_cell_bams,
                          lumpy_breakpoints_csv, lumpy_breakpoints_evidence,
                          lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(normal_bam, config,
              mgd.TempOutputFile('normal.discordants.sorted.bam'),
              mgd.TempOutputFile('normal.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_normal_formatted.csv'),
              mgd.TempOutputFile('normal_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(mgd.InputFile('tumour_cells.bam',
                            'cell_id',
                            extensions=['.bai'],
                            fnames=tumour_cell_bams), config,
              mgd.TempOutputFile('tumour.discordants.sorted.bam'),
              mgd.TempOutputFile('tumour.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_tumour_formatted.csv'),
              mgd.TempOutputFile('tumour_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func="single_cell.workflows.lumpy.lumpy_calling_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow
Example 9
def create_fit_model_workflow(
    experiment_filename,
    results_filename,
    config,
    ref_data_dir,
    tumour_id=None,
):
    config = remixt.config.get_sample_config(config, tumour_id)
    
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow
Example 10
def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):

    sandbox = soil.utils.workflow.get_sandbox([
        'mixcr',
    ])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(name='align',
                         ctx={
                             'mem': 32,
                             'mem_retry_increment': 8,
                             'num_retry': 3,
                             'threads': threads
                         },
                         args=('mixcr', 'align', '-f', '-t', threads,
                               mgd.InputFile(fastq_file_1),
                               mgd.InputFile(fastq_file_2),
                               mgd.TempOutputFile('alignments.vdjca')))

    workflow.commandline(name='assemble',
                         ctx={
                             'mem': 16,
                             'mem_retry_increment': 8,
                             'num_retry': 3,
                             'threads': threads
                         },
                         args=('mixcr', 'assemble', '-f', '-t', 1,
                               mgd.TempInputFile('alignments.vdjca'),
                               mgd.TempOutputFile('clones.clns')))

    workflow.commandline(name='export',
                         ctx={
                             'mem': 16,
                             'mem_retry_increment': 8,
                             'num_retry': 3
                         },
                         args=('mixcr', 'exportClones', '-f',
                               mgd.TempInputFile('clones.clns'),
                               mgd.TempOutputFile('results.tsv')))

    workflow.commandline(name='compress',
                         args=('gzip', '-c', mgd.TempInputFile('results.tsv'),
                               '>', mgd.OutputFile(out_file)))

    return workflow
Example 11
def create_snpeff_annotation_workflow(db,
                                      data_dir,
                                      target_vcf_file,
                                      out_file,
                                      classic_mode=True,
                                      split_size=int(1e3),
                                      table_name='snpeff'):
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    workflow = Workflow()

    workflow.transform(name='split_vcf',
                       ctx=dict(mem=2, **ctx),
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(target_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split', ),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(db, data_dir, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('snpeff.vcf', 'split')),
        kwargs={
            'classic_mode': classic_mode,
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split', ),
        ctx=dict(mem=4, **ctx),
        func=
        'biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(mgd.TempInputFile('snpeff.vcf', 'split'),
              mgd.TempOutputFile('snpeff.csv.gz', 'split'), table_name))

    workflow.transform(
        name='concatenate_tables',
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(mgd.TempInputFile('snpeff.csv.gz',
                                'split'), mgd.OutputFile(out_file)))

    return workflow
Example 12
def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        )
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        )
    )

    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.TempInputFile("titan_prepped"),
            mgd.TempInputFile("remixt_prepped"),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace("circos")
        )
    )

    return workflow
Example 13
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):

    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='star_align',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': align_threads
                       },
                       func=tasks.align,
                       args=(
                           mgd.InputFile(fastq_file_1),
                           mgd.InputFile(fastq_file_2),
                           ref_genome_dir,
                           mgd.TempOutputFile('aligned.bam'),
                           mgd.TempSpace('align_tmp'),
                       ),
                       kwargs={
                           'add_xs_tag': add_xs_tag,
                           'read_group_info': read_group_info,
                           'threads': align_threads,
                       })

    workflow.transform(name='sort',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': sort_threads
                       },
                       func=soil.wrappers.sambamba.tasks.sort,
                       args=(
                           mgd.TempInputFile('aligned.bam'),
                           mgd.OutputFile(out_bam_file),
                           mgd.TempSpace('sort_tmp'),
                       ),
                       kwargs={'threads': sort_threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example 14
def create_destruct_wrapper_workflow(bam_filenames,
                                     output_filename,
                                     raw_data_dir,
                                     control_id=None,
                                     config=None,
                                     ref_data_dir=None):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=list(bam_filenames.keys()),
    )

    workflow.subworkflow(
        name='run_destruct',
        func=destruct.workflow.create_destruct_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    workflow.transform(
        name='post_process',
        func=destruct.benchmark.wrappers.destruct.tasks.destruct_postprocess,
        args=(
            mgd.TempInputFile('breakpoint_table'),
            mgd.TempInputFile('breakpoint_library_table'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        })

    return workflow
Example 15
def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers("fastqc"),
        })

    workflow.transform(
        name="fastqc_r2",
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        })

    workflow.transform(name='tar',
                       func='alignment.utils.helpers.make_tar_from_files',
                       args=(mgd.OutputFile(metrics_tar), [
                           mgd.TempInputFile('R1.html'),
                           mgd.TempInputFile('R1.pdf'),
                           mgd.TempInputFile('R2.html'),
                           mgd.TempInputFile('R2.pdf'),
                       ], mgd.TempSpace('wgs_metrics')))

    return workflow
Example 16
def create_vcf_mappability_annotation_workflow(
        mappability_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
):

    ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        ctx=ctx,
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.mappability.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions')
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        ctx=ctx,
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
Example 17
def create_trinuc_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        ref_genome,
        split_size=int(1e4),
):
    workflow = pypeliner.workflow.Workflow(ctx={
        'num_retry': 3,
        'mem_retry_increment': 2
    })

    workflow.transform(name='split_vcf',
                       func='single_cell.utils.vcfutils.split_vcf',
                       args=(mgd.InputFile(in_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split', ),
        func=
        'single_cell.workflows.trinuc_annotation.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz',
                               'split',
                               extensions=['.yaml']),
        ))

    workflow.transform(name='merge_tables',
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('tri_nucleotide_context.csv.gz',
                                               'split',
                                               extensions=['.yaml']),
                             mgd.OutputFile(out_csv_file,
                                            extensions=['.yaml'])))

    return workflow
Example 18
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):

    regions = utils.get_bam_regions(normal_bam_file,
                                    split_size,
                                    chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=regions)

    workflow.transform(
        name='run_somatic',
        axes=('regions', ),
        ctx={
            'mem': 6,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={
            'mem': 2,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example 19
def create_vcf_tric_nucleotide_annotation_workflow(
        ref_genome_fasta_file,
        vcf_file,
        out_file,
        split_size=int(1e4),
        table_name='tri_nucleotide_context'):

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='split_vcf',
                       ctx=dict(mem=2, **ctx),
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split', ),
        ctx=dict(mem=4, **ctx),
        func=
        'biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(ref_genome_fasta_file, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('tri_nucleotide_context.csv.gz',
                                 'split'), table_name))

    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(mgd.TempInputFile('tri_nucleotide_context.csv.gz',
                                'split'), mgd.OutputFile(out_file)))

    return workflow
Example 20
def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    workflow.transform(name='dofilestuff1',
                       func=do_file_stuff,
                       args=(mgd.InputFile(input_filename),
                             mgd.TempOutputFile('intermediate1'), 'a'))

    workflow.transform(name='dofilestuff2',
                       func=do_file_stuff,
                       args=(mgd.TempInputFile('intermediate1'),
                             mgd.OutputFile(output_filename), 'b'))

    return workflow
Example 21
def create_snv_allele_counts_workflow(
        bam_file,
        out_file,
        table_name,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        report_non_variant_positions=True,
        report_zero_count_positions=False,
        split_size=int(1e7)):

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('regions_obj', 'regions'),
        value=biowrappers.components.variant_calling.utils.get_bam_regions(
            bam_file, split_size, chromosomes=chromosomes),
    )

    workflow.transform(
        name='get_counts',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_region',
        args=(
            mgd.InputFile(bam_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            mgd.TempInputObj('regions_obj', 'regions'),
            table_name
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'report_non_variant_positions': report_non_variant_positions,
            'report_zero_count_positions': report_zero_count_positions
        }
    )

    workflow.transform(
        name='concatenate_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
Example 22
def run_VarScan(config, normal_bam, tumour_bam, snp_output_file, indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='generate_normal_mpileup',
        func=tasks.generate_mpileup,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.TempOutputFile("normal.pileup"),
            )
        )

    workflow.transform(
        name='generate_tumour_mpileup',
        func=tasks.generate_mpileup,
        args=(
            config,
            mgd.InputFile(tumour_bam),
            mgd.TempOutputFile("tumour.pileup"),
            )
        )

    workflow.transform(
        name='run_varscan_somatic',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '08:00'},
        func=tasks.run_varscan_somatic,
        args=(
            config,
            mgd.TempInputFile("normal.pileup"),
            mgd.TempInputFile("tumour.pileup"),
            mgd.OutputFile(snp_output_file),
            mgd.OutputFile(indel_output_file),
            )
        )

    return workflow
Example 23
def create_cohort_oncoplot(config, merged_germline, merged_somatic,
                           maftools_cna, maftools_maf, oncoplot, report,
                           cohort):
    """create oncoplot from cna table, and germlinne/somatic dataa.
    Args:
        config ([dict]): [config]
        merged_germline ([str]): [path to merged germline file]
        merged_somatic ([str]): [path to merged somatic file]
        maftools_cna ([str]): [path to merged cna data]
        maftools_maf ([sstr]): [path to output prepped maftools input maf]
        oncoplot ([str]): [path to output oncoplot]
    Returns:
        [type]: [description]
    """

    ctx = {
        'mem': config["memory"]['low'],
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1
    }

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    non_synonymous_labels = config["non_synonymous_labels"]

    workflow.transform(
        name='postprocess_maf',
        func='single_cell.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(mgd.InputFile(merged_germline), mgd.InputFile(merged_somatic),
              mgd.OutputFile(maftools_maf), non_synonymous_labels,
              mgd.TempOutputFile("vcNames")),
    )

    workflow.transform(
        name='make_oncoplot',
        func='single_cell.workflows.cohort_qc.tasks.make_oncoplot',
        args=(mgd.InputFile(maftools_maf), mgd.InputFile(maftools_cna),
              mgd.OutputFile(oncoplot), mgd.TempInputFile("vcNames")),
    )

    workflow.transform(
        name='create_report',
        func='single_cell.workflows.cohort_qc.tasks.create_report',
        args=(cohort, mgd.InputFile(oncoplot), report),
    )
    return workflow
Example 24
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf,
        config):

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
Example 25
def create_extract_seqdata_workflow(
    bam_filename,
    seqdata_filename,
    remixt_config,
    remixt_ref_data_dir,
    config,
    multiprocess=False,
):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_chromosome_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func=
        "single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={
            'multiprocess': multiprocess,
            'ncores': config['max_cores']
        })

    workflow.transform(
        name='merge_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="remixt.seqdataio.merge_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
Example 26
def create_mappability_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        mappability_file,
        split_size=1e4
):
    workflow = pypeliner.workflow.Workflow(
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.workflows.mappability_annotation.tasks.get_vcf_regions",
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        args=(
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            int(split_size),
        ),
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        func='single_cell.workflows.mappability_annotation.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml'])
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])
        )
    )

    return workflow
Example 27
def create_extract_seqdata_workflow(
     bam_filename,
     seqdata_filename,
     config,
     ref_data_dir,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)
    snp_positions_filename = remixt.config.get_filename(config, ref_data_dir, 'snp_positions')

    bam_max_fragment_length = remixt.config.get_param(config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(config, 'bam_check_proper_pair')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    workflow.transform(
        name='create_chromosome_seqdata',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.seqdataio.create_chromosome_seqdata,
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            mgd.InputFile(snp_positions_filename),
            mgd.InputInstance('chromosome'),
            bam_max_fragment_length,
            bam_max_soft_clipped,
            bam_check_proper_pair,
        ),
    )

    workflow.transform(
        name='merge_seqdata',
        ctx={'mem': 16},
        func=remixt.seqdataio.merge_seqdata,
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
Example 28
def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='dofilestuff1',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.InputFile(input_filename),
            mgd.TempOutputFile('intermediate1'),
            'a'))

    workflow.transform(
        name='dofilestuff2',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.TempInputFile('intermediate1'),
            mgd.OutputFile(output_filename),
            'b'))

    return workflow
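
For reference, do_file_stuff in pypeliner's test suite is a simple read-transform-write task; the sketch below is a plausible shape for it (illustrative, not the exact test code).

def do_file_stuff(in_filename, out_filename, toadd):
    # Illustrative: copy each line, tagging it with the extra argument.
    with open(in_filename) as in_file, open(out_filename, 'w') as out_file:
        for line in in_file:
            out_file.write(line.rstrip('\n') + toadd + '\n')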
Example 29
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_fasta_file,
                          out_bam_file,
                          threads=1):
    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.subworkflow(
        name='align',
        func=soil.wrappers.bwa.workflows.create_align_workflow,
        args=(mgd.InputFile(fastq_file_1), mgd.InputFile(fastq_file_2),
              mgd.InputFile(ref_genome_fasta_file),
              mgd.TempOutputFile('aligned.bam')),
        kwargs={
            'align_threads': threads,
            'sort_threads': threads
        })

    workflow.transform(name='mark_dups',
                       func=soil.wrappers.sambamba.tasks.markdups,
                       args=(mgd.TempInputFile('aligned.bam'),
                             mgd.OutputFile(out_bam_file),
                             mgd.TempSpace('mark_dups_tmp')),
                       kwargs={'threads': threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example 30
def download_external_files(config):
    download_keys = [x for x in config if 'url' in config[x]]
    urls = {x: config[x]['url'] for x in download_keys}
    downloaded_files = {x: config[x]['local_path'] for x in download_keys}

    workflow = Workflow()
    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )
    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files', ),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )
    workflow.transform(
        name='unzip',
        axes=('files', ),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )
    return workflow
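
This function expects every downloadable entry in config to be a mapping with both a 'url' and a 'local_path' key; entries whose mapping lacks 'url' are skipped by the filter on the first line. A hypothetical config illustrating that contract:

# Hypothetical config: the keys and paths are placeholders.
config = {
    'ref_genome': {
        'url': 'http://example.org/ref_genome.fa.gz',
        'local_path': 'ref_data/ref_genome.fa',
    },
    'snp_positions': {
        'url': 'http://example.org/snps.tsv.gz',
        'local_path': 'ref_data/snps.tsv',
    },
    'cosmic': {'local_path': 'ref_data/cosmic.vcf'},  # no 'url': not downloaded
}

workflow = download_external_files(config)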