Example 1
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(name='read',
                       func=read_stuff,
                       ret=mgd.TempOutputObj('input_data'),
                       args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(name='write',
                       func=write_stuff,
                       args=(mgd.TempInputObj('output_data'),
                             mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(name='sub_workflow_2',
                         func=create_workflow_2,
                         args=(mgd.TempInputFile('output_file'),
                               mgd.OutputFile(output_filename)))

    return workflow
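
The task functions referenced above (read_stuff, do_stuff, write_stuff) are not part of the listing. A minimal sketch of what they could look like, assuming the object returned by read_stuff exposes a some_string attribute, as the .prop('some_string') accessor requires:

class Stuff(object):
    # Hypothetical container; pypeliner's .prop() needs the managed object
    # to expose the named attribute.
    def __init__(self, some_string):
        self.some_string = some_string


def read_stuff(filename):
    # 'filename' arrives as the resolved path of mgd.InputFile; the return
    # value is stored as the managed object 'input_data'.
    with open(filename) as f:
        return Stuff(f.read().strip())


def do_stuff(some_string):
    # Receives the extracted 'some_string' property; the return value is
    # stored as the managed object 'output_data'.
    return some_string.upper()


def write_stuff(data, filename):
    # Receives the resolved 'output_data' value and the resolved path of
    # mgd.TempOutputFile('output_file').
    with open(filename, 'w') as f:
        f.write(data)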
Example 2
def _create_download_cosmic_workflow(ref_data_version,
                                     out_file,
                                     user,
                                     password,
                                     host='sftp-cancer.sanger.ac.uk',
                                     local_download=False):

    host_base_path = '/files/{}/cosmic/v83/VCF'.format(
        ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join(
        [host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'),
                    value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'),
                    value=non_coding_host_path)

    workflow.subworkflow(name='download_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_non_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('non_coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('non_coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.transform(name='merge_files',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=([
                           mgd.TempInputFile('coding.vcf.gz'),
                           mgd.TempInputFile('non_coding.vcf.gz')
                       ], mgd.OutputFile(out_file)),
                       kwargs={
                           'allow_overlap': True,
                           'index_file': mgd.OutputFile(out_file + '.tbi')
                       })

    return workflow
Example 3
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 config,
                                 tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ctx={
                           'docker_image': config['docker']['destruct'],
                           'disk': 200
                       },
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile(normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(name='process_individual_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 config,
                                 mgd.InputFile('bam',
                                               'normal_cell_id',
                                               fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
Example 4
def destruct_preprocess_workflow(normal_bam_files,
                                 normal_stats,
                                 normal_reads_1,
                                 normal_reads_2,
                                 normal_sample_1,
                                 normal_sample_2,
                                 ref_data_directory,
                                 destruct_config,
                                 tag=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    if isinstance(normal_bam_files, str):
        workflow.transform(
            name='bamdisc_normal',
            func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
            ctx={
                'io': 1,
                'mem': 8,
                'disk': 200
            },
            args=(
                mgd.TempInputObj("destruct_config"),
                mgd.InputFile(normal_bam_files),
                mgd.OutputFile(normal_stats),
                mgd.OutputFile(normal_reads_1),
                mgd.OutputFile(normal_reads_2),
                mgd.OutputFile(normal_sample_1),
                mgd.OutputFile(normal_sample_2),
                mgd.TempSpace('bamdisc_normal_tempspace'),
            ))
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bam_files.keys()),
        )

        workflow.subworkflow(name='process_normal_cells',
                             func=process_cells_destruct,
                             args=(
                                 mgd.TempInputObj("destruct_config"),
                                 mgd.InputFile('bam',
                                               'normal_cell_id',
                                               fnames=normal_bam_files),
                                 mgd.OutputFile(normal_reads_1),
                                 mgd.OutputFile(normal_reads_2),
                                 mgd.OutputFile(normal_sample_1),
                                 mgd.OutputFile(normal_sample_2),
                                 mgd.OutputFile(normal_stats),
                             ),
                             kwargs={'tag': tag})

    return workflow
Example 5
def _create_download_decompress_concat_workflow(urls,
                                                out_file,
                                                local_download=False):
    workflow = pypeliner.workflow.Workflow()

    local_files = []

    for i, url in enumerate(urls):
        local_files.append(mgd.TempFile('file_{}'.format(i)))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        workflow.subworkflow(name='download_file_{}'.format(i),
                             func=_create_download_decompress_workflow,
                             args=(
                                 mgd.TempInputObj('url_{}'.format(i)),
                                 local_files[i].as_output(),
                             ),
                             kwargs={'local_download': local_download})

    concat_args = (
        ['cat']
        + [x.as_input() for x in local_files]
        + ['>', mgd.OutputFile(out_file)]
    )

    workflow.commandline(name='concat', args=concat_args)

    return workflow
Example 6
def _create_download_decompress_workflow(url,
                                         local_path,
                                         local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(name='decompress',
                       func=tasks.decompress,
                       args=(
                           mgd.TempInputFile('download'),
                           mgd.OutputFile(local_path),
                       ))

    return workflow
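
The tasks module used here (tasks.download, tasks.decompress) is not shown. A hypothetical sketch matching the call signatures above, assuming the downloaded files are gzip-compressed:

import gzip
import shutil
from urllib.request import urlretrieve


def download(url, local_path):
    # 'url' is the resolved value of the managed object set by setobj;
    # 'local_path' is the resolved temp file path.
    urlretrieve(url, local_path)


def decompress(in_path, out_path):
    # Assumes gzip input; streams the decompressed bytes to out_path.
    with gzip.open(in_path, 'rb') as in_f, open(out_path, 'wb') as out_f:
        shutil.copyfileobj(in_f, out_f)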
Example 7
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools',
            'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        )
    )

    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        )
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
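
The low_mem_ctx and med_mem_ctx dictionaries used here (and again in Example 20) are defined in the surrounding soil module and not shown. Plausible stand-ins, borrowing the ctx keys visible in Examples 9 and 10, with 'mem' in GB:

# Hypothetical resource contexts; the real values live in the soil wrappers.
low_mem_ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
med_mem_ctx = {'mem': 6, 'num_retry': 3, 'mem_retry_increment': 2}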
Example 8
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):

    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(chrom_map_file),
                             mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index', mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
Example 9
def create_vcf_mappability_annotation_workflow(
        mappability_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        split_size=int(1e7),
):

    ctx = {'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        ctx=ctx,
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.mappability.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions')
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        ctx=ctx,
        func='biowrappers.components.io.csv.tasks.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
Example 10
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes=None,
                            split_size=int(1e7)):

    regions = utils.get_bam_regions(normal_bam_file,
                                    split_size,
                                    chromosomes=chromosomes)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=pypeliner.managed.TempOutputObj('config', 'regions'),
                    value=regions)

    workflow.transform(
        name='run_somatic',
        axes=('regions', ),
        ctx={
            'mem': 6,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=tasks.run_somatic,
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(tumour_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
            mgd.TempInputObj('config', 'regions'),
            mgd.TempSpace('varscan_tmp', 'regions'),
        ),
    )

    workflow.transform(
        name='merge',
        axes=(),
        ctx={
            'mem': 2,
            'mem_retry_increment': 2,
            'num_retry': 3
        },
        func=vcf_tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example 11
def create_snv_allele_counts_workflow(
        bam_file,
        out_file,
        table_name,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        report_non_variant_positions=True,
        report_zero_count_positions=False,
        split_size=int(1e7)):

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.TempOutputObj('regions_obj', 'regions'),
        value=biowrappers.components.variant_calling.utils.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.transform(
        name='get_counts',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_region',
        args=(
            mgd.InputFile(bam_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            mgd.TempInputObj('regions_obj', 'regions'),
            table_name
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'report_non_variant_positions': report_non_variant_positions,
            'report_zero_count_positions': report_zero_count_positions
        }
    )

    workflow.transform(
        name='concatenate_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            mgd.OutputFile(out_file)
        )
    )

    return workflow
Example 12
def create_fit_model_workflow(
    experiment_filename,
    results_filename,
    config,
    ref_data_dir,
    tumour_id=None,
):
    config = remixt.config.get_sample_config(config, tumour_id)
    
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow
Example 13
def _create_download_vep_plugins_workflow(urls, out_dir, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    for i, url in enumerate(urls):
        out_file = os.path.join(out_dir, os.path.basename(url))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        workflow.transform(name='download_file_{}'.format(i),
                           ctx={'local': local_download},
                           func=tasks.download,
                           args=(
                               mgd.TempInputObj('url_{}'.format(i)),
                               mgd.OutputFile(out_file),
                           ))

    return workflow
Example 14
def create_mappability_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        mappability_file,
        split_size=1e4
):
    workflow = pypeliner.workflow.Workflow(
        ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2}
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.workflows.mappability_annotation.tasks.get_vcf_regions",
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        args=(
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            int(split_size),
        ),
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('regions',),
        func='single_cell.workflows.mappability_annotation.tasks.get_mappability',
        args=(
            mappability_file,
            mgd.InputFile(in_vcf_file, extensions=['.tbi']),
            mgd.TempOutputFile('mappability.csv.gz', 'regions', extensions=['.yaml'])
        ),
        kwargs={
            'region': mgd.TempInputObj('regions_obj', 'regions'),
        },
    )

    workflow.transform(
        name='merge_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('mappability.csv.gz', 'regions', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])
        )
    )

    return workflow
Example 15
def ctDNA_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config'])
    for arg, value in args.items():
        config[arg] = value

    helpers.makedirs(config["bam_directory"])

    helpers.makedirs(config["results_dir"])

    inputs = helpers.load_yaml(args['input_yaml'])
    patients = list(inputs.keys())

    workflow.setobj(obj=mgd.OutputChunks('patient_id', ), value=patients)

    workflow.transform(name='get_input_by_patient',
                       func=helpers.get_input_by_patient,
                       ret=mgd.TempOutputObj('patient_input', 'patient_id'),
                       axes=('patient_id', ),
                       args=(
                           inputs,
                           mgd.InputInstance('patient_id'),
                       ))

    workflow.subworkflow(name='patient_workflow',
                         func=patient_workflow,
                         axes=('patient_id', ),
                         args=(
                             config,
                             mgd.InputInstance('patient_id'),
                             mgd.TempInputObj('patient_input', 'patient_id'),
                             mgd.OutputFile(
                                 os.path.join(config['results_dir'],
                                              '{patient_id}.log'),
                                 'patient_id'),
                         ))

    pyp.run(workflow)
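
ctDNA_workflow consumes a fully populated args dict. A typical entry point that produces one, using pypeliner's standard argparse integration (pypeliner.app.add_arguments attaches the common pipeline options; the --config and --input_yaml flags are assumptions specific to this pipeline):

import argparse

import pypeliner.app


def main():
    parser = argparse.ArgumentParser(description='ctDNA analysis pipeline')
    # Adds pypeliner's shared options (tmpdir, submit backend, maxjobs, ...).
    pypeliner.app.add_arguments(parser)
    parser.add_argument('--config', required=True)
    parser.add_argument('--input_yaml', required=True)
    args = vars(parser.parse_args())
    ctDNA_workflow(args)


if __name__ == '__main__':
    main()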
Example 16
def download_external_files(config):
    download_keys = [x for x in config if 'url' in config[x]]
    urls = dict(zip(
        download_keys,
        [config[x]['url'] for x in download_keys],
    ))
    downloaded_files = dict(
        zip(
            urls.keys(),
            [config[x]['local_path'] for x in urls.keys()],
        ))

    workflow = Workflow()
    workflow.setobj(
        obj=mgd.TempOutputObj('url', 'files'),
        value=urls,
    )
    workflow.subworkflow(
        name='download',
        func=create_download_workflow,
        axes=('files', ),
        args=(
            mgd.TempInputObj('url', 'files'),
            mgd.TempOutputFile('download.file', 'files'),
        ),
    )
    workflow.transform(
        name='unzip',
        axes=('files', ),
        func=tasks.unzip,
        args=(
            mgd.TempInputFile('download.file', 'files'),
            mgd.OutputFile('unzipped', 'files', fnames=downloaded_files),
        ),
    )
    return workflow
Example 17
def create_destruct_fastq_workflow(
    fastq1_filenames,
    fastq2_filenames,
    sample1_filenames,
    sample2_filenames,
    stats_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    workflow.transform(
        name='readstats',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.read_stats',
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics

    workflow.transform(
        name='prepseed_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            36,
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='bwtrealign_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('sample.seed', 'bylibrary'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_aligntrue',
            '-a',
            '-',
            '-1',
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            '-2',
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '-s',
            mgd.TempOutputFile('samples.align.true', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='scorestats',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.score_stats.create_score_stats',
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align

    workflow.transform(
        name='splitfastq1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='splitfastq2',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread',
                               axes_origin=[]),
        ),
    )

    workflow.transform(
        name='prepseed',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    workflow.commandline(
        name='bwtrealign',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_realign2',
            '-l',
            mgd.TempInputObj('library_id', 'bylibrary'),
            '-a',
            '-',
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--tchimer',
            config['chimeric_threshold'],
            '--talign',
            config['alignment_threshold'],
            '--pchimer',
            config['chimeric_prior'],
            '--tvalid',
            config['readvalid_threshold'],
            '-z',
            mgd.TempInputFile('score.stats', 'bylibrary'),
            '--span',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
            '--split',
            mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='merge_spanning_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='filterreads',
        axes=('bylibrary', ),
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n',
            '2',
            '-a',
            mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r',
            config['satellite_regions'],
            '>',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_spanning_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads

    workflow.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    workflow.transform(
        name='write_stats_table',
        ctx=lowmem,
        func='destruct.tasks.write_stats_table',
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    workflow.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a',
            mgd.TempInputFile('spanning.alignments'),
            '-s',
            mgd.TempInputFile('libstats.tsv'),
            '-c',
            mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin',
            config['cluster_readcount_threshold'],
            '--fragmax',
            config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads

    workflow.transform(
        name='predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        func='destruct.predict_breaks.predict_breaks',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    workflow.transform(
        name='merge_clusters',
        ctx=lowmem,
        func='destruct.tasks.merge_clusters',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints

    workflow.commandline(
        name='realigntobreaks',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r',
            config['genome_fasta'],
            '-b',
            mgd.TempInputFile('breakpoints_2'),
            '-c',
            mgd.TempInputFile('clusters'),
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span',
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments',
            mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments

    workflow.transform(
        name='calculate_realignment_likelihoods',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    workflow.transform(
        name='merge_likelihoods_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    workflow.transform(
        name='merge_likelihoods_2',
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads

    workflow.transform(
        name='calc_weights',
        ctx=medmem,
        func='destruct.predict_breaks.calculate_cluster_weights',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    workflow.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c',
            mgd.TempInputFile('clusters'),
            '-w',
            mgd.TempInputFile('cluster_weights'),
            '-a',
            mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover

    workflow.transform(
        name='select_clusters',
        ctx=medmem,
        func='destruct.predict_breaks.select_clusters',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood

    workflow.transform(
        name='select_predictions',
        ctx=himem,
        func='destruct.predict_breaks.select_predictions',
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads

    workflow.transform(
        name='tabreads',
        ctx=medmem,
        func='destruct.tasks.tabulate_reads',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    workflow.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort',
            '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>',
            mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results

    workflow.transform(
        name='tabulate',
        ctx=himem,
        func='destruct.tasks.tabulate_results',
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return workflow
Example 18
def create_remixt_workflow(
    tumour_path,
    normal_path,
    breakpoints,
    sample_id,
    remixt_results_filename,
    remixt_brk_cn_csv,
    remixt_cn_csv,
    remixt_minor_modes_csv,
    remixt_mix_csv,
    remixt_read_depth_csv,
    remixt_stats_csv,
    remixt_refdata,
    reference,
    single_node=False,
):
    ctx = {'docker_image': config.containers('wgs')}

    params = config.default_params('copynumber_calling')['remixt']

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    remixt_config = {
        'genome_fasta': reference,
        'genome_fai': reference + '.fai',
    }

    if breakpoints is None:
        workflow.setobj(
            obj=mgd.TempOutputObj('emptybreakpoints'),
            value=[],
        )

        workflow.transform(
            name='write_empty_breakpoints',
            func='wgs.workflows.remixt.tasks.write_empty_breakpoints',
            args=(
                mgd.TempInputObj('emptybreakpoints'),
                mgd.TempOutputFile('filtered_breakpoints.csv'),
            ),
        )

    else:
        workflow.transform(
            name='filter_breakpoints',
            func='wgs.workflows.remixt.tasks.filter_destruct_breakpoints',
            ctx=helpers.get_default_ctx(memory=4, walltime='4:00'),
            args=(mgd.InputFile(breakpoints),
                  mgd.TempOutputFile('filtered_breakpoints.csv'),
                  params['min_num_reads']))

    if single_node:
        workflow.transform(
            name='remixt',
            func='wgs.workflows.remixt.tasks.run_remixt_local',
            ctx=helpers.get_default_ctx(memory=15, walltime='120:00', ncpus=8),
            args=(
                mgd.TempSpace("remixt_temp"),
                mgd.TempInputFile('filtered_breakpoints.csv'),
                mgd.InputFile(tumour_path, extensions=['.bai']),
                mgd.InputFile(normal_path, extensions=['.bai']),
                sample_id,
                mgd.OutputFile(remixt_results_filename),
                mgd.TempSpace('remixt_raw_dir'),
                remixt_config,
                remixt_refdata,
            ),
        )
    else:
        workflow.subworkflow(name='remixt',
                             func="remixt.workflow.create_remixt_bam_workflow",
                             ctx={
                                 'docker_image': config.containers('remixt'),
                                 'walltime': '48:00'
                             },
                             args=(
                                 mgd.TempInputFile('filtered_breakpoints.csv'),
                                 {
                                     sample_id:
                                     mgd.InputFile(tumour_path,
                                                   extensions=['.bai']),
                                     sample_id + 'N':
                                     mgd.InputFile(normal_path,
                                                   extensions=['.bai'])
                                 },
                                 {
                                     sample_id:
                                     mgd.OutputFile(remixt_results_filename)
                                 },
                                 mgd.TempSpace('remixt_raw_dir'),
                                 remixt_config,
                                 remixt_refdata,
                             ),
                             kwargs={
                                 'normal_id': sample_id + 'N',
                             })

    workflow.transform(
        name='parse_remixt',
        func='wgs.workflows.remixt.tasks.parse_remixt_file',
        args=(mgd.InputFile(remixt_results_filename), [
            mgd.OutputFile(remixt_brk_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_cn_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_minor_modes_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_mix_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_read_depth_csv, extensions=['.yaml']),
            mgd.OutputFile(remixt_stats_csv, extensions=['.yaml']),
        ], ['/brk_cn', '/cn', '/minor_modes', '/mix', '/read_depth',
            '/stats'], mgd.TempSpace('tempdir_parse')))

    return workflow
Example 19
def analyze_tumour_normal(config, input_args, results_dir, normal_bam,
                          tumour_sample, tumour_bam, snv_tsv, indel_tsv,
                          snv_vcf, indel_vcf):
    workflow = pypeliner.workflow.Workflow()

    matched_results_dir = os.path.join(results_dir, tumour_sample)

    helpers.makedirs(matched_results_dir)

    workflow.subworkflow(name='run_deepSNV',
                         func=deepSNV.run_deepSNV,
                         args=(config, mgd.InputFile(normal_bam),
                               mgd.InputFile(tumour_bam),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'deepSNV_out.tsv'))))

    workflow.subworkflow(name='run_VarScan',
                         func=VarScan.run_VarScan,
                         args=(
                             config,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'VarScan_out.vcf')),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'VarScan_indel_out.vcf')),
                         ))

    workflow.subworkflow(name='run_MutationSeq',
                         func=MutationSeq.run_MutationSeq,
                         args=(
                             config,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'museq_out.vcf')),
                         ))

    workflow.subworkflow(name='run_Strelka',
                         func=Strelka.run_Strelka,
                         args=(config, mgd.InputFile(normal_bam),
                               mgd.InputFile(tumour_bam),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'strelka_out.vcf')),
                               mgd.OutputFile(
                                   os.path.join(matched_results_dir,
                                                'strelka_indel_out.vcf'))))

    workflow.subworkflow(name='run_LoLoPicker',
                         func=LoLoPicker.run_LoLoPicker,
                         args=(
                             config,
                             input_args,
                             mgd.InputFile(normal_bam),
                             mgd.InputFile(tumour_bam),
                             mgd.OutputFile(
                                 os.path.join(matched_results_dir,
                                              'LoLoPicker_out.tsv')),
                         ))

    workflow.transform(
        name='create_result_dict',
        func=union.create_result_dict,
        ret=mgd.TempOutputObj('result_dict'),
        args=(
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'deepSNV_out.tsv')),
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'VarScan_out.vcf')),
            mgd.InputFile(os.path.join(matched_results_dir, 'museq_out.vcf')),
            mgd.InputFile(os.path.join(matched_results_dir,
                                       'strelka_out.vcf')),
            mgd.InputFile(
                os.path.join(matched_results_dir, 'LoLoPicker_out.tsv')),
        ))

    workflow.transform(name='union_results',
                       func=union.union_results,
                       args=(
                           config,
                           mgd.InputFile(normal_bam),
                           mgd.InputFile(tumour_bam),
                           mgd.TempInputObj('result_dict'),
                           mgd.TempSpace('union_space'),
                           mgd.OutputFile(snv_tsv),
                           mgd.OutputFile(snv_vcf),
                       ))

    workflow.transform(name='union_indels',
                       func=union.union_indels,
                       args=(
                           config,
                           mgd.InputFile(
                               os.path.join(matched_results_dir,
                                            'strelka_indel_out.vcf')),
                           mgd.InputFile(
                               os.path.join(matched_results_dir,
                                            'VarScan_indel_out.vcf')),
                           mgd.OutputFile(indel_tsv),
                           mgd.OutputFile(indel_vcf),
                       ))

    return workflow
Example 20
def create_somatic_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            out_file,
                            chromosomes='default',
                            is_exome=False,
                            split_size=int(1e7)):

    sandbox = soil.utils.workflow.get_sandbox(
        ['bcftools', 'samtools', 'strelka'])

    workflow = pypeliner.workflow.Workflow(default_ctx=med_mem_ctx,
                                           default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('config', 'regions'),
                    value=soil.utils.genome.get_bam_regions(
                        normal_bam_file, split_size, chromosomes=chromosomes))

    workflow.setobj(obj=mgd.TempOutputObj('chrom_names', 'chrom_axis'),
                    value=get_chromosomes(normal_bam_file,
                                          chromosomes=chromosomes))

    workflow.transform(
        name='count_fasta_bases',
        func=soil.wrappers.strelka.tasks.count_fasta_bases,
        args=(
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('ref_base_counts.tsv'),
        ),
    )

    workflow.transform(
        name='get_genome_size',
        ctx={'local': True},
        func=get_known_genome_size,
        ret=mgd.TempOutputObj('genome_size'),
        args=(
            mgd.InputFile(tumour_bam_file),
            mgd.TempInputFile('ref_base_counts.tsv'),
            chromosomes,
        ),
        sandbox=None,
    )

    workflow.transform(
        name='get_chromosome_depths',
        axes=('chrom_axis', ),
        func=soil.wrappers.strelka.tasks.get_chromosome_depth,
        args=(
            mgd.TempInputObj('chrom_names', 'chrom_axis'),
            mgd.InputFile(normal_bam_file),
            mgd.InputFile(ref_genome_fasta_file),
            mgd.TempOutputFile('chrom_depth.txt', 'chrom_axis'),
        ),
    )

    workflow.transform(
        name='merge_chromosome_depths',
        func=soil.wrappers.strelka.tasks.merge_chromosome_depth,
        args=(
            mgd.TempInputFile('chrom_depth.txt', 'chrom_axis'),
            mgd.TempOutputFile('chrom_depth_merged.txt'),
        ),
        sandbox=None,
    )

    workflow.transform(name='call_genome_segment',
                       axes=('regions', ),
                       func=soil.wrappers.strelka.tasks.call_genome_segment,
                       args=(
                           mgd.TempInputFile('chrom_depth_merged.txt'),
                           mgd.InputFile(normal_bam_file),
                           mgd.InputFile(tumour_bam_file),
                           mgd.InputFile(ref_genome_fasta_file),
                           mgd.TempOutputFile('indels.vcf', 'regions'),
                           mgd.TempOutputFile('snvs.vcf', 'regions'),
                           mgd.TempSpace('call_genome_segment_tmp', 'regions'),
                           mgd.TempInputObj('config', 'regions'),
                           mgd.TempInputObj('genome_size'),
                       ),
                       kwargs={
                           'is_exome': is_exome,
                       })

    workflow.transform(
        name='merge_indels',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('indels.vcf', 'regions'),
            mgd.TempOutputFile('indels.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_snvs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('snvs.vcf', 'regions'),
            mgd.TempOutputFile('snvs.vcf.gz'),
        ),
    )

    workflow.transform(
        name='merge_all',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            [
                mgd.TempInputFile('indels.vcf.gz'),
                mgd.TempInputFile('snvs.vcf.gz')
            ],
            mgd.TempOutputFile('merged.vcf.gz'),
        ),
        kwargs={
            'allow_overlap': True,
        },
    )

    workflow.commandline(name='filter_vcf',
                         ctx=low_mem_ctx,
                         args=(
                             'bcftools',
                             'view',
                             '-O',
                             'z',
                             '-f',
                             '.,PASS',
                             '-o',
                             mgd.OutputFile(out_file),
                             mgd.TempInputFile('merged.vcf.gz'),
                         ))

    workflow.transform(name='index_vcf',
                       ctx=low_mem_ctx,
                       func=soil.wrappers.samtools.tasks.index_vcf,
                       args=(
                           mgd.InputFile(out_file),
                           mgd.OutputFile(out_file + '.tbi'),
                       ))

    return workflow
Example 21
def process_cells_destruct(destruct_config,
                           cell_bam_files,
                           reads_1,
                           reads_2,
                           sample_1,
                           sample_2,
                           stats,
                           tag=False):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
    }

    cells = list(cell_bam_files.keys())

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cells,
    )

    workflow.transform(
        name='bamdisc_and_numreads_cell',
        func="single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        axes=('cell_id', ),
        ctx={
            'io': 1,
            'mem': 8
        },
        ret=mgd.TempOutputObj("numreads", "cell_id"),
        args=(
            destruct_config,
            mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files),
            mgd.TempOutputFile('cell_stats', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_read_counts',
        ret=mgd.TempOutputObj("readcounts"),
        func="single_cell.workflows.destruct_singlecell.tasks.merge_read_counts",
        ctx={
            'io': 1,
            'mem': 8
        },
        args=(mgd.TempInputObj('numreads', 'cell_id'), ))

    workflow.transform(
        name='reindex_reads',
        func="single_cell.workflows.destruct_singlecell.tasks.re_index_reads_both",
        ctx={
            'io': 1,
            'mem': 8
        },
        axes=('cell_id', ),
        args=(
            mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            cells,
            mgd.TempInputObj('readcounts'),
        ),
        kwargs={'tag': tag})

    workflow.transform(
        name='merge_reads_r1',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func="single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_1),
        ),
    )

    workflow.transform(
        name='merge_reads_r2',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func="single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_2),
        ),
    )

    workflow.transform(
        name='merge_sample',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs",
        args=(
            mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(sample_1),
            mgd.OutputFile(sample_2),
            destruct_config['num_read_samples'],
        ),
    )

    workflow.transform(
        name='merge_stats',
        ctx={
            'io': 1,
            'mem': 8
        },
        func="single_cell.workflows.destruct_singlecell.tasks.merge_stats",
        args=(
            mgd.TempInputFile('cell_stats', 'cell_id'),
            mgd.OutputFile(stats),
        ),
    )

    return workflow
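A hypothetical invocation of the workflow above. mgd.OutputChunks('cell_id') seeds one chunk per key of cell_bam_files, so every job declared with axes=('cell_id',) fans out once per cell; paths and config values below are illustrative only:

cell_bams = {
    'SA001-A01': '/data/cells/SA001-A01.bam',
    'SA001-A02': '/data/cells/SA001-A02.bam',
}
workflow = process_cells_destruct(
    {'num_read_samples': 1000000},  # hypothetical destruct config
    cell_bams,
    'reads_1.fastq.gz',
    'reads_2.fastq.gz',
    'sample_1.fastq.gz',
    'sample_2.fastq.gz',
    'cell_stats.tsv',
    tag=True,  # forwarded to the reindexing step
)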
Example no. 22
def create_multiple_lane_align_workflow(fastq_files_1,
                                        fastq_files_2,
                                        ref_genome_dir,
                                        out_bam_file,
                                        add_xs_tag=False,
                                        align_threads=1,
                                        merge_threads=1,
                                        read_group_info=None,
                                        sort_threads=1):

    if read_group_info is None:
        read_group_info = {}

        for key in fastq_files_1:
            read_group_info[key] = None

    sandbox = soil.utils.workflow.get_sandbox(['sambamba', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('read_group_info', 'lane'),
                    value=read_group_info)

    workflow.subworkflow(name='align',
                         axes=('lane', ),
                         func=create_align_workflow,
                         args=(
                             mgd.InputFile('R1.fq.gz',
                                           'lane',
                                           fnames=fastq_files_1),
                             mgd.InputFile('R2.fq.gz',
                                           'lane',
                                           fnames=fastq_files_2),
                             ref_genome_dir,
                             mgd.TempOutputFile('lane.bam', 'lane'),
                         ),
                         kwargs={
                             'add_xs_tag': add_xs_tag,
                             'align_threads': align_threads,
                             'read_group_info': mgd.TempInputObj('read_group_info', 'lane'),
                             'sort_threads': sort_threads,
                         })

    workflow.transform(name='markdups_and_merge',
                       axes=(),
                       ctx={
                           'mem': 24,
                           'mem_retry_increment': 8,
                           'num_retry': 3,
                           'threads': merge_threads
                       },
                       func=soil.wrappers.sambamba.tasks.markdups,
                       args=(
                           mgd.TempInputFile('lane.bam', 'lane'),
                           mgd.OutputFile(out_bam_file),
                           mgd.TempSpace('markdup_tmp'),
                       ),
                       kwargs={
                           'threads': merge_threads,
                       })

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))
    return workflow
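A hypothetical multi-lane invocation: the input dicts are keyed by lane, each lane is aligned independently along the 'lane' axis, and the per-lane BAMs are then deduplicated and merged into a single output (paths are illustrative):

fastqs_1 = {'L001': 'S1_L001_R1.fq.gz', 'L002': 'S1_L002_R1.fq.gz'}
fastqs_2 = {'L001': 'S1_L001_R2.fq.gz', 'L002': 'S1_L002_R2.fq.gz'}

workflow = create_multiple_lane_align_workflow(
    fastqs_1,
    fastqs_2,
    '/refs/bwa_index_dir',
    'S1.merged.bam',
    align_threads=8,
    merge_threads=4,
)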
Example no. 23
def create_somatic_calling_workflow(samples,
                                    tumours,
                                    normals,
                                    museq_vcf,
                                    museq_maf,
                                    museq_paired_pdf,
                                    strelka_snv_vcf,
                                    strelka_snv_maf,
                                    strelka_indel_vcf,
                                    strelka_indel_maf,
                                    mutect_vcf,
                                    mutect_maf,
                                    somatic_consensus_maf,
                                    refdir,
                                    normal_ids,
                                    tumour_ids,
                                    single_node=False,
                                    is_exome=False):
    strelka_snv_vcf = {sampid: strelka_snv_vcf[sampid] for sampid in samples}
    strelka_indel_vcf = {sampid: strelka_indel_vcf[sampid] for sampid in samples}
    strelka_snv_maf = {sampid: strelka_snv_maf[sampid] for sampid in samples}
    strelka_indel_maf = {sampid: strelka_indel_maf[sampid] for sampid in samples}

    museq_vcf = {sampid: museq_vcf[sampid] for sampid in samples}
    museq_maf = {sampid: museq_maf[sampid] for sampid in samples}
    museq_paired_pdf = {sampid: museq_paired_pdf[sampid] for sampid in samples}

    mutect_vcf = {sampid: mutect_vcf[sampid] for sampid in samples}
    mutect_maf = {sampid: mutect_maf[sampid] for sampid in samples}

    somatic_consensus_maf = {sampid: somatic_consensus_maf[sampid] for sampid in samples}

    chromosomes = config.refdir_data(refdir)['params']['chromosomes']
    paths_refdir = config.refdir_data(refdir)['paths']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.setobj(obj=mgd.TempOutputObj('normal_id',
                                          'sample_id',
                                          axes_origin=[]),
                    value={v: normal_ids[v] for v in samples})

    workflow.setobj(obj=mgd.TempOutputObj('tumour_id',
                                          'sample_id',
                                          axes_origin=[]),
                    value={v: tumour_ids[v] for v in samples})

    workflow.subworkflow(
        name="mutationseq_paired",
        func='wgs.workflows.mutationseq.create_museq_workflow',
        axes=('sample_id', ),
        args=(
            mgd.OutputFile('museq_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=museq_vcf),
            mgd.OutputFile('museq_snv_ann.maf', 'sample_id', fnames=museq_maf),
            mgd.OutputFile('museq_paired_pdf',
                           'sample_id',
                           fnames=museq_paired_pdf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
        ),
        kwargs={
            'normal_id': mgd.TempInputObj('normal_id', 'sample_id'),
            'tumour_id': mgd.TempInputObj('tumour_id', 'sample_id'),
            'tumour_bam': mgd.InputFile("tumour.bam",
                                        'sample_id',
                                        fnames=tumours,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam",
                                        'sample_id',
                                        fnames=normals,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'single_node': single_node,
        })

    workflow.subworkflow(
        name="strelka",
        func='wgs.workflows.strelka.create_strelka_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('normal_bam',
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam',
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai']),
            mgd.OutputFile('strelka_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_snv_vcf),
            mgd.OutputFile('strelka_snv_ann.maf',
                           'sample_id',
                           fnames=strelka_snv_maf),
            mgd.OutputFile('strelka_indel_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=strelka_indel_vcf),
            mgd.OutputFile('strelka_indel_ann.maf',
                           'sample_id',
                           fnames=strelka_indel_maf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
            'is_exome': is_exome
        },
    )

    workflow.subworkflow(
        name="mutect",
        func='wgs.workflows.mutect.create_mutect_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('normal_bam',
                          'sample_id',
                          fnames=normals,
                          extensions=['.bai']),
            mgd.InputFile('tumour_bam',
                          'sample_id',
                          fnames=tumours,
                          extensions=['.bai']),
            mgd.OutputFile('mutect_snv_ann.vcf.gz',
                           'sample_id',
                           extensions=['.csi', '.tbi'],
                           fnames=mutect_vcf),
            mgd.OutputFile('mutect_snv_ann.maf',
                           'sample_id',
                           fnames=mutect_maf),
            paths_refdir['reference'],
            paths_refdir['reference_vep'],
            chromosomes,
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
        kwargs={
            'single_node': single_node,
        },
    )

    workflow.subworkflow(
        name="somatic_consensus",
        func='wgs.workflows.somatic_calling_consensus.create_somatic_consensus_workflow',
        axes=('sample_id', ),
        args=(
            mgd.InputFile('mutect_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=mutect_vcf),
            mgd.InputFile('strelka_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=strelka_snv_vcf),
            mgd.InputFile('strelka_indel_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=strelka_indel_vcf),
            mgd.InputFile('museq_snv_ann.vcf.gz',
                          'sample_id',
                          extensions=['.csi', '.tbi'],
                          fnames=museq_vcf),
            mgd.OutputFile("somatic_consensus.maf",
                           'sample_id',
                           fnames=somatic_consensus_maf),
            chromosomes,
            paths_refdir['reference_vep'],
            mgd.TempInputObj('normal_id', 'sample_id'),
            mgd.TempInputObj('tumour_id', 'sample_id'),
        ),
    )

    return workflow
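The per-sample subsetting at the top of the factory above is a plain dictionary restriction: it narrows each mapping to the requested samples and fails fast with a KeyError if a sample is missing. A toy illustration:

strelka_snv_vcf = {'S1': 's1.vcf.gz', 'S2': 's2.vcf.gz', 'S3': 's3.vcf.gz'}
samples = ['S1', 'S3']

subset = {sampid: strelka_snv_vcf[sampid] for sampid in samples}
assert subset == {'S1': 's1.vcf.gz', 'S3': 's3.vcf.gz'}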
Example no. 24
def realign_bam_files(inputs,
                      outputs,
                      metrics_output,
                      metrics_tar,
                      refdir,
                      samples,
                      single_node=False,
                      ignore_bamtofastq_exception=False,
                      picard_mem=8):
    inputs = {sample: inputs[sample] for sample in samples}
    outputs = {sample: outputs[sample] for sample in samples}
    outputs_tdf = {sample: outputs[sample] + '.tdf' for sample in samples}

    metrics_output = {sample: metrics_output[sample] for sample in samples}
    metrics_tar = {sample: metrics_tar[sample] for sample in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.transform(name='bam_to_fastq',
                       ctx=helpers.get_default_ctx(walltime='96:00', disk=500),
                       func="wgs.workflows.realignment.tasks.split_by_rg",
                       axes=('sample_id', ),
                       args=(mgd.InputFile('input.bam',
                                           'sample_id',
                                           fnames=inputs),
                             mgd.TempOutputFile("inputdata_read1.fastq.gz",
                                                'sample_id', "readgroup"),
                             mgd.TempOutputFile("inputdata_read2.fastq.gz",
                                                'sample_id',
                                                "readgroup",
                                                axes_origin=[]),
                             mgd.TempSpace("bamtofastq", 'sample_id'),
                             ignore_bamtofastq_exception))

    workflow.transform(name='get_sample_info',
                       func="wgs.workflows.realignment.tasks.get_read_group",
                       axes=('sample_id', ),
                       ret=mgd.TempOutputObj('sample_info', 'sample_id'),
                       args=(mgd.InputFile('input.bam',
                                           'sample_id',
                                           fnames=inputs), ))

    workflow.subworkflow(name='align_samples',
                         func=alignment.align_samples,
                         args=(mgd.TempInputFile("inputdata_read1.fastq.gz",
                                                 "sample_id",
                                                 "readgroup",
                                                 axes_origin=[]),
                               mgd.TempInputFile("inputdata_read2.fastq.gz",
                                                 "sample_id",
                                                 "readgroup",
                                                 axes_origin=[]),
                               mgd.OutputFile('output.bam',
                                              'sample_id',
                                              fnames=outputs,
                                              extensions=['.bai'],
                                              axes_origin=[]),
                               mgd.OutputFile('output_metrics.csv',
                                              'sample_id',
                                              fnames=metrics_output,
                                              extensions=['.yaml'],
                                              axes_origin=[]),
                               mgd.OutputFile('output_metrics.tar',
                                              'sample_id',
                                              fnames=metrics_tar,
                                              axes_origin=[]),
                               mgd.OutputFile('output.bam.tdf',
                                              'sample_id',
                                              fnames=outputs_tdf,
                                              axes_origin=[]),
                               mgd.TempInputObj('sample_info',
                                                'sample_id',
                                                axes_origin=[]), refdir),
                         kwargs={
                             'single_node': single_node,
                             'picard_mem': picard_mem
                         })

    return workflow
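The 'get_sample_info' job shows the return-value pattern: each per-sample call returns its read-group information, which ret=mgd.TempOutputObj('sample_info', 'sample_id') captures as a per-axis managed object for align_samples to consume. A hypothetical stand-in for the task, not the actual wgs implementation:

import pysam

def get_read_group(bam_path):
    # Return the first @RG entry from the BAM header, if any.
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        read_groups = bam.header.to_dict().get('RG', [])
    return read_groups[0] if read_groups else None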
Example no. 25
def create_ref_panel_phase_workflow(genetic_map_file, ref_file, target_file, out_file):
    """ Run EAGLE using a reference panel.
    """

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'eagle'])

    workflow = pypeliner.workflow.Workflow(default_ctx=default_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=mgd.TempOutputObj('chrom', 'chrom'),
        value=get_chromosomes(target_file)
    )

    workflow.transform(
        name='split_ref',
        axes=('chrom',),
        func=tasks.get_chrom_variant_file,
        args=(
            mgd.TempInputObj('chrom', 'chrom'),
            mgd.InputFile(ref_file),
            mgd.TempOutputFile('ref.bcf', 'chrom')
        )
    )

    workflow.transform(
        name='split_target',
        axes=('chrom',),
        func=tasks.get_chrom_variant_file,
        args=(
            mgd.TempInputObj('chrom', 'chrom'),
            mgd.InputFile(target_file),
            mgd.TempOutputFile('target.bcf', 'chrom')
        )
    )

    workflow.transform(
        name='run_eagle',
        axes=('chrom',),
        func=tasks.run_eagle,
        args=(
            mgd.InputFile(genetic_map_file),
            mgd.TempInputFile('ref.bcf', 'chrom'),
            mgd.TempInputFile('target.bcf', 'chrom'),
            mgd.TempOutputFile('phased.bcf', 'chrom'),
            mgd.TempSpace('eagle_tmp', 'chrom')
        )
    )

    workflow.transform(
        name='concat_results',
        func=tasks.concat_results,
        args=(
            mgd.TempInputFile('phased.bcf', 'chrom'),
            mgd.OutputFile(out_file)
        )
    )

    workflow.commandline(
        name='index',
        args=(
            'bcftools',
            'index',
            '-t',
            '-o', mgd.OutputFile(out_file + '.tbi'),
            mgd.InputFile(out_file)
        )
    )

    return workflow
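Note that default_ctx is assumed to be defined at module level here. Running this (or any) workflow factory follows the same pattern; a minimal sketch using pypeliner's application wrapper, with hypothetical paths:

import pypeliner

config = {'tmpdir': './pipeline_tmp', 'maxjobs': 4}
pyp = pypeliner.app.Pypeline(config=config)

workflow = create_ref_panel_phase_workflow(
    'genetic_map_hg19.txt.gz',
    'ref_panel.bcf',
    'target.bcf',
    'phased.vcf.gz',
)
pyp.run(workflow)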
Example no. 26
def create_resample_simulation_workflow(
    sim_defs,
    mixture_filename,
    source_filename,
    normal_filename,
    tumour_filename,
    breakpoint_filename,
    config,
    ref_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.TempOutputObj('sim_defs'),
        value=sim_defs,
    )

    workflow.transform(
        name='simulate_germline_alleles',
        ctx={'mem': 8},
        func=remixt.simulations.pipeline.simulate_germline_alleles,
        args=(
            mgd.TempOutputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
            config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='resample_normal_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_normal_data,
        args=(
            mgd.OutputFile(normal_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='resample_tumour_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_tumour_data,
        args=(
            mgd.OutputFile(tumour_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='write_breakpoints',
        func=remixt.simulations.pipeline.write_breakpoints,
        args=(
            mgd.OutputFile(breakpoint_filename),
            mgd.InputFile(mixture_filename),
        ),
    )

    return workflow
Example no. 27
def create_calc_bias_workflow(
    tumour_seqdata_filename,
    segment_filename,
    segment_length_filename,
    config,
    ref_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.transform(
        name='calc_fragment_stats',
        ctx={'mem': 16},
        func=remixt.analysis.stats.calculate_fragment_stats,
        ret=mgd.TempOutputObj('fragstats'),
        args=(
            mgd.InputFile(tumour_seqdata_filename),
            config,
        )
    )

    workflow.transform(
        name='sample_gc',
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.sample_gc,
        args=(
            mgd.TempOutputFile('gcsamples.tsv'),
            mgd.InputFile(tumour_seqdata_filename),
            mgd.TempInputObj('fragstats').prop('fragment_mean'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='gc_lowess',
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.gc_lowess,
        args=(
            mgd.TempInputFile('gcsamples.tsv'),
            mgd.TempOutputFile('gcloess.tsv'),
            mgd.TempOutputFile('gctable.tsv'),
        )
    )

    workflow.transform(
        name='split_segments',
        func=remixt.utils.split_table,
        args=(
            mgd.TempOutputFile('segments.tsv', 'segment_rows_idx'),
            mgd.InputFile(segment_filename),
            100,
        ),
    )

    workflow.transform(
        name='gc_map_bias',
        axes=('segment_rows_idx',),
        ctx={'mem': 16},
        func=remixt.analysis.gcbias.gc_map_bias,
        args=(
            mgd.TempInputFile('segments.tsv', 'segment_rows_idx'),
            mgd.TempInputObj('fragstats').prop('fragment_mean'),
            mgd.TempInputObj('fragstats').prop('fragment_stddev'),
            mgd.TempInputFile('gcloess.tsv'),
            mgd.TempOutputFile('biases.tsv', 'segment_rows_idx'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='merge_biases',
        func=remixt.utils.merge_tables,
        args=(
            mgd.TempOutputFile('biases.tsv'),
            mgd.TempInputFile('biases.tsv', 'segment_rows_idx'),
        ),
    )

    workflow.transform(
        name='biased_length',
        func=remixt.analysis.gcbias.biased_length,
        args=(
            mgd.OutputFile(segment_length_filename),
            mgd.TempInputFile('biases.tsv'),
        ),
    )

    return workflow
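The split/apply/merge pattern above (split_table into 100-row chunks, gc_map_bias applied per chunk along 'segment_rows_idx', merge_tables at the end) can be sketched with plain pandas; the stand-ins below are illustrative, not the remixt implementations:

import pandas as pd

segments = pd.DataFrame({'chromosome': ['1'] * 250, 'start': range(250)})

# split_table: break the table into fixed-size row chunks.
chunks = [segments.iloc[i:i + 100] for i in range(0, len(segments), 100)]

# gc_map_bias stand-in: compute a per-row value on each chunk independently.
processed = [chunk.assign(bias=1.0) for chunk in chunks]

# merge_tables: concatenate the per-chunk results back together.
merged = pd.concat(processed, ignore_index=True)
assert len(merged) == len(segments)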
Example no. 28
def create_titan_workflow(normal_bam_file,
                          tumour_bam_file,
                          dbsnp_vcf_file,
                          mappability_file,
                          ref_genome_fasta_file,
                          out_file,
                          exome_bed_file=None,
                          sample='Tumour',
                          threads=1):

    sandbox = soil.utils.workflow.get_sandbox(
        ['hmmcopy', 'hmmcopy_utils', 'titan'])

    sandbox.channels.append('conda-forge')

    sandbox.packages.extend(['pandas', 'rpy2'])

    chromosomes = soil.utils.genome.load_bam_chromosome_lengths(
        normal_bam_file, 'autosomes')

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('init_params', 'param_idx'),
                    value=tasks.create_intialization_parameters())

    workflow.subworkflow(name='get_allele_counts',
                         func=create_allele_counts_workflow,
                         args=(mgd.InputFile(normal_bam_file),
                               mgd.InputFile(tumour_bam_file),
                               mgd.InputFile(dbsnp_vcf_file),
                               mgd.InputFile(ref_genome_fasta_file),
                               mgd.TempOutputFile('allele_counts.tsv')),
                         kwargs={'chromosomes': 'autosomes'})

    workflow.commandline(name='build_normal_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(normal_bam_file), '>',
                               mgd.TempOutputFile('normal.wig')))

    workflow.commandline(name='build_tumour_wig',
                         args=('readCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(tumour_bam_file), '>',
                               mgd.TempOutputFile('tumour.wig')))

    workflow.commandline(name='build_gc_wig',
                         args=('gcCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(ref_genome_fasta_file), '>',
                               mgd.TempOutputFile('gc.wig')))

    workflow.commandline(name='build_mappability_wig',
                         args=('mapCounter', '-c', ','.join(chromosomes),
                               mgd.InputFile(mappability_file), '>',
                               mgd.TempOutputFile('mappability.wig')))

    workflow.transform(name='build_coverage_file',
                       func=tasks.build_coverage_file,
                       args=(mgd.TempInputFile('normal.wig'),
                             mgd.TempInputFile('tumour.wig'),
                             mgd.TempInputFile('gc.wig'),
                             mgd.TempInputFile('mappability.wig'),
                             mgd.TempOutputFile('coverage.wig')),
                       kwargs={'target_file': exome_bed_file})

    workflow.transform(name='run_titan',
                       axes=('param_idx', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_titan,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('run.tar.gz', 'param_idx'),
                             mgd.TempSpace('titan_tmp', 'param_idx')),
                       kwargs={
                           'is_exome': (exome_bed_file is not None),
                           'sample': sample,
                           'threads': threads
                       })

    workflow.transform(name='build_run_stats_file',
                       func=tasks.build_run_stats_file,
                       args=(mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputObj('init_params', 'param_idx'),
                             mgd.TempOutputFile('stats.tsv')))

    workflow.transform(name='build_output',
                       func=tasks.build_final_results_file,
                       args=(mgd.TempInputFile('coverage.wig'),
                             mgd.TempInputFile('allele_counts.tsv'),
                             mgd.TempInputFile('run.tar.gz', 'param_idx'),
                             mgd.TempInputFile('stats.tsv'),
                             mgd.OutputFile(out_file),
                             mgd.TempSpace('build_results')))

    return workflow
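The wig-building jobs rely on pypeliner interpreting the '>' argument as stdout redirection. Outside of pypeliner, the equivalent of 'build_normal_wig' is a direct readCounter call with stdout captured (chromosome list and paths hypothetical; the workflow derives the list from the BAM header):

import subprocess

chromosomes = ['1', '2', '3']
with open('normal.wig', 'w') as wig:
    subprocess.check_call(
        ['readCounter', '-c', ','.join(chromosomes), '/data/normal.bam'],
        stdout=wig,
    )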
Example no. 29
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename,
        segs_pdf, bias_pdf, plot_heatmap_ec_output,
        plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar,
        cell_ids, hmmparams, sample_info
):
    chromosomes = hmmparams["chromosomes"]

    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    workflow.transform(
        name='run_hmmcopy',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_docker
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='add_mappability_bool',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='merge_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='get_max_cn',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
        )
    )

    workflow.transform(
        name='hmmcopy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn")
        }
    )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info
        }
    )

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias']
        )
    )

    workflow.transform(
        name='create_igv_seg',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        )
    )

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs")
        ),
    )

    return workflow
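The 'get_max_cn' job illustrates capturing a transform's Python return value as a managed object (ret=mgd.TempOutputObj('max_cn')) that later jobs consume via mgd.TempInputObj('max_cn'). A hypothetical stand-in for the task; the real column name may differ:

import pandas as pd

def get_max_cn(reads_filename):
    # Largest copy-number state observed across all cells.
    reads = pd.read_csv(reads_filename)  # pandas reads .csv.gz transparently
    return int(reads['state'].max())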
Example no. 30
def create_destruct_workflow(normal_stats,
                             normal_reads_1,
                             normal_reads_2,
                             normal_sample_1,
                             normal_sample_2,
                             tumour_stats,
                             tumour_reads_1,
                             tumour_reads_2,
                             tumour_sample_1,
                             tumour_sample_2,
                             destruct_config,
                             ref_data_directory,
                             breakpoints_filename,
                             breakpoints_library_filename,
                             cell_counts_filename,
                             raw_data_directory,
                             normal_sample_id='normal',
                             tumour_sample_id='tumour',
                             tumour_library_id='tumour'):
    tumour_sample_id = '_'.join([tumour_sample_id, tumour_library_id])
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name="get_destruct_config",
                       func="destruct.defaultconfig.get_config",
                       ret=mgd.TempOutputObj("destruct_config"),
                       args=(ref_data_directory, destruct_config))

    workflow.subworkflow(
        name='destruct',
        func="destruct.workflow.create_destruct_fastq_workflow",
        ctx={'disk': 200},
        args=(
            {
                normal_sample_id: mgd.InputFile(normal_reads_1),
                tumour_sample_id: mgd.InputFile(tumour_reads_1),
            },
            {
                normal_sample_id: mgd.InputFile(normal_reads_2),
                tumour_sample_id: mgd.InputFile(tumour_reads_2),
            },
            {
                normal_sample_id: mgd.InputFile(normal_sample_1),
                tumour_sample_id: mgd.InputFile(tumour_sample_1),
            },
            {
                normal_sample_id: mgd.InputFile(normal_sample_2),
                tumour_sample_id: mgd.InputFile(tumour_sample_2),
            },
            {
                normal_sample_id: mgd.InputFile(normal_stats),
                tumour_sample_id: mgd.InputFile(tumour_stats),
            },
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            mgd.TempInputObj("destruct_config"),
            ref_data_directory,
        ),
        kwargs={
            'raw_data_dir': raw_data_directory,
        },
    )

    workflow.transform(
        name='filter_annotate_breakpoints',
        ctx={'mem': 8},
        func="biowrappers.components.breakpoint_calling.destruct.tasks.filter_annotate_breakpoints",
        args=(
            pypeliner.managed.TempInputFile('breakpoint_table'),
            pypeliner.managed.TempInputFile('breakpoint_library_table'),
            [normal_sample_id],
            pypeliner.managed.TempOutputFile('breakpoints_filename.csv'),
            pypeliner.managed.TempOutputFile(
                'breakpoints_library_filename.csv'),
        ),
    )

    workflow.transform(
        name='filter_breakpoint_reads',
        ctx={'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.filter_reads_file",
        args=(
            mgd.TempInputFile('breakpoint_read_table'),
            pypeliner.managed.TempInputFile('breakpoints_filename.csv'),
            mgd.TempOutputFile('breakpoint_read_table_filtered'),
        ),
    )

    workflow.transform(
        name='extract_cell_counts',
        ctx={'mem': 8},
        func="single_cell.workflows.destruct_singlecell.tasks.extract_cell_counts",
        args=(
            mgd.TempInputFile('breakpoint_read_table_filtered'),
            mgd.TempOutputFile('cell_counts_filename.csv'),
        ),
    )

    workflow.transform(
        name='prep_cell_counts',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            mgd.TempInputFile('cell_counts_filename.csv'),
            mgd.TempOutputFile("cell_counts_prep.csv.gz",
                               extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_cell_counts',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            mgd.TempInputFile("cell_counts_prep.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(cell_counts_filename, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='prep_breakpoints',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            pypeliner.managed.TempInputFile('breakpoints_filename.csv'),
            pypeliner.managed.TempOutputFile(
                "breakpoints_filename_prep.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            pypeliner.managed.TempInputFile("breakpoints_filename_prep.csv.gz",
                                            extensions=['.yaml']),
            pypeliner.managed.OutputFile(breakpoints_filename,
                                         extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='prep_breakpoints_library',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.prep_csv_files",
        args=(
            pypeliner.managed.TempInputFile(
                'breakpoints_library_filename.csv'),
            pypeliner.managed.TempOutputFile('breakpoints_library_prep.csv.gz',
                                             extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='finalize_breakpoints_library',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func="single_cell.utils.csvutils.finalize_csv",
        args=(
            pypeliner.managed.TempInputFile('breakpoints_library_prep.csv.gz',
                                            extensions=['.yaml']),
            pypeliner.managed.OutputFile(breakpoints_library_filename,
                                         extensions=['.yaml']),
        ),
    )

    return workflow
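Conceptually, 'filter_breakpoint_reads' and 'extract_cell_counts' reduce the read table to reads supporting retained breakpoints, then count support per breakpoint per cell. A pandas sketch with hypothetical column names; the real task implementations may differ:

import pandas as pd

reads = pd.read_csv('breakpoint_read_table.tsv', sep='\t')
breakpoints = pd.read_csv('breakpoints_filename.csv')

# Keep only reads whose prediction survives breakpoint filtering.
filtered = reads[reads['prediction_id'].isin(breakpoints['prediction_id'])]

# Count supporting reads per (breakpoint, cell).
cell_counts = (filtered.groupby(['prediction_id', 'cell_id'])
               .size()
               .reset_index(name='read_count'))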