Example #1
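These snippets build pypeliner workflows but omit their module-level imports. The imports below are assumed throughout (inferred from the code; each snippet also relies on task and utility modules from its own package, which are not shown):

# Imports assumed by these snippets (not present in the originals):
import pypeliner
import pypeliner.workflow
import pypeliner.managed as mgd
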
def create_merge_bams_workflow(
        input_bams,
        merged_bams,
        regions,
        config,
):
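    """Build a workflow that merges per-cell bams into per-region bams.

    If config['one_split_job'] is set, all regions are merged in one
    multi-core job; otherwise one merge job is run per region.
    """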
    merged_bams = {region: merged_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(input_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    one_split_job = config["one_split_job"]

    if one_split_job:
        workflow.transform(
            name='merge_bams',
            ctx={'mem': config['memory']['med'], 'ncpus': config['max_cores']},
            func="single_cell.workflows.merge_bams.tasks.merge_bams",
            args=(
                mgd.InputFile('bam', 'cell_id', fnames=input_bams, extensions=['.bai']),
                mgd.OutputFile('merged.bam', "region", fnames=merged_bams, axes_origin=[], extensions=['.bai']),
                regions,
                mgd.TempSpace("merge_bams_tempdir")
            ),
            kwargs={"ncores": config["max_cores"]}
        )
    else:
        workflow.transform(
            name='split_merge_tumour',
            func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
            axes=('region',),
            args=(
                mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'], fnames=input_bams),
                mgd.OutputFile(
                    'tumour_regions.bam', 'region', axes_origin=[], extensions=['.bai'], fnames=merged_bams),
                mgd.Instance('region'),
            ),
        )

    return workflow


def create_cell_region_merge_workflow(
    cell_bams,
    region_bams,
    regions,
    docker_image,
):
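    """Build a workflow that merges per-cell bams into per-region bams,
    running one merge job per region.
    """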
    region_bams = {region: region_bams[region] for region in regions}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(cell_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='split_merge_tumour',
        func='single_cell.workflows.merge_bams.tasks.cell_region_merge_bams',
        axes=('region', ),
        args=(mgd.InputFile('tumour_cells.bam',
                            'cell_id',
                            extensions=['.bai'],
                            fnames=cell_bams),
              mgd.OutputFile('tumour_regions.bam',
                             'region',
                             axes_origin=[],
                             extensions=['.bai'],
                             fnames=region_bams), mgd.Instance('region'),
              docker_image),
    )

    return workflow
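
The factory functions above only build a Workflow object; executing it is left to the caller. Below is a minimal, assumed sketch of how such a workflow is typically run with pypeliner's application runner; the config values, file paths, and the pypeliner.app.Pypeline options are illustrative placeholders, not part of the original examples.

# Illustrative driver (assumed, not taken from the original examples):
import pypeliner
import pypeliner.app


if __name__ == '__main__':
    config = {'one_split_job': True, 'max_cores': 8, 'memory': {'med': 8}}
    input_bams = {'cell_1': '/data/cell_1.bam', 'cell_2': '/data/cell_2.bam'}
    regions = ['1-1-5000000', '1-5000001-10000000']
    merged_bams = {r: '/data/merged_{}.bam'.format(r) for r in regions}

    workflow = create_merge_bams_workflow(input_bams, merged_bams, regions, config)

    # pypeliner's runner; the exact constructor options depend on the
    # installed pypeliner version.
    pyp = pypeliner.app.Pypeline(config={'tmpdir': './pipeline_tmp'})
    pyp.run(workflow)
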
Example #3
def create_delly_wrapper_workflow(bam_filenames, output_filename, raw_data_dir, control_id=None, ref_genome_fasta_file=None, delly_excl_chrom=None):
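    """Build a workflow that runs delly per SV type on the given bams,
    optionally filters somatic calls against the control sample, and writes
    the concatenated, converted calls to output_filename.
    """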
    bams = list()
    for lib_id, bam_filename in bam_filenames.items():
        bams += [destruct.benchmark.wrappers.utils.symlink(bam_filename, link_name='{0}.bam'.format(lib_id), link_directory=raw_data_dir)]
        destruct.benchmark.wrappers.utils.symlink(bam_filename+'.bai', link_name='{0}.bam.bai'.format(lib_id), link_directory=raw_data_dir)

    workflow = pypeliner.workflow.Workflow()
    
    workflow.transform(
        name='get_sv_types',
        func=destruct.benchmark.wrappers.delly.tasks.get_sv_types,
        ret=pypeliner.managed.OutputChunks('sv_type'),
        args=(
            mgd.InputFile(ref_genome_fasta_file),
        ),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type',),
        ctx={'mem': 64, 'num_retry': 2, 'mem_retry_factor': 2},
        func=destruct.benchmark.wrappers.delly.tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    if control_id is None:
        concat_input = mgd.TempInputFile('out.bcf', 'sv_type')

    else:
        workflow.transform(
            name='delly_filter_somatic',
            axes=('sv_type',),
            ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
            func=destruct.benchmark.wrappers.delly.tasks.run_delly_filter,
            args=(
                mgd.Instance('sv_type'),
                list(bam_filenames.keys()),
                control_id,
                mgd.TempSpace('samples.tsv'),
                ref_genome_fasta_file,
                mgd.TempInputFile('out.bcf', 'sv_type'),
                mgd.TempOutputFile('somatic.bcf', 'sv_type'),
            ),
        )

        concat_input = mgd.TempInputFile('somatic.bcf', 'sv_type')

    workflow.transform(
        name='concatenate_vcf',
        func=destruct.benchmark.wrappers.tasks.concatenate_bcf,
        ctx={'mem': 4, 'num_retry': 2, 'mem_retry_factor': 2},
        args=(
            concat_input,
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=destruct.benchmark.wrappers.delly.tasks.convert_vcf,
        ctx={'mem': 4, 'num_retry': 3, 'mem_retry_increment': 2},
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        }
    )

    return workflow
Example #4
def create_snv_allele_counts_for_vcf_targets_workflow(
        config,
        bam_files,
        vcf_file,
        out_file,
        docker_config=None,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        min_bqual=0,
        min_mqual=0,
        split_size=int(1e7),
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None):
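    """Count SNV alleles at the positions in vcf_file for every cell bam and
    concatenate the per-cell HDF5 count tables into out_file.
    """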

    ctx = {
        'mem': 2,
        'num_retry': 3,
        'mem_retry_increment': 2,
        'pool_id': config['pools']['standard'],
        'ncpus': 1
    }
    if docker_config:
        ctx.update(docker_config)

    workflow = pypeliner.workflow.Workflow(default_ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('cell_id', ),
        func=
        "biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'report_zero_count_positions': False,
        })

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={
            'mem': config["memory"]['high'],
            'pool_id': config['pools']['highmem'],
            'ncpus': 1
        },
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'cell_id'),
            mgd.OutputFile(out_file),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    return workflow
Example #5
def destruct_multi_sample_workflow(
    normal_bam,
    tumour_bam_files,
    destruct_config,
    config,
    destruct_ref_data_dir,
    breakpoints_csv,
    breakpoints_library_csv,
    cell_counts_csv,
    raw_data_dir,
    normal_sample_id='normal',
):
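    """Run destruct on a normal bam and per-cell tumour bams grouped by
    (sample_id, library_id): preprocess reads from the bams, then call
    breakpoints per sample/library against the shared normal.
    """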
    ctx = {'docker_image': config['docker']['destruct']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    keys = [(sample_id, library_id)
            for (sample_id, library_id, _) in list(tumour_bam_files.keys())]
    keys = sorted(set(keys))

    breakpoints_csv = {key: breakpoints_csv(*key) for key in keys}
    breakpoints_library_csv = {
        key: breakpoints_library_csv(*key) for key in keys}
    cell_counts_csv = {key: cell_counts_csv(*key) for key in keys}

    workflow.set_filenames('tumour_cells.bam',
                           'sample_id',
                           'library_id',
                           'cell_id',
                           fnames=tumour_bam_files)
    workflow.set_filenames('breakpoints.csv',
                           'sample_id',
                           'library_id',
                           fnames=breakpoints_csv)
    workflow.set_filenames('breakpoints_library.csv',
                           'sample_id',
                           'library_id',
                           fnames=breakpoints_library_csv)
    workflow.set_filenames('cell_counts.csv',
                           'sample_id',
                           'library_id',
                           fnames=cell_counts_csv)

    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func=
        'single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            normal_bam,
            mgd.TempOutputFile('normal_stats'),
            mgd.TempOutputFile('normal_reads_1.fastq.gz'),
            mgd.TempOutputFile('normal_reads_2.fastq.gz'),
            mgd.TempOutputFile('normal_sample_1.fastq.gz'),
            mgd.TempOutputFile('normal_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func=
        'single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai']),
            mgd.TempOutputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_1.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_reads_2.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_sample_1.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_sample_2.fastq.gz', 'sample_id',
                               'library_id'),
            destruct_ref_data_dir,
            destruct_config,
        ),
        kwargs={'tag': True})

    workflow.subworkflow(
        name='run_destruct',
        func=
        'single_cell.workflows.destruct_singlecell.create_destruct_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz', 'sample_id',
                              'library_id'),
            destruct_config,
            destruct_ref_data_dir,
            mgd.OutputFile('breakpoints.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('breakpoints_library.csv', 'sample_id',
                           'library_id'),
            mgd.OutputFile('cell_counts.csv', 'sample_id', 'library_id'),
            mgd.Template(raw_data_dir, 'sample_id', 'library_id'),
        ),
        kwargs={
            'tumour_sample_id': mgd.Instance('sample_id'),
            'tumour_library_id': mgd.Instance('library_id'),
            'normal_sample_id': normal_sample_id,
        },
    )

    return workflow


def create_snv_allele_counts_for_vcf_targets_workflow(
    bam_files,
    vcf_file,
    out_file,
    memory_cfg,
    count_duplicates=False,
    min_bqual=0,
    min_mqual=0,
    table_name='snv_allele_counts',
    vcf_to_bam_chrom_map=None,
):
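    """Count SNV alleles at the positions in vcf_file for every cell bam,
    merge the per-cell HDF5 count tables, and convert the merged table to
    the CSV output out_file.
    """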
    ctx = {
        'mem': memory_cfg['low'],
        'num_retry': 3,
        'mem_retry_increment': 2,
        'ncpus': 1,
        'disk_retry_increment': 50,
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('sample_id', 'library_id', 'cell_id'),
        func=
        "biowrappers.components.variant_calling.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets",
        args=(
            mgd.InputFile('tumour.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          fnames=bam_files,
                          extensions=['.bai']),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'sample_id', 'library_id',
                               'cell_id'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
            'cell_id': mgd.Instance('cell_id'),
            'sample_id': mgd.Instance('sample_id'),
            'library_id': mgd.Instance('library_id'),
            'report_zero_count_positions': False,
        })

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx={
            'mem': memory_cfg['high'],
            'disk': 20
        },
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            mgd.TempInputFile('counts.h5', 'sample_id', 'library_id',
                              'cell_id'),
            mgd.TempOutputFile('merged_counts.h5'),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    workflow.transform(name='convert_h5_to_csv',
                       func='single_cell.utils.hdfutils.convert_hdf_to_csv',
                       args=(mgd.TempInputFile('merged_counts.h5'), {
                           '/snv_allele_counts':
                           mgd.OutputFile(out_file, extensions=['.yaml']),
                       }))

    return workflow
Example #7
def delly_pipeline(
    normal_bam_file,
    tumour_bam_files,
    ref_genome_fasta_file,
    delly_excl_chrom,
    out_file,
    raw_data_dir,
):
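    """Run delly per SV type on the tumour and normal bams, filter for
    somatic calls using the generated samples table, and write the
    concatenated, converted calls to out_file.
    """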
    bams = list()
    for lib_id, bam_filename in tumour_bam_files.items():
        bams += [
            utils.symlink(bam_filename,
                          link_name='{0}.bam'.format(lib_id),
                          link_directory=raw_data_dir)
        ]
        utils.symlink(bam_filename + '.bai',
                      link_name='{0}.bam.bai'.format(lib_id),
                      link_directory=raw_data_dir)

    bams += [
        utils.symlink(normal_bam_file,
                      link_name='Normal.bam',
                      link_directory=raw_data_dir)
    ]
    utils.symlink(normal_bam_file + '.bai',
                  link_name='Normal.bam.bai',
                  link_directory=raw_data_dir)

    sample_type = {'Normal': 'control'}
    for lib_id in tumour_bam_files.keys():
        sample_type[lib_id] = 'tumor'

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('sample_type', 'sample_id'),
        value=sample_type,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sv_type'),
        value=('DEL', 'DUP', 'INV', 'TRA', 'INS'),
    )

    workflow.transform(
        name='delly_call',
        axes=('sv_type', ),
        ctx={
            'mem': 64,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_call,
        args=(
            mgd.Instance('sv_type'),
            delly_excl_chrom,
            ref_genome_fasta_file,
            [mgd.InputFile(bam) for bam in bams],
            mgd.TempOutputFile('out.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='write_samples_table',
        ctx={'mem': 1},
        func=tasks.write_samples_table,
        args=(
            mgd.TempInputObj('sample_type', 'sample_id'),
            mgd.TempOutputFile('samples.tsv'),
        ),
    )

    workflow.transform(
        name='delly_filter_somatic',
        axes=('sv_type', ),
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        func=tasks.run_delly_filter,
        args=(
            mgd.Instance('sv_type'),
            mgd.TempInputFile('samples.tsv'),
            ref_genome_fasta_file,
            mgd.TempInputFile('out.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf', 'sv_type'),
        ),
    )

    workflow.transform(
        name='concatenate_vcf',
        func=vcf_tasks.concatenate_bcf,
        ctx={
            'mem': 4,
            'num_retry': 2,
            'mem_retry_factor': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf', 'sv_type'),
            mgd.TempOutputFile('somatic.bcf'),
        ),
    )

    workflow.transform(
        name='convert_vcf',
        func=tasks.convert_vcf,
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            mgd.TempInputFile('somatic.bcf'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #8
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=([mgd.InputFile(vcf_file) for vcf_file in vcf_files],
              mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi',
                                                               '.csi']),
              mgd.TempSpace("merge_vcf_temp")),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func=
        'single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                }
            }
        })

    return workflow