Example #1
def infer_haps_workflow(args):
    config = helpers.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               docker_image=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results",
                                          "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile('tumour.bam',
                                 'cell_id',
                                 fnames=bam_file,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
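
A hypothetical driver for the workflow above; the argument keys are assumptions inferred from how the function reads args (the real pipeline builds this dict from its command line), so treat this as a sketch rather than the pipeline's actual entry point.

import pypeliner
import pypeliner.app

args = {
    'input_yaml': 'inputs.yaml',   # hypothetical: tumour/normal cell and WGS bam paths
    'out_dir': './output',         # results land under <out_dir>/infer_haps/results
    'normal': True,                # infer haplotypes from the normal sample
}

pyp = pypeliner.app.Pypeline(config=args)
pyp.run(infer_haps_workflow(args))
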
def merge_bams_workflow(args):
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames("merged.bam", "region", fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('bam_markdups',
                                           'cell_id',
                                           fnames=bam_files,
                                           extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai']),
                             mgd.TempInputObj("region"),
                             config,
                         ))

    workflow.transform(name="get_files",
                       ctx={'mem': config['memory']['med']},
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(pypeliner.managed.InputChunks('region'),
                             wgs_bams, 'region'))

    return workflow
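
Every workflow in this file consumes helpers.load_pseudowgs_input the same way, so its return shape can be read off the call sites; the sketch below is inferred from those sites only, and the keys, paths, and cell ids are illustrative assumptions.

# Inferred shape of load_pseudowgs_input's result; not the real loader.
data = {
    'tumour_wgs': '/data/tumour_{region}.bam',    # region template, or a dict keyed by region
    'normal_wgs': '/data/normal_{region}.bam',
    'tumour_cells': {                             # per-cell bams keyed by cell id
        'cell_001': '/data/cells/cell_001.bam',
    },
    'normal_cells': None,                         # missing inputs come back falsy
}

# merge_bams_workflow picks cell bams over bulk wgs with exactly this fallback:
bam_files = data['tumour_cells'] if data['tumour_cells'] else data['normal_cells']
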
def variant_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['variant_calling']

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_bams = data['tumour_wgs']
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')

    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(normal_bams, dict) and isinstance(tumour_bams, dict):
        assert list(normal_bams.keys()) == list(tumour_bams.keys()), \
            'keys for tumour and normal bams should be the same'
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam',
                               'region',
                               fnames=normal_bams)
        workflow.set_filenames('tumour_split.bam',
                               'region',
                               fnames=tumour_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam',
                               'region',
                               template=normal_bams)
        assert '{region}' in tumour_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('tumour_split.bam',
                               'region',
                               template=tumour_bams)

    workflow.subworkflow(
        func=create_variant_calling_workflow,
        name='create_varcall',
        args=(
            tumour_cells,
            mgd.InputFile('tumour_split.bam', 'region', extensions=['.bai']),
            mgd.InputFile('normal_split.bam', 'region', extensions=['.bai']),
            mgd.OutputFile(museq_vcf),
            mgd.OutputFile(strelka_snv_vcf),
            mgd.OutputFile(strelka_indel_vcf),
            mgd.OutputFile(snv_h5),
            mgd.OutputFile(meta_yaml),
            config,
            raw_data_dir,
        ),
    )

    return workflow
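
Three of these workflows shard work across genomic regions via single_cell.utils.pysamutils.get_regions_from_reference. A rough stand-in for what such a helper does, assuming pysam and the chrom-start-end labels implied by the '{region}' filename templates; the real implementation may differ.

import pysam

def get_regions_from_reference(ref_genome, split_size, chromosomes):
    # Walk each chromosome of the indexed fasta in fixed-size windows and
    # emit region labels that can fill a '{region}' filename template.
    fasta = pysam.FastaFile(ref_genome)
    regions = []
    for chrom in chromosomes:
        length = fasta.get_reference_length(chrom)
        for start in range(1, length + 1, split_size):
            end = min(start + split_size - 1, length)
            regions.append('{}-{}-{}'.format(chrom, start, end))
    return regions
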
Example #4
def copy_number_calling_workflow(args):

    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
           'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    assert '{region}' in normal_wgs


    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
              remixt_config,
              config['ref_data_dir'],
              'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
              remixt_config,
              'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
              remixt_config,
              'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
              remixt_config,
              'bam_check_proper_pair'
        )
    )


    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_cells,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'region',
                template=normal_wgs,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("normal.h5", "region"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config,
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            list(tumour_cells.keys()),
            mgd.InputChunks('region'),
            cloneid
        ),
    )

    pyp.run(workflow)
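
The three get_param transforms above only pull scalar settings out of the remixt config into temp objects. A hypothetical stand-in for the lookup-with-fallback idea; the default values here are invented for illustration and are not remixt's real defaults.

def get_param(remixt_config, name):
    # Invented defaults; remixt.config.get_param resolves missing keys
    # against the package's own defaults instead.
    defaults = {
        'bam_max_fragment_length': 1000,
        'bam_max_soft_clipped': 8,
        'bam_check_proper_pair': True,
    }
    return remixt_config.get(name, defaults[name])

assert get_param({'bam_max_soft_clipped': 4}, 'bam_max_soft_clipped') == 4
assert get_param({}, 'bam_check_proper_pair') is True
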
def germline_calling_workflow(args):
    config = helpers.load_config(args)
    config = config['germline_calling']

    baseimage = config['docker']['single_cell_pipeline']

    basedocker = {'docker_image': baseimage}
    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'disk_retry_increment': 50,
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    if not isinstance(normal_bams, dict):
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam',
                               'region',
                               template=normal_bams)
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam',
                               'region',
                               fnames=normal_bams)

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')

    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw',
                                         'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw',
                                            'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(name='samtools_germline',
                         func=germline.create_samtools_germline_workflow,
                         args=(
                             mgd.InputFile("normal_split.bam",
                                           "region",
                                           extensions=['.bai']),
                             config['ref_genome'],
                             mgd.OutputFile(samtools_germline_vcf,
                                            extensions=['.tbi']),
                             config,
                         ),
                         kwargs={
                             'vcftools_docker': vcftoolsdocker,
                             'samtools_docker': samtoolsdocker,
                         })

    workflow.subworkflow(
        name='annotate_mappability',
        func=
        "biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': basedocker,
            'chromosomes': config['chromosomes']
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func=
        "biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': basedocker,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.subworkflow(
        name='read_counts',
        func=
        "single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            mgd.InputFile('tumour.bam',
                          'cell_id',
                          fnames=tumour_cells,
                          extensions=['.bai']),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
            config['memory'],
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    pyp.run(workflow)
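
The final build_results_file step funnels the counts, mappability, and genotype HDF5 outputs into one store. A rough pandas sketch of that idea, assuming each input store holds a single table; the real biowrappers task also handles multi-table stores and chunked reads.

import pandas as pd

def concatenate_tables(in_files, out_file, drop_duplicates=False):
    # Read each single-table HDF5 store, stack the rows, optionally
    # de-duplicate, and write one combined table.
    frames = [pd.read_hdf(path) for path in in_files]
    df = pd.concat(frames, ignore_index=True, sort=False)
    if drop_duplicates:
        df = df.drop_duplicates()
    df.to_hdf(out_file, key='table', format='table')
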
Example #6
def breakpoint_calling_workflow(args):
    run_destruct = args['destruct']
    run_lumpy = args['lumpy']
    if not any((run_destruct, run_lumpy)):
        run_destruct = True
        run_lumpy = True

    config = helpers.load_config(args)
    config = config['breakpoint_calling']

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_cells = data['tumour_cells']
    tumour_cells_id = data['tumour_cells_id']

    normal_bams = data['normal_wgs'] or data['normal_cells']
    normal_id = data['normal_wgs_id'] or data['normal_cells_id']

    calls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(calls_dir, 'raw')
    breakpoints_filename = os.path.join(calls_dir, 'breakpoints.h5')
    breakpoints_lib_filename = os.path.join(calls_dir, 'breakpoints_lib.h5')
    cell_counts_filename = os.path.join(calls_dir, 'cell_counts.h5')

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_cells.bam',
                               'normal_cell_id',
                               fnames=normal_bams)
        normal_bam = mgd.InputFile('normal_cells.bam',
                                   'normal_cell_id',
                                   extensions=['.bai'])
    else:
        normal_bam = mgd.InputFile(normal_bams, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['destruct']},
            func=
            "single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells),
                config.get('destruct', {}),
                ref_data_directory,
                mgd.OutputFile(breakpoints_filename),
                mgd.OutputFile(breakpoints_lib_filename),
                mgd.OutputFile(cell_counts_filename),
                raw_data_directory,
            ),
        )

    if run_lumpy:
        breakpoints_bed = os.path.join(calls_dir, 'lumpy_breakpoints.bed')
        breakpoints_csv = os.path.join(calls_dir, 'lumpy_breakpoints.csv.gz')
        breakpoints_evidence_csv = os.path.join(
            calls_dir, 'lumpy_breakpoints_evidence.csv.gz')

        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells,
                              extensions=['.bai']),
                normal_bam,
                mgd.OutputFile(breakpoints_bed),
                mgd.OutputFile(breakpoints_csv),
                mgd.OutputFile(breakpoints_evidence_csv),
            ),
            kwargs={
                'tumour_id': tumour_cells_id,
                'normal_id': normal_id
            })

    return workflow
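
A hypothetical driver for the breakpoint workflow; note that the flag handling at the top of the function means leaving both caller flags off enables destruct and lumpy together (argument keys are assumptions, as in the earlier sketch).

import pypeliner
import pypeliner.app

args = {
    'input_yaml': 'inputs.yaml',   # hypothetical input description
    'out_dir': './output',
    'destruct': False,             # both False -> both callers run
    'lumpy': False,
}

pyp = pypeliner.app.Pypeline(config=args)
pyp.run(breakpoint_calling_workflow(args))
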