def variant_counting_workflow(args):
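    # Collect the per-cell bams and input VCFs, then delegate to
    # create_variant_counting_workflow, writing results/variant_counting/counts.h5.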
    config = helpers.load_config(args)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    vcfs = args['input_vcfs']
    results_file = os.path.join(args['out_dir'], 'results', 'variant_counting',
                                'counts.h5')

    return create_variant_counting_workflow(vcfs, bam_files, results_file,
                                            meta_yaml, config)
Example #2
def infer_haps_workflow(args):
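    # Infer haplotypes: pick the normal or tumour bam(s) based on args['normal'],
    # handle per-cell vs single-bam inputs, and run the infer_haps subworkflow to
    # produce haplotypes.tsv and allele_counts.tsv.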
    config = helpers.load_config(args)
    config = config['infer_haps']
    baseimage = config['docker']['single_cell_pipeline']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               baseimage=baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    haps_dir = os.path.join(args["out_dir"], "infer_haps")
    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results",
                                          "allele_counts.tsv")

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    if args['normal']:
        bam_file = normal_cells if normal_cells else normal_wgs
    else:
        bam_file = tumour_cells if tumour_cells else tumour_wgs

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )
        bam_file = mgd.InputFile('tumour.bam',
                                 'cell_id',
                                 fnames=bam_file,
                                 extensions=['.bai'])
    else:
        bam_file = mgd.InputFile(bam_file, extensions=['.bai'])

    workflow.subworkflow(
        name='infer_haps',
        func=infer_haps,
        args=(
            bam_file,
            mgd.OutputFile(haplotypes_filename),
            mgd.OutputFile(allele_counts_filename),
            config,
        ),
        kwargs={'normal': args['normal']},
    )

    return workflow
Example #3
def split_bam_workflow(args):
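    # Split a WGS bam into chunks, by region if the output template contains
    # {region}, otherwise by read count; delegates to split_bams.create_split_workflow.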
    workflow = pypeliner.workflow.Workflow()
    config = helpers.load_config(args)
    config = config['split_bam']

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = args["split_bam_template"]

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]

        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )

    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': config['memory']['low'],
                'ncpus': 1,
                'docker_image': baseimage
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(name="split_normal",
                         func=split_bams.create_split_workflow,
                         args=(
                             mgd.InputFile(args['wgs_bam']),
                             mgd.OutputFile("normal.split.bam",
                                            splitkeyword,
                                            template=split_bam_template,
                                            axes_origin=[]),
                             pypeliner.managed.TempInputObj(splitkeyword),
                             config,
                         ),
                         kwargs={"by_reads": by_reads})

    return workflow
Example #4
def variant_calling_workflow(args):
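    # Variant calling across per-region tumour/normal bams: museq and strelka
    # outputs are passed to create_variant_calling_workflow along with snv_annotations.h5.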

    config = helpers.load_config(args)

    ctx = {'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    cellids = helpers.get_samples(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')

    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    wgs_bam_template = args["tumour_template"]
    normal_bam_template = args["normal_template"]

    regions = refgenome.get_split_regions(config["split_size"])

    tumour_region_bams = {
        r: wgs_bam_template.format(region=r)
        for r in regions
    }
    normal_region_bams = {
        r: normal_bam_template.format(region=r)
        for r in regions
    }

    return create_variant_calling_workflow(
        bam_files,
        tumour_region_bams,
        normal_region_bams,
        museq_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        snv_h5,
        config,
        raw_data_dir,
    )
def ltm_workflow(args):
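    # LTM workflow: build copy-number trees and cellscape visualization inputs
    # from per-timepoint hmmcopy results via ltm.create_ltm_workflow.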
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_config(args)

    hmmcopy, timepoints = ltmutils.read_input_file(args['input_csv'])

    cn_matrix = os.path.join(args['out_dir'], 'cn_matrix.csv')
    output_gml = os.path.join(args['out_dir'], 'tree.gml')
    output_rooted_gml = os.path.join(args['out_dir'], 'rooted_tree.gml')

    # Outputs required for visualization with cellscape
    cnv_annots_csv = os.path.join(args['out_dir'], 'cnv_annots.csv')
    cnv_tree_edges_csv = os.path.join(args['out_dir'], 'cnv_tree_edges.csv')
    cnv_data_csv = os.path.join(args['out_dir'], 'cnv_data.csv')
    output_rmd = os.path.join(args['out_dir'], 'cellscape.Rmd')
    root_id_file = os.path.join(args['out_dir'], 'root_id.txt')

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=timepoints,
    )

    workflow.subworkflow(
        name='ltm_scale',
        func=ltm.create_ltm_workflow,
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.OutputFile(cn_matrix),
            mgd.OutputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.OutputFile(output_rmd),
            config,
            args['root_id'],
            mgd.OutputFile(root_id_file),
            args['number_of_jobs'],
            args['ploidy'],
        ),
    )

    return workflow
def multi_sample_pipeline(args):
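    # Multi-sample pipeline: run the combined workflow (destruct, lumpy, haplotype
    # and variant calling outputs) on normal and per-cell tumour bams, then
    # generate the meta files.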
    data = helpers.load_yaml(args['input_yaml'])

    tumour_cell_bams = load_tumour_data(data)
    normal_sample_id, normal_libraries, normal_bams = load_normal_data(data)

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = create_multi_sample_workflow(
        normal_bams,
        tumour_cell_bams,
        helpers.load_config(args),
        destruct_dir=args['destruct_output'],
        lumpy_dir=args['lumpy_output'],
        haps_dir=args['haps_output'],
        varcall_dir=args["variants_output"],
        normal_sample_id=normal_sample_id)

    pyp.run(workflow)

    generate_meta_files(normal_sample_id, normal_libraries, tumour_cell_bams,
                        args)
Example #7
def split_bam_workflow(workflow, args):
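    # Split-bam workflow variant that also tracks .bai files explicitly and
    # records the split outputs in info.yaml.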

    config = helpers.load_config(args)

    info_file = os.path.join(args["out_dir"], 'results', 'split_bam',
                             'info.yaml')
    split_bam_template = args["split_bam_template"]
    split_bai_template = args["split_bam_template"] + ".bai"

    by_reads = "{region}" not in split_bam_template
    splitkeyword = "region" if "{region}" in split_bam_template else "reads"

    if by_reads:
        splitnames = [str(i) for i in range(config["num_splits_byreads"])]

        workflow.setobj(
            obj=mgd.OutputChunks('reads'),
            value=splitnames,
        )

    else:
        workflow.transform(
            name="get_regions",
            ctx={
                'mem': 2,
                'num_retry': 3,
                'mem_retry_increment': 2,
                'pool_id': config['pools']['standard'],
                'ncpus': 1
            },
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.TempOutputObj('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))

    workflow.subworkflow(name="split_normal",
                         func=split_bams.create_split_workflow,
                         args=(
                             mgd.InputFile(args['wgs_bam']),
                             mgd.InputFile(args['wgs_bam'] + ".bai"),
                             mgd.OutputFile("normal.split.bam",
                                            splitkeyword,
                                            template=split_bam_template,
                                            axes_origin=[]),
                             mgd.OutputFile("normal.split.bam.bai",
                                            splitkeyword,
                                            template=split_bai_template,
                                            axes_origin=[]),
                             pypeliner.managed.TempInputObj(splitkeyword),
                             config,
                         ),
                         kwargs={"by_reads": by_reads})

    regions = mgd.InputChunks(
        'reads') if by_reads else pypeliner.managed.TempInputObj('region')
    workflow.transform(name="get_files",
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(regions, split_bam_template, splitkeyword))

    metadata = {
        'split_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': args['wgs_bam'],
            'results': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(
                           mem=config['memory']['med'],
                           pool_id=config['pools']['standard'],
                       ),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
def merge_bams_workflow(workflow, args):
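    # Merge per-cell bams into per-region bams using the merged_bam_template,
    # then record the resolved outputs in info.yaml.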

    input_yaml = args["input_yaml"]
    output_template = args["merged_bam_template"]

    info_file = os.path.join(args["out_dir"], 'results', 'merge_bams',
                             "info.yaml")
    config = helpers.load_config(args)
    bam_files, bai_files = helpers.get_bams(input_yaml)
    cellids = helpers.get_samples(input_yaml)

    wgs_bam_template = output_template
    wgs_bai_template = wgs_bam_template + ".bai"

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('bam_markdups',
                                           'cell_id',
                                           fnames=bam_files,
                                           extensions=['.bai']),
                             mgd.OutputFile("merged_bam",
                                            "region",
                                            axes_origin=[],
                                            template=wgs_bam_template,
                                            extensions=['.bai']),
                             cellids,
                             config,
                             mgd.TempInputObj("region"),
                         ))

    workflow.transform(name="get_files",
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(pypeliner.managed.TempInputObj('region'),
                             wgs_bam_template, 'region'))

    inputs = {k: helpers.format_file_yaml(v) for k, v in bam_files.items()}

    metadata = {
        'merge_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': inputs,
            'results': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=2,
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
def germline_calling_workflow(args):
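    # Germline calling: samtools calls on the split normal, mappability, genotype
    # and snpeff annotation, plus per-cell allele counts, concatenated into germline.h5.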
    config = helpers.load_config(args)
    config = config['germline_calling']

    baseimage = config['docker']['single_cell_pipeline']

    basedocker = {'docker_image': config['docker']['single_cell_pipeline']}
    vcftoolsdocker = {'docker_image': config['docker']['vcftools']}
    samtoolsdocker = {'docker_image': config['docker']['samtools']}
    snpeffdocker = {'docker_image': config['docker']['snpeff']}

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'disk_retry_increment': 50,
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_bams = data['normal_bams']
    tumour_cells = data['tumour_cells']

    if not isinstance(normal_bams, dict):
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam',
                               'region',
                               template=normal_bams)
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam',
                               'region',
                               fnames=normal_bams)

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')

    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw',
                                         'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw',
                                            'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.subworkflow(name='samtools_germline',
                         func=germline.create_samtools_germline_workflow,
                         args=(
                             mgd.InputFile("normal_split.bam",
                                           "region",
                                           extensions=['.bai']),
                             config['ref_genome'],
                             mgd.OutputFile(samtools_germline_vcf,
                                            extensions=['.tbi']),
                             config,
                         ),
                         kwargs={
                             'vcftools_docker': vcftoolsdocker,
                             'samtools_docker': samtoolsdocker,
                         })

    workflow.subworkflow(
        name='annotate_mappability',
        func=
        "biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': basedocker,
            'chromosomes': config['chromosomes']
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func=
        "biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': basedocker,
            'vcftools_docker': vcftoolsdocker,
            'snpeff_docker': snpeffdocker,
        })

    workflow.subworkflow(
        name='read_counts',
        func=
        "single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            mgd.InputFile('tumour.bam',
                          'cell_id',
                          fnames=tumour_cells,
                          extensions=['.bai']),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
            config['memory'],
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    pyp.run(workflow)
def align_workflow(workflow, args):
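    # Alignment workflow: align per-cell fastqs (skipped when args['metrics_only']
    # is set), compute alignment metrics and plots, and write an info.yaml of
    # inputs and outputs.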

    config = helpers.load_config(args)

    sampleinfo = helpers.get_sample_info(args['input_yaml'])

    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    outdir = os.path.join(args["out_dir"], "results", "alignment")

    info_file = os.path.join(outdir, "info.yaml")

    alignment_metrics_h5 = os.path.join(outdir,
                                        '{}_alignment_metrics.h5'.format(lib))

    plots_dir = os.path.join(outdir, 'plots')
    plot_metrics_output = os.path.join(plots_dir,
                                       '{}_plot_metrics.pdf'.format(lib))

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    if not args["metrics_only"]:
        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        instrumentinfo = helpers.get_instrument_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=list(fastq1_files.keys()),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1',
                              'cell_id',
                              'lane',
                              fnames=fastq1_files,
                              axes_origin=[]),
                mgd.InputFile('fastq_2',
                              'cell_id',
                              'lane',
                              fnames=fastq2_files,
                              axes_origin=[]),
                mgd.OutputFile('bam_markdups',
                               'cell_id',
                               fnames=bam_files,
                               axes_origin=[]),
                mgd.OutputFile('bai_markdups',
                               'cell_id',
                               fnames=bai_files,
                               axes_origin=[]),
                config['ref_genome'],
                config,
                args,
                instrumentinfo,
                centerinfo,
                sampleinfo,
                cellids,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

    workflow.subworkflow(
        name='metrics_workflow',
        func=alignment_metrics.create_alignment_metrics_workflow,
        args=(
            mgd.InputFile('bam_markdups',
                          'cell_id',
                          fnames=bam_files,
                          axes_origin=[]),
            mgd.InputFile('bai_markdups',
                          'cell_id',
                          fnames=bai_files,
                          axes_origin=[]),
            mgd.OutputFile(alignment_metrics_h5),
            mgd.OutputFile(plot_metrics_output),
            config['ref_genome'],
            config,
            args,
            sampleinfo,
            cellids,
        ),
    )

    inputs = helpers.get_fastq_files(args["input_yaml"])
    outputs = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }

    metadata = {
        'alignment': {
            'name': 'alignment',
            'cell_batch_realign': args["realign"],
            'metrics_table': '/alignment/metrics',
            'gc_metrics_table': '/alignment/gc_metrics',
            'aligner': config["aligner"],
            'adapter': config["adapter"],
            'adapter2': config["adapter2"],
            'picardtools_wgsmetrics_params': config['picard_wgs_params'],
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': outputs,
            'input_datasets': inputs,
            'results': {
                'alignment_metrics':
                helpers.format_file_yaml(alignment_metrics_h5),
                'alignment_plots':
                helpers.format_file_yaml(plot_metrics_output),
            },
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                **ctx),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #11
def ltm_workflow(workflow, args):
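    # LTM workflow variant that additionally records results and input datasets
    # in an info.yaml.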

    config = helpers.load_config(args)

    hmmcopy, timepoints = ltmutils.read_input_file(args['input_csv'])

    cn_matrix = os.path.join(args['out_dir'], 'cn_matrix.csv')
    output_gml = os.path.join(args['out_dir'], 'tree.gml')
    output_rooted_gml = os.path.join(args['out_dir'], 'rooted_tree.gml')

    # Outputs required for visualization with cellscape
    cnv_annots_csv = os.path.join(args['out_dir'], 'cnv_annots.csv')
    cnv_tree_edges_csv = os.path.join(args['out_dir'], 'cnv_tree_edges.csv')
    cnv_data_csv = os.path.join(args['out_dir'], 'cnv_data.csv')
    output_rmd = os.path.join(args['out_dir'], 'cellscape.Rmd')
    root_id_file = os.path.join(args['out_dir'], 'root_id.txt')

    workflow.setobj(
        obj=mgd.OutputChunks('timepoint'),
        value=timepoints,
    )

    workflow.subworkflow(
        name='ltm_scale',
        func=ltm.create_ltm_workflow,
        args=(
            mgd.InputFile('hmmcopy.h5', 'timepoint', fnames=hmmcopy),
            mgd.OutputFile(cn_matrix),
            mgd.OutputFile(output_gml),
            mgd.OutputFile(output_rooted_gml),
            mgd.OutputFile(cnv_annots_csv),
            mgd.OutputFile(cnv_tree_edges_csv),
            mgd.OutputFile(cnv_data_csv),
            mgd.OutputFile(output_rmd),
            config,
            args['root_id'],
            mgd.OutputFile(root_id_file),
            args['number_of_jobs'],
            args['ploidy'],
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'ltm', "info.yaml")

    results = {
        'ltm_cn_matrix': helpers.format_file_yaml(cn_matrix),
        'ltm_gml': helpers.format_file_yaml(output_gml),
        'ltm_rooted_gml': helpers.format_file_yaml(output_rooted_gml),
        'ltm_cnv_annots_csv': helpers.format_file_yaml(cnv_annots_csv),
        'ltm_cnv_tree_edges_csv': helpers.format_file_yaml(cnv_tree_edges_csv),
        'ltm_cnv_data_csv': helpers.format_file_yaml(cnv_data_csv),
        'ltm_output_rmd': helpers.format_file_yaml(output_rmd)
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in hmmcopy.items()
    }

    metadata = {
        'LTM': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'cell_filters': config["good_cells"],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(
                           mem=config['memory']['med'],
                           pool_id=config['pools']['standard'],
                       ),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
Example #12
def qc_workflow(args):
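    # QC workflow: optionally run alignment and hmmcopy, then the annotation
    # subworkflow, which requires both alignment and hmmcopy outputs.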
    config = helpers.load_config(args)

    sampleinfo = helpers.get_sample_info(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, _ = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    workflow = pypeliner.workflow.Workflow()

    annotation_only = args['annotation_only']

    alignment_dir = args["alignment_output"]
    hmmcopy_dir = args["hmmcopy_output"]
    annotation_dir = args["annotation_output"]

    if alignment_dir and not annotation_only:
        alignment_files = get_output_files(alignment_dir, 'alignment', lib)

        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        triminfo = helpers.get_trim_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=list(fastq1_files.keys()),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            ctx={
                'docker_image':
                config['alignment']['docker']['single_cell_pipeline']
            },
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1',
                              'cell_id',
                              'lane',
                              fnames=fastq1_files,
                              axes_origin=[]),
                mgd.InputFile('fastq_2',
                              'cell_id',
                              'lane',
                              fnames=fastq2_files,
                              axes_origin=[]),
                mgd.OutputFile('bam_markdups',
                               'cell_id',
                               fnames=bam_files,
                               axes_origin=[],
                               extensions=['.bai']),
                mgd.OutputFile(alignment_files['alignment_metrics_csv']),
                mgd.OutputFile(alignment_files['gc_metrics_csv']),
                mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
                mgd.OutputFile(alignment_files['plot_metrics_output']),
                config['alignment']['ref_genome'],
                config['alignment'],
                triminfo,
                centerinfo,
                sampleinfo,
                cellids,
                mgd.OutputFile(alignment_files['alignment_metrics_tar']),
                lib,
            ),
            kwargs={'realign': args['realign']})

    if hmmcopy_dir and not annotation_only:
        hmmcopy_files = get_output_files(hmmcopy_dir, 'hmmcopy', lib)

        if not alignment_dir:
            workflow.setobj(
                obj=mgd.OutputChunks('cell_id'),
                value=list(bam_files.keys()),
            )

        workflow.subworkflow(
            name='hmmcopy_workflow',
            ctx={
                'docker_image':
                config['hmmcopy']['docker']['single_cell_pipeline']
            },
            func=hmmcopy.create_hmmcopy_workflow,
            args=(mgd.InputFile('bam_markdups',
                                'cell_id',
                                fnames=bam_files,
                                extensions=['.bai']),
                  mgd.OutputFile(hmmcopy_files['reads_csvs']),
                  mgd.OutputFile(hmmcopy_files['segs_csvs']),
                  mgd.OutputFile(hmmcopy_files['metrics_csvs']),
                  mgd.OutputFile(hmmcopy_files['params_csvs']),
                  mgd.OutputFile(hmmcopy_files['igv_csvs']),
                  mgd.OutputFile(hmmcopy_files['segs_pdf']),
                  mgd.OutputFile(hmmcopy_files['bias_pdf']),
                  mgd.OutputFile(hmmcopy_files['heatmap_pdf']),
                  mgd.OutputFile(hmmcopy_files['metrics_pdf']),
                  mgd.OutputFile(hmmcopy_files['kernel_density_pdf']),
                  mgd.OutputFile(hmmcopy_files['hmmcopy_data_tar']), cellids,
                  config['hmmcopy'], sampleinfo),
        )

    if annotation_dir:
        annotation_files = get_output_files(annotation_dir, 'annotation', lib)
        if not hmmcopy_dir or not alignment_dir:
            raise Exception(
                '--hmmcopy_output and --alignment_output are required to run annotation'
            )

        alignment_files = get_output_files(alignment_dir, 'alignment', lib)
        hmmcopy_files = get_output_files(hmmcopy_dir, 'hmmcopy', lib)

        workflow.subworkflow(
            name='annotation_workflow',
            ctx={
                'docker_image':
                config['annotation']['docker']['single_cell_pipeline']
            },
            func=qc_annotation.create_qc_annotation_workflow,
            args=(
                mgd.InputFile(hmmcopy_files['metrics_csvs']),
                mgd.InputFile(hmmcopy_files['reads_csvs']),
                mgd.InputFile(alignment_files['alignment_metrics_csv']),
                mgd.InputFile(alignment_files['gc_metrics_csv']),
                mgd.InputFile(hmmcopy_files['segs_pdf']),
                mgd.OutputFile(annotation_files['merged_metrics_csvs']),
                mgd.OutputFile(annotation_files['qc_report']),
                mgd.OutputFile(annotation_files['corrupt_tree_newick']),
                mgd.OutputFile(annotation_files['consensus_tree_newick']),
                mgd.OutputFile(annotation_files['phylo_csv']),
                mgd.OutputFile(annotation_files['loci_rank_trees']),
                mgd.OutputFile(annotation_files['filtered_data']),
                mgd.OutputFile(annotation_files['corrupt_tree_pdf']),
                mgd.OutputFile(annotation_files['segs_pass']),
                mgd.OutputFile(annotation_files['segs_fail']),
                mgd.OutputFile(annotation_files['corrupt_heatmap_pdf']),
                mgd.OutputFile(annotation_files['heatmap_filt_pdf']),
                config['annotation'],
                lib,
            ),
            kwargs={'no_corrupt_tree': args['no_corrupt_tree']})

    return workflow
Example #13
def breakpoint_calling_workflow(workflow, args):
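    # Breakpoint calling with destruct on per-cell tumour bams against a matched
    # normal; metadata is written to info.yaml.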

    config = helpers.load_config(args)

    normal_bam_file = args['matched_normal']
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results',
                                'breakpoint_calling')
    raw_data_directory = os.path.join(varcalls_dir, 'raw')
    breakpoints_filename = os.path.join(varcalls_dir, 'breakpoints.h5')
    ref_data_directory = '/refdata'

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.subworkflow(
        name='destruct',
        func=
        "biowrappers.components.breakpoint_calling.destruct.destruct_pipeline",
        args=(
            mgd.InputFile(normal_bam_file),
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            config.get('destruct', {}),
            ref_data_directory,
            mgd.OutputFile(breakpoints_filename),
            raw_data_directory,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'breakpoint_calling',
                             "info.yaml")

    results = {
        'destruct_data': helpers.format_file_yaml(breakpoints_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }
    input_datasets = {'normal': normal_bam_file, 'tumour': input_datasets}

    metadata = {
        'breakpoint_calling': {
            'ref_data': ref_data_directory,
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                mem_retry_increment=2,
                                ncpus=1),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
def merge_bams_workflow(args):
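    # Merge per-cell bams into region-level bams; region chunks come either from
    # the wgs bam dict keys or are derived from the reference genome.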
    config = helpers.load_config(args)
    config = config['merge_bams']

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_wgs = data['tumour_wgs']
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    normal_cells = data['normal_cells']

    bam_files = tumour_cells if tumour_cells else normal_cells
    wgs_bams = tumour_wgs if tumour_cells else normal_wgs

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    if isinstance(wgs_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(wgs_bams.keys()),
        )
        workflow.set_filenames("merged.bam", "region", fnames=wgs_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        workflow.set_filenames('merged.bam', 'region', template=wgs_bams)

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('bam_markdups',
                                           'cell_id',
                                           fnames=bam_files,
                                           extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai']),
                             mgd.TempInputObj("region"),
                             config,
                         ))

    workflow.transform(name="get_files",
                       ctx={'mem': config['memory']['med']},
                       func='single_cell.utils.helpers.resolve_template',
                       ret=pypeliner.managed.TempOutputObj('outputs'),
                       args=(pypeliner.managed.TempInputObj('region'),
                             wgs_bams, 'region'))

    return workflow
Example #15
def breakpoint_calling_workflow(args):
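    # Breakpoint calling with destruct and/or lumpy (both by default) on tumour
    # cells against either a normal wgs bam or per-cell normal bams.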
    run_destruct = args['destruct']
    run_lumpy = args['lumpy']
    if not any((run_destruct, run_lumpy)):
        run_destruct = True
        run_lumpy = True

    config = helpers.load_config(args)
    config = config['breakpoint_calling']

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_cells = data['tumour_cells']
    tumour_cells_id = data['tumour_cells_id']

    normal_bams = data['normal_wgs'] if data['normal_wgs'] else data[
        'normal_cells']
    normal_id = data['normal_wgs_id'] if data['normal_wgs_id'] else data[
        'normal_cells_id']

    calls_dir = os.path.join(args['out_dir'], 'results', 'breakpoint_calling')
    raw_data_directory = os.path.join(calls_dir, 'raw')
    breakpoints_filename = os.path.join(calls_dir, 'breakpoints.h5')
    breakpoints_lib_filename = os.path.join(calls_dir, 'breakpoints_lib.h5')
    cell_counts_filename = os.path.join(calls_dir, 'cell_counts.h5')

    ref_data_directory = config['ref_data_directory']

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config['docker']['single_cell_pipeline']})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    if isinstance(normal_bams, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_cells.bam',
                               'normal_cell_id',
                               fnames=normal_bams)
        normal_bam = mgd.InputFile('normal_cells.bam',
                                   'normal_cell_id',
                                   extensions=['.bai'])
    else:
        normal_bam = mgd.InputFile(normal_bams, extensions=['.bai'])

    if run_destruct:
        workflow.subworkflow(
            name='destruct',
            ctx={'docker_image': config['docker']['destruct']},
            func=
            "single_cell.workflows.destruct_singlecell.create_destruct_workflow",
            args=(
                normal_bam,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells),
                config.get('destruct', {}),
                ref_data_directory,
                mgd.OutputFile(breakpoints_filename),
                mgd.OutputFile(breakpoints_lib_filename),
                mgd.OutputFile(cell_counts_filename),
                raw_data_directory,
            ),
        )

    if run_lumpy:
        varcalls_dir = os.path.join(args['out_dir'], 'results',
                                    'breakpoint_calling')
        breakpoints_bed = os.path.join(varcalls_dir, 'lumpy_breakpoints.bed')
        breakpoints_csv = os.path.join(varcalls_dir,
                                       'lumpy_breakpoints.csv.gz')
        breakpoints_evidence_csv = os.path.join(
            varcalls_dir, 'lumpy_breakpoints_evidence.csv.gz')

        workflow.subworkflow(
            name='lumpy',
            func="single_cell.workflows.lumpy.create_lumpy_workflow",
            args=(
                config,
                mgd.InputFile('tumour.bam',
                              'tumour_cell_id',
                              fnames=tumour_cells,
                              extensions=['.bai']),
                normal_bam,
                mgd.OutputFile(breakpoints_bed),
                mgd.OutputFile(breakpoints_csv),
                mgd.OutputFile(breakpoints_evidence_csv),
            ),
            kwargs={
                'tumour_id': tumour_cells_id,
                'normal_id': normal_id
            })

    return workflow
Example #16
def aneufinder_workflow(workflow, args):
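    # Aneufinder copy-number calling on per-cell bams, producing a results h5,
    # a reads plot and an info.yaml.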

    config = helpers.load_config(args)
    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, _ = helpers.get_bams(args['input_yaml'])

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    info_file = os.path.join(args["out_dir"], 'results', 'aneufinder',
                             "info.yaml")

    output = os.path.join(args['out_dir'], 'results', "aneufinder")

    aneufinder_pdf_file = os.path.join(
        output, 'plots', '{}_reads.pdf'.format(args['library_id']))

    helpers.makedirs(output)

    results_filename = os.path.join(output, '{}_results.h5'.format(args['library_id']))
    workflow.subworkflow(
        name='aneufinder_workflow',
        func=aneufinder.create_aneufinder_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
            cellids,
            config,
            output,
            mgd.OutputFile(results_filename),
            mgd.OutputFile(aneufinder_pdf_file),
            args['library_id'],
        ),
    )

    results = {
        'aneufinder_plot': helpers.format_file_yaml(aneufinder_pdf_file),
        'aneufinder_data': helpers.format_file_yaml(results_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }

    metadata = {
        'aneufinder': {
            'reads_table': '/aneufinder/reads',
            'segments_table': '/aneufinder/segments/',
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def copy_number_calling_workflow(workflow, args):
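    # Copy-number calling: extract seqdata from tumour and normal cells, run the
    # titan workflow, and record metadata in info.yaml.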

    config = helpers.load_config(args)

    ctx = {'mem_retry_increment': 2, 'ncpus': 1,
           'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard']}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    tumour_bam_files, tumour_bai_files = helpers.get_bams(args['tumour_yaml'])

    normal_bam_files, normal_bai_files = helpers.get_bams(args['normal_yaml'])

    tumour_cellids = helpers.get_samples(args['tumour_yaml'])

    normal_cellids = helpers.get_samples(args['normal_yaml'])

    if set(tumour_bam_files.keys()) != set(tumour_cellids):
        raise ValueError()

    if set(normal_bam_files.keys()) != set(normal_cellids):
        raise ValueError()

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config['titan_params'].get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cellids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cellids,
    )

    workflow.transform(
        name="get_snp_positions_filename",
        ctx=ctx,
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
              remixt_config,
              config['titan_params']['ref_data_dir'],
              'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
              remixt_config,
              'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
              remixt_config,
              'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
              remixt_config,
              'bam_check_proper_pair'
        )
    )


    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_bam_files),
            mgd.InputFile(
                'bam_markdups_index',
                'tumour_cell_id',
                fnames=tumour_bai_files),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('normal_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'normal_cell_id',
                fnames=normal_bam_files),
            mgd.InputFile(
                'bam_markdups_index',
                'normal_cell_id',
                fnames=normal_bai_files),
            mgd.TempOutputFile("normal.h5", "normal_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "normal_cell_id"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            out_file,
            config,
            args,
            tumour_cellids,
            normal_cellids,
            cloneid
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'copynumber_calling',
                             "info.yaml")

    results = {
        'copynumber_data': helpers.format_file_yaml(out_file),
    }

    tumours = {k: helpers.format_file_yaml(v) for k, v in tumour_bam_files.items()}
    normals = {k: helpers.format_file_yaml(v) for k, v in normal_bam_files.items()}
    input_datasets = {'tumour': tumours, 'normal': normals}

    metadata = {
        'copynumber_calling': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2, ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def variant_calling_workflow(args):
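    # Variant calling on matched tumour/normal split bams, provided either as
    # per-region dicts or as {region} templates.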
    config = helpers.load_config(args)
    config = config['variant_calling']

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    tumour_bams = data['tumour_wgs']
    normal_bams = data['normal_wgs']
    tumour_cells = data['tumour_cells']

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')

    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    baseimage = config['docker']['single_cell_pipeline']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'docker_image': baseimage
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(normal_bams, dict) and isinstance(tumour_bams, dict):
        assert set(normal_bams.keys()) == set(tumour_bams.keys()), \
            'keys for tumour and normal bams should be the same'
        workflow.setobj(
            obj=mgd.OutputChunks('region'),
            value=list(normal_bams.keys()),
        )
        workflow.set_filenames('normal_split.bam',
                               'region',
                               fnames=normal_bams)
        workflow.set_filenames('tumour_split.bam',
                               'region',
                               fnames=tumour_bams)
    else:
        workflow.transform(
            name="get_regions",
            func="single_cell.utils.pysamutils.get_regions_from_reference",
            ret=pypeliner.managed.OutputChunks('region'),
            args=(
                config["ref_genome"],
                config["split_size"],
                config["chromosomes"],
            ))
        assert '{region}' in normal_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('normal_split.bam',
                               'region',
                               template=normal_bams)
        assert '{region}' in tumour_bams, 'only supports a list of files or a template on regions'
        workflow.set_filenames('tumour_split.bam',
                               'region',
                               template=tumour_bams)

    workflow.subworkflow(
        func=create_variant_calling_workflow,
        name='create_varcall',
        args=(
            tumour_cells,
            mgd.InputFile('tumour_split.bam', 'region', extensions=['.bai']),
            mgd.InputFile('normal_split.bam', 'region', extensions=['.bai']),
            mgd.OutputFile(museq_vcf),
            mgd.OutputFile(strelka_snv_vcf),
            mgd.OutputFile(strelka_indel_vcf),
            mgd.OutputFile(snv_h5),
            mgd.OutputFile(meta_yaml),
            config,
            raw_data_dir,
        ),
    )

    return workflow
Example #19
def copy_number_calling_workflow(args):
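    # Copy-number calling with per-cell tumour bams and a {region}-templated
    # normal wgs bam: extract seqdata, then run the titan workflow.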

    config = helpers.load_config(args)
    config = config['copy_number_calling']

    pyp = pypeliner.app.Pypeline(config=args)

    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1,
           'docker_image': config['docker']['single_cell_pipeline']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    data = helpers.load_pseudowgs_input(args['input_yaml'])
    normal_wgs = data['normal_wgs']
    tumour_cells = data['tumour_cells']
    assert '{region}' in normal_wgs


    copynumber_dir = os.path.join(args["out_dir"], "copynumber")

    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config.get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=list(tumour_cells.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=config['memory']['low']),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=mgd.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.transform(
        name="get_snp_positions_filename",
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
              remixt_config,
              config['ref_data_dir'],
              'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
              remixt_config,
              'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
              remixt_config,
              'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
              remixt_config,
              'bam_check_proper_pair'
        )
    )


    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'tumour_cell_id',
                fnames=tumour_cells,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config.get('extract_seqdata', {}),
            config['ref_data_dir'],
            config
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('region',),
        ctx={'disk': 200},
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile(
                'bam_markdups',
                'region',
                template=normal_wgs,
                extensions=['.bai']
            ),
            mgd.TempOutputFile("normal.h5", "region"),
            remixt_config,
            config['ref_data_dir'],
            config,
        )
    )

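    # TITAN consumes the normal and tumour seqdata and writes the copy number
    # results file for the requested clone id.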
    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "region"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            mgd.OutputFile(out_file),
            config,
            args,
            list(tumour_cells.keys()),
            mgd.InputChunks('region'),
            cloneid
        ),
    )

    pyp.run(workflow)
Exemple #20
def germline_calling_workflow(workflow, args):
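    # Overview (based on the steps below): call germline SNVs on the
    # region-split normal bam with samtools, annotate mappability, normal
    # genotype and snpEff effects, count alleles per tumour cell at the
    # germline positions, and concatenate everything into germline.h5.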

    config = helpers.load_config(args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
    }
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    sampleids = helpers.get_samples(args['input_yaml'])

    normal_bam_template = args["input_template"]
    normal_bai_template = args["input_template"] + ".bai"

    if "{reads}" in normal_bam_template:
        raise ValueError(
            "input template for germline calling only supports region-based splits"
        )

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'germline_calling')

    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw',
                                         'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw',
                                            'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        ctx=ctx,
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

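    # Per-region samtools germline calling on the normal bam, producing a
    # single tabix-indexed vcf.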
    workflow.subworkflow(name='samtools_germline',
                         func=germline.create_samtools_germline_workflow,
                         args=(
                             mgd.InputFile("normal.split.bam",
                                           "region",
                                           template=normal_bam_template),
                             mgd.InputFile("normal.split.bam.bai",
                                           "region",
                                           template=normal_bai_template),
                             config['ref_genome'],
                             mgd.OutputFile(samtools_germline_vcf,
                                            extensions=['.tbi']),
                             config,
                         ),
                         kwargs={
                             'chromosomes':
                             config["chromosomes"],
                             'base_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'single_cell_pipeline'),
                             'vcftools_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'vcftools'),
                             'samtools_docker':
                             helpers.get_container_ctx(config['containers'],
                                                       'samtools'),
                         })

    workflow.subworkflow(
        name='annotate_mappability',
        func=
        "biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        })

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        ctx=ctx,
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func=
        "biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output':
            False,
            'base_docker':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline'),
            'vcftools_docker':
            helpers.get_container_ctx(config['containers'], 'vcftools'),
            'snpeff_docker':
            helpers.get_container_ctx(config['containers'], 'snpeff'),
        })

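    # Count alleles in every tumour cell at the germline variant positions;
    # counts land in the /germline_allele_counts table of the counts HDF5.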
    workflow.subworkflow(
        name='read_counts',
        func=
        "single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            config,
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile('tumour.bam.bai', 'cell_id', fnames=bai_files),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
        ),
        kwargs={
            'table_name':
            '/germline_allele_counts',
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        },
    )

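    # Concatenate the allele counts, mappability and genotype tables into the
    # final germline.h5 results file.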
    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        ctx=ctx,
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        })

    info_file = os.path.join(args["out_dir"], 'results', 'germline_calling',
                             "info.yaml")

    results = {
        'germline_data': helpers.format_file_yaml(germline_h5_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v)
        for k, v in bam_files.items()
    }

    metadata = {
        'germline_calling': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                mem_retry_increment=2,
                                ncpus=1),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow
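# A minimal driver sketch for this entry point (hypothetical `args` dict; it
# mirrors the Pypeline/run pattern used in copy_number_calling_workflow above):
#     pyp = pypeliner.app.Pypeline(config=args)
#     workflow = pypeliner.workflow.Workflow()
#     workflow = germline_calling_workflow(workflow, args)
#     pyp.run(workflow)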
Exemple #21
def infer_haps_workflow(workflow, args):
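    # Overview (based on the steps below): extract remixt seqdata (per cell
    # from a bam yaml, or from a single merged bam), infer SNP genotypes per
    # chromosome, phase haplotypes, then write haplotype and per-segment
    # allele count tables.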

    config = helpers.load_config(args)
    remixt_config = config['titan_params'].get('extract_seqdata', {})

    singlecellimage = config['docker']['images']['single_cell_pipeline']
    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'image': singlecellimage['image'],
        'dockerize': config['docker']['dockerize'],
        'mounts': config['docker']['mounts'],
        'username': singlecellimage['username'],
        'password': singlecellimage['password'],
        'server': singlecellimage['server'],
    }

    haps_dir = os.path.join(args["out_dir"], "infer_haps")

    haplotypes_filename = os.path.join(haps_dir, "results", "haplotypes.tsv")
    allele_counts_filename = os.path.join(haps_dir, "results",
                                          "allele_counts.tsv")

    # remixt parameters come from the extract_seqdata section of the titan
    # config (remixt_config) and the titan reference data directory
    ref_data_dir = config['titan_params']['ref_data_dir']

    snp_positions_filename = remixt.config.get_filename(
        remixt_config, ref_data_dir, 'snp_positions')
    bam_max_fragment_length = remixt.config.get_param(
        remixt_config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(
        remixt_config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(
        remixt_config, 'bam_check_proper_pair')

    workflow.setobj(obj=mgd.OutputChunks('chromosome'),
                    value=config['titan_params']['chromosomes'])

    if args['input_yaml']:
        bam_files, bai_files = helpers.get_bams(args['input_yaml'])
        cellids = helpers.get_samples(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id', ),
            func=extract_seqdata.create_extract_seqdata_workflow,
            args=(
                mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files),
                mgd.InputFile('bam_markdups_index',
                              'cell_id',
                              fnames=bai_files),
                mgd.TempOutputFile("tumour.h5", "cell_id"),
                config,
                config['titan_params'].get('extract_seqdata', {}),
                config['titan_params']['ref_data_dir'],
                snp_positions_filename,
                bam_max_fragment_length,
                bam_max_soft_clipped,
                bam_check_proper_pair,
            ))

        workflow.transform(
            name='merge_all_seqdata',
            ctx=dict(mem=config["memory"]['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
                  mgd.TempInputFile("tumour.h5", "cell_id"),
                  config["titan_params"]["chromosomes"]),
        )
    else:
        workflow.subworkflow(
            name="extract_seqdata",
            func=extract_seqdata.create_extract_seqdata_workflow,
            args=(
                mgd.InputFile(args['input_bam']),
                mgd.InputFile(args['input_bam'] + '.bai'),
                mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
                config,
                config['titan_params'].get('extract_seqdata', {}),
                config['titan_params']['ref_data_dir'],
                snp_positions_filename,
                bam_max_fragment_length,
                bam_max_soft_clipped,
                bam_check_proper_pair,
            ),
            kwargs={'multiprocess': True})

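    # SNP genotypes are inferred per chromosome: from the normal seqdata when
    # a normal is given, otherwise estimated from the tumour seqdata.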
    if args["normal"]:
        workflow.transform(
            name='infer_snp_genotype',
            axes=('chromosome', ),
            ctx={'mem': 16},
            func='remixt.analysis.haplotype.infer_snp_genotype_from_normal',
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.TempInputFile("seqdata_normal_all_cells_merged.h5"),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )
    else:
        workflow.transform(
            name='infer_snp_genotype',
            axes=('chromosome', ),
            ctx={'mem': 16},
            func='remixt.analysis.haplotype.infer_snp_genotype_from_tumour',
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                {
                    'sample':
                    mgd.TempInputFile("seqdata_normal_all_cells_merged.h5")
                },
                mgd.InputInstance('chromosome'),
                config,
            ),
        )

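    # Phase haplotypes per chromosome, then merge the per-chromosome tables
    # into haplotypes.tsv.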
    workflow.transform(name='infer_haps',
                       axes=('chromosome', ),
                       ctx={'mem': 16},
                       func='remixt.analysis.haplotype.infer_haps',
                       args=(
                           mgd.TempOutputFile('haps.tsv', 'chromosome'),
                           mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
                           mgd.InputInstance('chromosome'),
                           mgd.TempSpace('haplotyping', 'chromosome'),
                           config,
                           config['titan_params']['ref_data_dir'],
                       ))

    workflow.transform(name='merge_haps',
                       ctx={'mem': 16},
                       func='remixt.utils.merge_tables',
                       args=(
                           mgd.OutputFile(haplotypes_filename),
                           mgd.TempInputFile('haps.tsv', 'chromosome'),
                       ))

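    # Segment the genome and count reads per haplotype allele within each
    # segment.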
    workflow.transform(
        name='create_segments',
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            config,
            config['titan_params']['ref_data_dir'],
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        ctx={'mem': 20},
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.OutputFile(allele_counts_filename),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('tumour.h5', 'cell_id'),
            mgd.InputFile(haplotypes_filename),
            config,
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'infer_haps',
                             "info.yaml")

    results = {
        'infer_haps_allele_counts':
        helpers.format_file_yaml(allele_counts_filename),
        'infer_haps_data':
        helpers.format_file_yaml(haplotypes_filename),
    }

    if args['input_yaml']:
        input_datasets = {
            k: helpers.format_file_yaml(v)
            for k, v in bam_files.items()
        }
    else:
        input_datasets = helpers.format_file_yaml(args['input_bam'])

    metadata = {
        'infer_haps': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(name='generate_meta_yaml',
                       ctx=dict(mem=config['memory']['med'],
                                pool_id=config['pools']['standard'],
                                mem_retry_increment=2,
                                ncpus=1),
                       func="single_cell.utils.helpers.write_to_yaml",
                       args=(mgd.OutputFile(info_file), metadata))

    return workflow