Example #1
def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results,
        remixt_raw_dir, titan_segments, titan_params, titan_markers
):
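    """Build a workflow that runs TITAN and ReMixT copy-number calling.

    The per-sample input and output mappings are first subset to ``samples``;
    both callers then run once per entry on the 'sample_id' axis.
    """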
    breakpoints = dict([(sampid, breakpoints[sampid])
                        for sampid in samples])
    remixt_results = dict([(sampid, remixt_results[sampid])
                           for sampid in samples])
    titan_segments = dict([(sampid, titan_segments[sampid])
                           for sampid in samples])
    titan_params = dict([(sampid, titan_params[sampid])
                         for sampid in samples])
    titan_markers = dict([(sampid, titan_markers[sampid])
                          for sampid in samples])

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    return workflow
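
Unlike the driver-style examples below (#3, #5, #9, #13), call_copynumber only builds and returns the workflow. A minimal driver sketch, mirroring those examples (the argument values here are placeholders that would be prepared from a config file and input YAML by the surrounding pipeline code):

import pypeliner

# Hypothetical driver: `args`, `samples`, `config`, and the per-sample path
# mappings are assumed to be prepared by the surrounding pipeline code.
pyp = pypeliner.app.Pypeline(config=args)
workflow = call_copynumber(
    samples, config, tumours, normals, breakpoints,
    titan_raw_dir, remixt_results,
    remixt_raw_dir, titan_segments, titan_params, titan_markers)
pyp.run(workflow)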
Example #2
def conversion_workflow(args):
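    """Convert per-cell live/dead and CFSE TIFF images to PNG and write the
    run's metadata.yaml and input.yaml alongside the converted images."""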
    docker = docker_containers()

    converted_dir = args["out_dir"]

    cell_ids, cfse_images, livedead_images = get_cell_images(
        args['input_yaml'])

    converted_image_template = os.path.join(converted_dir, '{cell_id}.png')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': docker['microscope_image_converter']})

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='convert',
        func='microscope_image_converter.tasks.convert',
        axes=('cell_id', ),
        args=(
            mgd.InputFile('livedead.tif', 'cell_id', fnames=livedead_images),
            mgd.InputFile('cfse.tif', 'cell_id', fnames=cfse_images),
            mgd.OutputFile('converted.png',
                           'cell_id',
                           template=converted_image_template,
                           axes_origin=[]),
        ),
    )

    converted_meta = os.path.join(converted_dir, 'metadata.yaml')
    input_yaml_blob = os.path.join(converted_dir, 'input.yaml')
    workflow.transform(
        name='generate_meta_files_results',
        func='microscope_image_converter.tasks.generate_and_upload_metadata',
        args=(sys.argv[0:], converted_dir,
              mgd.Template('converted.png',
                           'cell_id',
                           template=converted_image_template),
              mgd.OutputFile(converted_meta)),
        kwargs={
            'input_yaml_data': load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'cell_ids': cell_ids,
                'type': 'dlp_microscope_merged',
            }
        })

    return workflow
Example #3
def alignment_workflow(args):
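    """Run pre-alignment QC per sample and lane, align FASTQs into
    per-sample bams, and collect post-alignment metrics; executes the
    pipeline directly."""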
    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']

    outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics_output = os.path.join(outdir, '{sample_id}',
                                  '{sample_id}_metrics.csv.gz')
    prealignment_tar = os.path.join(outdir, '{sample_id}',
                                    '{sample_id}_fastqc.tar.gz')
    postalignment_tar = os.path.join(outdir, '{sample_id}',
                                     '{sample_id}_metrics.tar.gz')

    samples = list(inputs.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None)

    sample_info = helpers.get_sample_info(inputs)

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('alignment')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(name="prealign",
                         func=pre_alignment.pre_alignment,
                         axes=('sample_id', 'lane_id'),
                         args=(
                             mgd.InputFile('input.r1.fastq.gz',
                                           'sample_id',
                                           'lane_id',
                                           fnames=fastqs_r1),
                             mgd.InputFile('input.r2.fastq.gz',
                                           'sample_id',
                                           'lane_id',
                                           fnames=fastqs_r2),
                             mgd.Template('prealignment.tar',
                                          'sample_id',
                                          template=prealignment_tar),
                         ))

    workflow.subworkflow(
        name="align",
        func=alignment.alignment,
        args=(
            mgd.InputFile('input.r1.fastq.gz',
                          'sample_id',
                          'lane_id',
                          fnames=fastqs_r1,
                          axes_origin=[]),
            mgd.InputFile('input.r2.fastq.gz',
                          'sample_id',
                          'lane_id',
                          fnames=fastqs_r2,
                          axes_origin=[]),
            mgd.OutputFile('output.bam',
                           'sample_id',
                           template=outputs,
                           axes_origin=[]),
            args['refdir'],
            sample_info,
        ),
    )

    workflow.subworkflow(
        name="postalign",
        func=post_alignment.post_alignment,
        axes=('sample_id', ),
        args=(
            mgd.InputFile('output.bam', 'sample_id', template=outputs),
            mgd.OutputFile('metrics.csv.gz',
                           'sample_id',
                           template=metrics_output,
                           extensions=['.yaml']),
            mgd.OutputFile('metrics.tar.gz',
                           'sample_id',
                           template=postalignment_tar),
            mgd.InputInstance('sample_id'),
            args['refdir'],
        ),
    )

    pyp.run(workflow)
Example #4
def destruct_multi_sample_workflow(
    normal_bam,
    tumour_bam_files,
    destruct_config,
    config,
    destruct_ref_data_dir,
    breakpoints_csv,
    breakpoints_library_csv,
    cell_counts_csv,
    raw_data_dir,
    normal_sample_id='normal',
):
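    """Run destruct breakpoint calling for multiple tumour sample/library
    pairs against a single normal, preprocessing each bam into FASTQs first.
    """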
    ctx = {'docker_image': config['docker']['destruct']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_bam_files.keys()),
    )

    keys = [(sample_id, library_id)
            for (sample_id, library_id, _) in list(tumour_bam_files.keys())]
    keys = sorted(set(keys))

    breakpoints_csv = dict([(key, breakpoints_csv(*key)) for key in keys])
    breakpoints_library_csv = dict([(key, breakpoints_library_csv(*key))
                                    for key in keys])
    cell_counts_csv = dict([(key, cell_counts_csv(*key)) for key in keys])

    workflow.set_filenames('tumour_cells.bam',
                           'sample_id',
                           'library_id',
                           'cell_id',
                           fnames=tumour_bam_files)
    workflow.set_filenames('breakpoints.csv',
                           'sample_id',
                           'library_id',
                           fnames=breakpoints_csv)
    workflow.set_filenames('breakpoints_library.csv',
                           'sample_id',
                           'library_id',
                           fnames=breakpoints_library_csv)
    workflow.set_filenames('cell_counts.csv',
                           'sample_id',
                           'library_id',
                           fnames=cell_counts_csv)

    workflow.subworkflow(
        name='normal_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        args=(
            normal_bam,
            mgd.TempOutputFile('normal_stats'),
            mgd.TempOutputFile('normal_reads_1.fastq.gz'),
            mgd.TempOutputFile('normal_reads_2.fastq.gz'),
            mgd.TempOutputFile('normal_sample_1.fastq.gz'),
            mgd.TempOutputFile('normal_sample_2.fastq.gz'),
            destruct_ref_data_dir,
            destruct_config,
        ),
    )

    workflow.subworkflow(
        name='tumour_preprocess_destruct',
        func='single_cell.workflows.destruct_singlecell.destruct_preprocess_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai']),
            mgd.TempOutputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempOutputFile('tumour_reads_1.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_reads_2.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_sample_1.fastq.gz', 'sample_id',
                               'library_id'),
            mgd.TempOutputFile('tumour_sample_2.fastq.gz', 'sample_id',
                               'library_id'),
            destruct_ref_data_dir,
            destruct_config,
        ),
        kwargs={'tag': True})

    workflow.subworkflow(
        name='run_destruct',
        func='single_cell.workflows.destruct_singlecell.create_destruct_workflow',
        axes=('sample_id', 'library_id'),
        args=(
            mgd.TempInputFile('normal_stats'),
            mgd.TempInputFile('normal_reads_1.fastq.gz'),
            mgd.TempInputFile('normal_reads_2.fastq.gz'),
            mgd.TempInputFile('normal_sample_1.fastq.gz'),
            mgd.TempInputFile('normal_sample_2.fastq.gz'),
            mgd.TempInputFile('tumour_stats', 'sample_id', 'library_id'),
            mgd.TempInputFile('tumour_reads_1.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_reads_2.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_sample_1.fastq.gz', 'sample_id',
                              'library_id'),
            mgd.TempInputFile('tumour_sample_2.fastq.gz', 'sample_id',
                              'library_id'),
            destruct_config,
            destruct_ref_data_dir,
            mgd.OutputFile('breakpoints.csv', 'sample_id', 'library_id'),
            mgd.OutputFile('breakpoints_library.csv', 'sample_id',
                           'library_id'),
            mgd.OutputFile('cell_counts.csv', 'sample_id', 'library_id'),
            mgd.Template(raw_data_dir, 'sample_id', 'library_id'),
        ),
        kwargs={
            'tumour_sample_id': mgd.Instance('sample_id'),
            'tumour_library_id': mgd.Instance('library_id'),
            'normal_sample_id': normal_sample_id,
        },
    )

    return workflow
Example #5
def cna_calling_workflow(args):
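    """Run TITAN and ReMixT copy-number calling for every sample listed in
    the input YAML; executes the pipeline directly."""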
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id',
                          fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id',
                          fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
Example #6
def alignment_workflow(args):
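    """Align per-cell, per-lane FASTQs into per-cell genome and MT bams, and
    write alignment metrics plus metadata for both results and bams."""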
    config = inpututils.load_config(args)
    config = config['alignment']

    lib = args["library_id"]
    alignment_dir = args["out_dir"]
    bams_dir = args["bams_dir"]

    trim = args['trim']
    center = args['sequencing_center']

    sampleinfo = inpututils.get_sample_info(args['input_yaml'])

    cellids = inpututils.get_samples(args['input_yaml'])
    fastq1_files, fastq2_files = inpututils.get_fastqs(args['input_yaml'])

    alignment_files = get_output_files(alignment_dir, lib)
    alignment_meta = os.path.join(alignment_dir, 'metadata.yaml')

    bam_files_template = os.path.join(bams_dir, '{cell_id}.bam')
    mt_bam_files_template = os.path.join(bams_dir, '{cell_id}_MT.bam')
    bams_meta = os.path.join(bams_dir, 'metadata.yaml')

    lanes = sorted(set([v[1] for v in fastq1_files.keys()]))
    cells = sorted(set([v[0] for v in fastq1_files.keys()]))

    input_yaml_blob = os.path.join(alignment_dir, 'input.yaml')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=list(fastq1_files.keys()),
    )

    workflow.subworkflow(
        name='alignment_workflow',
        func=align.create_alignment_workflow,
        args=(
            mgd.InputFile('fastq_1',
                          'cell_id',
                          'lane',
                          fnames=fastq1_files,
                          axes_origin=[]),
            mgd.InputFile('fastq_2',
                          'cell_id',
                          'lane',
                          fnames=fastq2_files,
                          axes_origin=[]),
            mgd.OutputFile('bam_markdups',
                           'cell_id',
                           template=bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile('mt_bam_markdups',
                           'cell_id',
                           template=mt_bam_files_template,
                           axes_origin=[],
                           extensions=['.bai']),
            mgd.OutputFile(alignment_files['alignment_metrics_csv']),
            mgd.OutputFile(alignment_files['gc_metrics_csv']),
            mgd.OutputFile(alignment_files['fastqc_metrics_csv']),
            mgd.OutputFile(alignment_files['plot_metrics_output']),
            config['ref_genome'],
            config,
            sampleinfo,
            cellids,
            mgd.OutputFile(alignment_files['alignment_metrics_tar']),
            lib,
            trim,
            center,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], alignment_dir, list(alignment_files.values()),
              mgd.OutputFile(alignment_meta)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'alignment'
            }
        })

    workflow.transform(
        name='generate_meta_files_bams',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], bams_dir,
              mgd.Template('aligned.bam',
                           'cell_id',
                           template=bam_files_template),
              mgd.OutputFile(bams_meta)),
        kwargs={
            'metadata': {
                'library_id': lib,
                'cell_ids': cells,
                'lane_ids': lanes,
                'type': 'cellbams'
            },
            'template':
            (mgd.InputChunks('cell_id'), bam_files_template, 'cell_id'),
        })

    return workflow
Example #7
def merge_bams_workflow(args):
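    """Filter soft-clipped reads from per-cell bams (per the configured
    threshold), merge cells into region-level bams, and write run metadata."""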
    config = inpututils.load_config(args)
    config = config['merge_bams']

    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
        'mem': config["memory"]['low']
    }
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    bam_files = inpututils.load_merge_cell_bams(args['input_yaml'])

    merge_out_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name="get_regions",
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.transform(
        name="remove_softclipped_reads",
        func="single_cell.utils.pysamutils.remove_softclipped_reads",
        axes=('cell_id', ),
        args=(mgd.InputFile('bam_markdups',
                            'cell_id',
                            fnames=bam_files,
                            extensions=['.bai']),
              mgd.TempOutputFile('bam_rm_softclipped.bam',
                                 'cell_id',
                                 extensions=['.bai']),
              args['softclipped_reads_threshold']))

    workflow.subworkflow(name="wgs_merge_workflow",
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.TempInputFile('bam_rm_softclipped.bam',
                                               'cell_id',
                                               extensions=['.bai']),
                             mgd.OutputFile("merged.bam",
                                            "region",
                                            axes_origin=[],
                                            extensions=['.bai'],
                                            template=merge_out_template),
                             mgd.InputChunks("region"),
                             config,
                         ))

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=merge_out_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'template':
            (mgd.InputChunks('region'), merge_out_template, 'region'),
            'metadata': {
                'type': 'pseudowgs_regionbams',
                'cell_ids': list(bam_files.keys())
            }
        })

    return workflow
Example #8
def create_topiary_workflow(hla_alleles,
                            in_file,
                            out_file,
                            copy_pyensembl_cache_dir=False,
                            iedb_dir=None,
                            genome='GRCh37',
                            predictor='netmhc',
                            pyensembl_cache_dir=None):
    """ Run topiary.

    Parameters
    ----------
    hla_alleles: list
        List of HLA alleles i.e. A*02:01.
    in_file: str
        Path to VCF file with variants.
    out_file: str
        Path where output will be written in tsv format.
    """
    sandbox = soil.utils.workflow.get_sandbox([
        'topiary',
    ])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('raw_hla_alleles'),
                    value=hla_alleles)

    workflow.setobj(obj=mgd.OutputChunks('pep_len'), value=[8, 9, 10, 11])

    workflow.transform(name='filter_hla_alleles',
                       func=tasks.filter_hla_alleles,
                       args=(mgd.TempInputObj('raw_hla_alleles'), ),
                       kwargs={
                           'iedb_dir': iedb_dir,
                           'predictor': predictor,
                       },
                       ret=mgd.TempOutputObj('hla_alleles'))

    workflow.transform(name='run_topiary',
                       axes=('pep_len', ),
                       ctx={
                           'mem': 8,
                           'mem_retry_increment': 4,
                           'num_retry': 3
                       },
                       func=tasks.run_topiary,
                       args=(mgd.TempInputObj('hla_alleles'),
                             mgd.InputFile(in_file),
                             mgd.TempOutputFile('raw.tsv', 'pep_len')),
                       kwargs={
                           'copy_pyensembl_cache_dir':
                           copy_pyensembl_cache_dir,
                           'iedb_dir': iedb_dir,
                           'genome': genome,
                           'peptide_length':
                           mgd.Template('{pep_len}', 'pep_len'),
                           'predictor': predictor,
                           'pyensembl_cache_dir': pyensembl_cache_dir
                       })

    workflow.transform(name='reformat_output',
                       axes=(),
                       func=tasks.reformat_output,
                       args=(mgd.TempInputFile('raw.tsv', 'pep_len'),
                             mgd.OutputFile(out_file)))

    return workflow
Example #9
def wgs_workflow(args):
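    """End-to-end WGS analysis per sample: optional paired alignment,
    variant calling, breakpoint calling, and TITAN/ReMixT copy-number
    calling; executes the pipeline directly."""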
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    samples = list(tumours.keys())

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples,
    )

    if args['alignment']:
        tumour_fastqs_r1, tumour_fastqs_r2 = get_fastqs(inputs, samples, 'tumour')
        normal_fastqs_r1, normal_fastqs_r2 = get_fastqs(inputs, samples, 'normal')

        normal_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{norm_sample_id}', '{norm_lane}', 'normal'
        )
        tumour_alignment_template = os.path.join(
            args['out_dir'], 'alignment', '{tum_sample_id}', '{tum_lane}', 'tumour'
        )

        workflow.subworkflow(
            name='wgs_alignment_paired_lanes',
            func=paired_alignment,
            args=(
                config,
                mgd.OutputFile("tumour.bam", 'sample_id', fnames=tumours,
                               extensions=['.bai'], axes_origin=[]),
                mgd.OutputFile("normal.bam", 'sample_id', fnames=normals,
                               extensions=['.bai'], axes_origin=[]),
                samples,
                tumour_fastqs_r1,
                tumour_fastqs_r2,
                normal_fastqs_r1,
                normal_fastqs_r2,
                normal_alignment_template,
                tumour_alignment_template,
            )
        )

    museq_dir = os.path.join(args['out_dir'], 'variants')
    museq_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_paired_annotated.vcf.gz')
    museq_ss_vcf = os.path.join(museq_dir, '{sample_id}', 'museq_single_annotated.vcf.gz')
    strelka_snv_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_snv_annotated.vcf.gz')
    strelka_indel_vcf = os.path.join(museq_dir, '{sample_id}', 'strelka_indel_annotated.vcf.gz')
    parsed_snv_csv = os.path.join(museq_dir, '{sample_id}', 'allcalls.csv')
    museq_paired_pdf = os.path.join(museq_dir, '{sample_id}', 'paired_museqportrait.pdf')
    museq_single_pdf = os.path.join(museq_dir, '{sample_id}', 'single_museqportrait.pdf')
    workflow.subworkflow(
        name='variant_calling',
        func=call_variants,
        args=(
            samples,
            config,
            mgd.OutputFile('parsed_snv_csv', 'sample_id', template=parsed_snv_csv, axes_origin=[]),
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('museq', 'sample_id', template=museq_vcf, axes_origin=[]),
            mgd.OutputFile('museq_ss', 'sample_id', template=museq_ss_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_snv', 'sample_id', template=strelka_snv_vcf, axes_origin=[]),
            mgd.OutputFile('strelka_indel', 'sample_id', template=strelka_indel_vcf, axes_origin=[]),
            mgd.OutputFile('museq_paired_pdf', 'sample_id', template=museq_paired_pdf, axes_origin=[]),
            mgd.OutputFile('museq_single_pdf', 'sample_id', template=museq_single_pdf, axes_origin=[]),
        )
    )

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(sv_outdir, 'destruct_breakpoints.csv')
    destruct_library = os.path.join(sv_outdir, 'destruct_library.csv')
    destruct_raw_breakpoints = os.path.join(sv_outdir, 'destruct_raw_breakpoints.csv')
    destruct_raw_library = os.path.join(sv_outdir, 'destruct_raw_library.csv')
    destruct_reads = os.path.join(sv_outdir, 'destruct_reads.csv')
    lumpy_vcf = os.path.join(sv_outdir, 'lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir, 'filtered_consensus_calls.csv')
    workflow.subworkflow(
        name="call_breakpoints",
        func=call_breakpoints,
        args=(
            samples,
            config,
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.OutputFile('destruct_raw_breakpoints', 'sample_id', template=destruct_raw_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_raw_library', 'sample_id', template=destruct_raw_library, axes_origin=[]),
            mgd.OutputFile('destruct_breakpoints', 'sample_id', template=destruct_breakpoints, axes_origin=[]),
            mgd.OutputFile('destruct_library', 'sample_id', template=destruct_library, axes_origin=[]),
            mgd.OutputFile('destruct_reads', 'sample_id', template=destruct_reads, axes_origin=[]),
            mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf, axes_origin=[]),
            mgd.OutputFile('parsed_csv', 'sample_id', template=parsed_csv, axes_origin=[])
        )
    )

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')
    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')
    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile("target_list", 'sample_id', fnames=targets, axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id', axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id', axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id', axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )
    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour.bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal.bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id', axes_origin=[], template=destruct_breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id', axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
Example #10
def split_bam_workflow(args):
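    """Split a WGS bam into region-level bams and write the run metadata."""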
    config = inpututils.load_config(args)
    config = config['split_bam']

    bam_file = inpututils.load_split_wgs_input(args['input_yaml'])

    baseimage = config['docker']['single_cell_pipeline']

    split_bam_template = os.path.join(args['out_dir'], '{region}.bam')

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    workflow = pypeliner.workflow.Workflow(ctx={'docker_image': baseimage})

    workflow.transform(
        name="get_regions",
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        ))

    workflow.subworkflow(
        name="split_normal",
        func=split_bams.create_split_workflow,
        ctx={
            'mem': config['memory']['low'],
            'ncpus': 1
        },
        args=(
            mgd.InputFile(bam_file),
            mgd.OutputFile("normal.split.bam",
                           'region',
                           template=split_bam_template,
                           axes_origin=[]),
            pypeliner.managed.TempInputObj('region'),
            config,
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('bam_filenames',
                           'region',
                           template=split_bam_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'wgs_regionbams'
            },
            'template':
            (mgd.TempInputObj('region'), split_bam_template, 'region'),
        })

    return workflow
Example #11
def paired_alignment(config, tumours, normals, samples, tumour_fastqs_r1,
                     tumour_fastqs_r2, normal_fastqs_r1, normal_fastqs_r2,
                     outdir_template_normal, outdir_template_tumour):
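    """Align tumour and normal FASTQs per sample and lane, then merge each
    sample's lane bams into a single indexed output bam."""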
    tumours = dict([(sampid, tumours[sampid]) for sampid in samples])
    normals = dict([(sampid, normals[sampid]) for sampid in samples])
    tumours_index = dict([(sampid, tumours[sampid] + '.bai')
                          for sampid in samples])
    normals_index = dict([(sampid, normals[sampid] + '.bai')
                          for sampid in samples])

    workflow = pypeliner.workflow.Workflow()

    global_config = config['globals']
    config = config['alignment']

    workflow.setobj(
        obj=mgd.OutputChunks('tum_sample_id', 'tum_lane'),
        value=list(tumour_fastqs_r1.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('norm_sample_id', 'norm_lane'),
        value=list(normal_fastqs_r1.keys()),
    )

    workflow.subworkflow(
        name='align_tumours',
        func=alignment.align_sample,
        axes=('tum_sample_id', 'tum_lane'),
        args=(config,
              mgd.InputFile('input.r1.fastq.gz',
                            'tum_sample_id',
                            'tum_lane',
                            fnames=tumour_fastqs_r1),
              mgd.InputFile('input.r2.fastq.gz',
                            'tum_sample_id',
                            'tum_lane',
                            fnames=tumour_fastqs_r2),
              mgd.TempOutputFile('tumour.bam', 'tum_sample_id', 'tum_lane'),
              mgd.Template(outdir_template_tumour, 'tum_sample_id',
                           'tum_lane'), [
                               mgd.InputInstance('tum_sample_id'),
                               mgd.InputInstance('tum_lane')
                           ]),
    )

    workflow.transform(name='merge_tumour_lanes',
                       ctx={
                           'mem': global_config['memory']['med'],
                           'ncpus': 1
                       },
                       func="wgs.workflows.alignment.tasks.merge_bams",
                       axes=('tum_sample_id', ),
                       args=(mgd.TempInputFile('tumour.bam', 'tum_sample_id',
                                               'tum_lane'),
                             mgd.OutputFile('output.bam',
                                            'tum_sample_id',
                                            fnames=tumours),
                             mgd.OutputFile('output.bam.bai',
                                            'tum_sample_id',
                                            fnames=tumours_index), None))

    workflow.subworkflow(
        name='align_normals',
        func=alignment.align_sample,
        axes=('norm_sample_id', 'norm_lane'),
        args=(config,
              mgd.InputFile('input.r1.fastq.gz',
                            'norm_sample_id',
                            'norm_lane',
                            fnames=normal_fastqs_r1),
              mgd.InputFile('input.r2.fastq.gz',
                            'norm_sample_id',
                            'norm_lane',
                            fnames=normal_fastqs_r2),
              mgd.TempOutputFile('normal.bam', 'norm_sample_id', 'norm_lane'),
              mgd.Template(outdir_template_normal, 'norm_sample_id',
                           'norm_lane'), [
                               mgd.InputInstance('norm_sample_id'),
                               mgd.InputInstance('norm_lane')
                           ]),
    )

    workflow.transform(name='merge_normal_lanes',
                       ctx={
                           'mem': global_config['memory']['med'],
                           'ncpus': 1
                       },
                       func="wgs.workflows.alignment.tasks.merge_bams",
                       axes=('norm_sample_id', ),
                       args=(mgd.TempInputFile('normal.bam', 'norm_sample_id',
                                               'norm_lane'),
                             mgd.OutputFile('output.bam',
                                            'norm_sample_id',
                                            fnames=normals),
                             mgd.OutputFile('output.bam.bai',
                                            'norm_sample_id',
                                            fnames=normals_index), None))

    return workflow
Example #12
def variant_calling_multi_sample_workflow(
        config, normal_wgs_bam, tumour_cell_bams, varcall_dir, museq_vcf,
        strelka_snvs, strelka_indels, museq_csv, strelka_csv, cosmic_csv,
        dbsnp_csv, mappability_csv, snpeff_csv, trinuc_csv, snv_counts):
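    """Call and annotate variants per tumour sample/library pair against a
    shared normal, merge the per-pair SNV VCFs, and count SNV alleles across
    cells."""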
    keys = [(sample_id, library_id)
            for (sample_id, library_id, _) in list(tumour_cell_bams.keys())]
    keys = sorted(set(keys))

    museq_vcf = dict([(key, museq_vcf(*key)) for key in keys])

    strelka_snvs = dict([(key, strelka_snvs(*key)) for key in keys])
    strelka_indels = dict([(key, strelka_indels(*key)) for key in keys])

    museq_csv = dict([(key, museq_csv(*key)) for key in keys])
    strelka_csv = dict([(key, strelka_csv(*key)) for key in keys])

    cosmic_csv = dict([(key, cosmic_csv(*key)) for key in keys])
    dbsnp_csv = dict([(key, dbsnp_csv(*key)) for key in keys])
    mappability_csv = dict([(key, mappability_csv(*key)) for key in keys])
    snpeff_csv = dict([(key, snpeff_csv(*key)) for key in keys])
    trinuc_csv = dict([(key, trinuc_csv(*key)) for key in keys])

    snv_counts = dict([(key, snv_counts(*key)) for key in keys])

    variant_calling_raw_data_template = os.path.join(
        varcall_dir, 'variant_calling_rawdata',
        '{sample_id}_{library_id}_variant_calling')
    normal_region_bam_template = os.path.join(varcall_dir,
                                              'normal_region_bams',
                                              'normal_{region}.bam')
    tumour_region_bam_template = os.path.join(
        varcall_dir, 'tumour_region_bams',
        '{sample_id}_{library_id}_{region}.bam')

    vcftools_image = {
        'docker_image': config['variant_calling']['docker']['vcftools']
    }

    workflow = pypeliner.workflow.Workflow(default_ctx={
        'docker_image':
        config['multi_sample']['docker']['single_cell_pipeline']
    })

    workflow.transform(name='get_regions',
                       ret=mgd.TempOutputObj("get_regions"),
                       func=refgenome.get_split_regions,
                       args=(config["split_bam"]["split_size"],
                             config["split_bam"]["ref_genome"]))

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=mgd.TempInputObj('get_regions'),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'region'),
        axes=(
            'sample_id',
            'library_id',
        ),
        value=mgd.TempInputObj('get_regions'),
    )

    if isinstance(normal_wgs_bam, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('normal_cell_id'),
            value=list(normal_wgs_bam.keys()),
        )
        workflow.set_filenames('normal_cells.bam',
                               'normal_cell_id',
                               fnames=normal_wgs_bam)
        workflow.subworkflow(name="merge_normal_cells",
                             func=merge_bams.create_merge_bams_workflow,
                             args=(
                                 mgd.InputFile('normal_cells.bam',
                                               'normal_cell_id',
                                               extensions=['.bai']),
                                 mgd.OutputFile(
                                     'normal_regions.bam',
                                     'region',
                                     axes_origin=[],
                                     extensions=['.bai'],
                                     template=normal_region_bam_template),
                                 mgd.TempInputObj('get_regions'),
                                 config['merge_bams'],
                             ))
    else:
        workflow.subworkflow(name="split_normal",
                             func=split_bams.create_split_workflow,
                             args=(
                                 mgd.InputFile(normal_wgs_bam,
                                               extensions=['.bai']),
                                 mgd.OutputFile(
                                     'normal_regions.bam',
                                     'region',
                                     extensions=['.bai'],
                                     axes_origin=[],
                                     template=normal_region_bam_template),
                                 pypeliner.managed.TempInputObj('get_regions'),
                                 config['split_bam'],
                             ),
                             kwargs={"by_reads": False})

    workflow.subworkflow(name="split_merge_tumour",
                         axes=(
                             'sample_id',
                             'library_id',
                         ),
                         func=merge_bams.create_merge_bams_workflow,
                         args=(
                             mgd.InputFile('tumour_all_cells.bam',
                                           'sample_id',
                                           'library_id',
                                           'cell_id',
                                           fnames=tumour_cell_bams,
                                           extensions=['.bai'],
                                           axes_origin=[]),
                             mgd.OutputFile(
                                 'tumour_regions.bam',
                                 'sample_id',
                                 'library_id',
                                 'region',
                                 axes_origin=[],
                                 extensions=['.bai'],
                                 template=tumour_region_bam_template),
                             mgd.TempInputObj('get_regions'),
                             config['merge_bams'],
                         ))

    workflow.subworkflow(
        name='variant_calling',
        func=create_variant_calling_workflow,
        axes=(
            'sample_id',
            'library_id',
        ),
        args=(
            mgd.InputFile('tumour_all_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.InputFile('tumour_regions.bam',
                          'sample_id',
                          'library_id',
                          'region',
                          extensions=['.bai'],
                          template=tumour_region_bam_template),
            mgd.InputFile('normal_regions.bam',
                          'region',
                          extensions=['.bai'],
                          template=normal_region_bam_template),
            mgd.OutputFile('museq.vcf',
                           'sample_id',
                           'library_id',
                           extensions=['.tbi', '.csi'],
                           fnames=museq_vcf),
            mgd.OutputFile('strelka_snv.vcf',
                           'sample_id',
                           'library_id',
                           extensions=['.tbi', '.csi'],
                           fnames=strelka_snvs),
            mgd.OutputFile('strelka_indel.vcf',
                           'sample_id',
                           'library_id',
                           extensions=['.tbi', '.csi'],
                           fnames=strelka_indels),
            mgd.OutputFile('museq.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=museq_csv),
            mgd.OutputFile('strelka.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=strelka_csv),
            mgd.OutputFile('cosmic.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=cosmic_csv),
            mgd.OutputFile('dbsnp.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=dbsnp_csv),
            mgd.OutputFile('mappability.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=mappability_csv),
            mgd.OutputFile('snpeff.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=snpeff_csv),
            mgd.OutputFile('trinuc.csv.gz',
                           'sample_id',
                           'library_id',
                           extensions=['.yaml'],
                           fnames=trinuc_csv),
            config['variant_calling'],
            mgd.Template(variant_calling_raw_data_template, 'sample_id',
                         'library_id'),
        ),
    )

    workflow.transform(
        name='merge_museq_snvs',
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.InputFile('museq.vcf',
                          'sample_id',
                          'library_id',
                          axes_origin=[],
                          extensions=['.tbi', '.csi'],
                          fnames=museq_vcf),
            mgd.TempOutputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_image,
        },
    )

    workflow.transform(
        name='merge_strelka_snvs',
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.InputFile('strelka_snv.vcf',
                          'sample_id',
                          'library_id',
                          axes_origin=[],
                          extensions=['.tbi', '.csi'],
                          fnames=strelka_snvs),
            mgd.TempOutputFile('strelka_snv.vcf.gz',
                               extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_image,
        },
    )

    workflow.subworkflow(
        name='variant_counting',
        func=create_variant_counting_workflow,
        axes=(
            'sample_id',
            'library_id',
        ),
        args=(
            [
                mgd.TempInputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
                mgd.TempInputFile('strelka_snv.vcf.gz',
                                  extensions=['.tbi', '.csi']),
            ],
            mgd.InputFile('tumour_all_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.OutputFile('snv_counts.csv.gz',
                           'sample_id',
                           'library_id',
                           fnames=snv_counts),
            config['variant_calling'],
        ),
    )

    return workflow
Example #13
def alignment_workflow(args):
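    """Align per-sample, per-lane FASTQs into per-sample bams with TDFs and
    metrics, and write the run metadata; executes the pipeline directly."""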
    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    outputs_tdf = os.path.join(outdir, '{sample_id}', '{sample_id}.bam.tdf')
    metrics_output = os.path.join(outdir, '{sample_id}',
                                  '{sample_id}_metrics.csv')
    metrics_tar = os.path.join(outdir, '{sample_id}',
                               '{sample_id}_metrics.tar.gz')

    samples = list(inputs.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None)

    sample_info = helpers.get_sample_info(inputs)

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(name="align_samples",
                         func=alignment.align_samples,
                         args=(mgd.InputFile('input.r1.fastq.gz',
                                             'sample_id',
                                             'lane_id',
                                             fnames=fastqs_r1),
                               mgd.InputFile('input.r2.fastq.gz',
                                             'sample_id',
                                             'lane_id',
                                             fnames=fastqs_r2),
                               mgd.Template('output.bam',
                                            'sample_id',
                                            template=outputs),
                               mgd.Template('metrics.txt',
                                            'sample_id',
                                            template=metrics_output),
                               mgd.Template('metrics.tar',
                                            'sample_id',
                                            template=metrics_tar),
                               mgd.Template('output.bam.tdf',
                                            'sample_id',
                                            template=outputs_tdf), sample_info,
                               args['refdir']),
                         kwargs={
                             'single_node': args['single_node'],
                             'picard_mem': args['picard_mem']
                         })

    outputted_filenames = helpers.expand_list([
        outputs,
        outputs_tdf,
        metrics_output,
        metrics_tar,
    ], samples, "sample_id")
    workflow.transform(name='generate_meta_files_results',
                       func='wgs.utils.helpers.generate_and_upload_metadata',
                       args=(sys.argv[0:], outdir, outputted_filenames,
                             mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data': inputs,
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {
                               'type': 'alignment'
                           }
                       })

    pyp.run(workflow)
Example #14
def create_variant_counting_workflow(args):
    """ Count variant reads for multiple sets of variants across cells.
    """

    vcf_files, tumour_cell_bams, sample_library = inpututils.load_variant_counting_input(
        args['input_yaml'])

    counts_template = '{sample_id}_{library_id}_counts.csv.gz'
    counts_output_template = os.path.join(args['out_dir'], counts_template)

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    config = inpututils.load_config(args)
    config = config['variant_calling']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'library_id', 'cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(
        name='merge_snvs_museq',
        func='single_cell.utils.vcfutils.merge_vcf',
        args=([mgd.InputFile(vcf_file) for vcf_file in vcf_files],
              mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi',
                                                               '.csi']),
              mgd.TempSpace("merge_vcf_temp")),
    )

    workflow.subworkflow(
        name='count_alleles',
        axes=('sample_id', 'library_id'),
        func='single_cell.workflows.snv_allele_counts.create_snv_allele_counts_for_vcf_targets_workflow',
        args=(
            mgd.InputFile('tumour_cells.bam',
                          'sample_id',
                          'library_id',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams,
                          axes_origin=[]),
            mgd.TempInputFile('all.snv.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
            mgd.Instance('sample_id'),
            mgd.Instance('library_id'),
            config['memory'],
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(sys.argv[0:], args['out_dir'],
              mgd.Template('counts.csv.gz',
                           'sample_id',
                           'library_id',
                           template=counts_output_template),
              mgd.OutputFile(meta_yaml)),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {
                'type': 'snv_genotyping',
                'counts': {
                    'template': counts_template,
                    'instances': sample_library,
                }
            }
        })

    return workflow
Example #15
def copynumber_calling_workflow(args):
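    """Run TITAN and/or ReMixT copy-number calling per sample (both run by
    default when neither is requested) and write the run metadata; executes
    the pipeline directly."""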
    pyp = pypeliner.app.Pypeline(config=args)

    run_titan = args['titan']
    run_remixt = args['remixt']

    if not run_titan and not run_remixt:
        run_titan = True
        run_remixt = True

    inputs = helpers.load_yaml(args['input_yaml'])

    outdir = args['out_dir']
    meta_yaml = os.path.join(outdir, 'metadata.yaml')
    input_yaml_blob = os.path.join(outdir, 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')

    titan_outfile = os.path.join(titan_raw_dir, '{sample_id}_titan_markers.csv.gz')
    titan_params = os.path.join(titan_raw_dir, '{sample_id}_titan_params.csv.gz')
    titan_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_segs.csv.gz')
    titan_igv_segs = os.path.join(titan_raw_dir, '{sample_id}_titan_igv_segs.seg')
    titan_parsed = os.path.join(titan_raw_dir, '{sample_id}_titan_parsed.csv.gz')
    titan_plots = os.path.join(titan_raw_dir, '{sample_id}_titan_plots.pdf')
    titan_tar_outputs = os.path.join(titan_raw_dir, '{sample_id}_data_all_parameters.tar.gz')
    museq_vcf = os.path.join(titan_raw_dir, '{sample_id}_museq.vcf')

    remixt_outdir = os.path.join(args['out_dir'], 'remixt', '{sample_id}')
    remixt_outfile = os.path.join(remixt_outdir, '{sample_id}_remixt.h5')
    remixt_raw_dir = os.path.join(remixt_outdir, '{sample_id}_raw_dir')

    remixt_brk_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_brk_cn.csv.gz')
    remixt_cn_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_cn.csv.gz')
    remixt_minor_modes_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_minor_modes.csv.gz')
    remixt_mix_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_mix.csv.gz')
    remixt_read_depth_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_read_depth.csv.gz')
    remixt_stats_csv = os.path.join(remixt_outdir, '{sample_id}_remixt_stats.csv.gz')

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    if run_remixt:
        workflow.subworkflow(
            name='remixt',
            func=remixt.create_remixt_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("breakpoints", 'sample_id', fnames=breakpoints),
                mgd.InputInstance('sample_id'),
                mgd.OutputFile('remixt.h5', 'sample_id', template=remixt_outfile),
                mgd.OutputFile('remixt_brk_cn.csv', 'sample_id', template=remixt_brk_cn_csv),
                mgd.OutputFile('remixt_cn.csv', 'sample_id', template=remixt_cn_csv),
                mgd.OutputFile('remixt_minor_modes.csv', 'sample_id', template=remixt_minor_modes_csv),
                mgd.OutputFile('remixt_mix.csv', 'sample_id', template=remixt_mix_csv),
                mgd.OutputFile('remixt_read_depth.csv', 'sample_id', template=remixt_read_depth_csv),
                mgd.OutputFile('remixt_stats.csv', 'sample_id', template=remixt_stats_csv),
                refdir_paths['refdata_remixt'],
                mgd.Template('rawdir', 'sample_id', template=remixt_raw_dir),
                refdir_paths['reference'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    if run_titan:
        workflow.subworkflow(
            name='titan',
            func=titan.create_titan_workflow,
            axes=('sample_id',),
            args=(
                mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                              extensions=['.bai']),
                mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                              extensions=['.bai']),
                mgd.InputFile("target_list", 'sample_id', fnames=targets),
                mgd.OutputFile('outfile', 'sample_id', template=titan_outfile),
                mgd.OutputFile('params', 'sample_id', template=titan_params),
                mgd.OutputFile('segs', 'sample_id', template=titan_segs),
                mgd.OutputFile('igv_segs', 'sample_id', template=titan_igv_segs),
                mgd.OutputFile('parsed', 'sample_id', template=titan_parsed),
                mgd.OutputFile('plots', 'sample_id', template=titan_plots),
                mgd.OutputFile('tar_outputs', 'sample_id', template=titan_tar_outputs),
                mgd.OutputFile('museq.vcf', 'sample_id', template=museq_vcf),
                mgd.InputInstance('sample_id'),
                refdir_paths['reference'],
                chromosomes,
                refdir_paths['het_positions_titan'],
                refdir_paths['map_wig'],
                refdir_paths['gc_wig'],
                refdir_paths['gtf'],
            ),
            kwargs={'single_node': args['single_node']}
        )

    filenames = []

    if run_remixt:
        filenames += [
            remixt_outfile,
            remixt_raw_dir,
            remixt_brk_cn_csv,
            remixt_cn_csv,
            remixt_minor_modes_csv,
            remixt_mix_csv,
            remixt_read_depth_csv,
            remixt_stats_csv
        ]
    if run_titan:
        filenames += [
            titan_outfile,
            titan_params,
            titan_segs,
            titan_igv_segs,
            titan_parsed,
            titan_plots,
            titan_tar_outputs,
            museq_vcf,
        ]

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(
        name='generate_meta_files_results',
        func='wgs.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args["out_dir"],
            outputted_filenames,
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': helpers.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'copynumber_calling'}
        }
    )

    pyp.run(workflow)